Quickstart

Install the core package and one benchmark extra:

bash

# Core package only.
pip install praxis-eval
# Core package together with the optional "libero" extra.
pip install "praxis-eval[libero]"

Inspect the benchmark contract before writing a policy adapter:

python

from praxis_eval import get_driver

# Fetch the driver for the "libero" benchmark, then print the two contract
# fields read here: the observation keys and the action entry.
driver = get_driver("libero")
print(driver.contract.observation_keys)
print(driver.contract.action)

Run evaluation with a local policy:

python

import numpy as np

from praxis_eval import EvalConfig, LocalPolicy, evaluate


class ZeroPolicy:
    """Baseline policy that answers every observation with an all-zero action."""

    def reset(self, episode_ids=None) -> None:
        """Do nothing; this policy keeps no state between episodes."""
        return None

    def act(self, observations, *, action_spec=None, policy_kwargs=None, episode_ids=None):
        """Return a zero-filled action array with one row per observation.

        ``policy_kwargs`` and ``episode_ids`` are accepted but ignored.

        Raises:
            ValueError: if ``action_spec`` is missing or its ``shape`` is None.
        """
        spec_shape = None if action_spec is None else action_spec.shape
        if spec_shape is None:
            raise ValueError("Expected an ActionSpec from the benchmark driver.")

        # Leading axis is the number of observations; the rest follows the spec.
        batch_shape = (len(observations),) + tuple(spec_shape)
        return np.zeros(batch_shape, dtype=action_spec.dtype)


# Wrap the in-process ZeroPolicy instance in LocalPolicy before handing it to
# evaluate().
policy = LocalPolicy(ZeroPolicy())

# Evaluation settings, kept in a named variable so they are easy to tweak.
config = EvalConfig(
    task="libero_10",
    task_ids=(0,),
    num_eval_per_task=1,
    num_parallel_env=1,
    output_dir="eval/libero_10_task0",
)

result = evaluate("libero", policy=policy, config=config)

# Print the overall result and the "results_path" entry of the artifacts.
print(result.overall)
print(result.artifacts["results_path"])

For remote serving, install the remote extra and pass a RemotePolicy:

bash

# Core package together with the optional "remote" extra.
pip install "praxis-eval[remote]"

python

from praxis_eval import EvalConfig, RemotePolicy, evaluate

# Policy reached at 127.0.0.1:50051, constructed with timeout=30.0.
policy = RemotePolicy("127.0.0.1:50051", timeout=30.0)

# Evaluation settings for the "CloseToasterOvenDoor" task.
config = EvalConfig(
    task="CloseToasterOvenDoor",
    num_eval_per_task=5,
    output_dir="eval/robocasa_close_toaster",
)

result = evaluate("robocasa", policy=policy, config=config)

Use praxis-eval-verify only on a machine with the required simulator runtime:

bash

# List the command's options.
praxis-eval-verify --help
# Run the verifier for the libero benchmark with --task libero_10 and --task-id 0.
praxis-eval-verify libero --task libero_10 --task-id 0

Quickstart

Quickstart