Benchmark-Owned Rollouts
Drivers own environment setup, task resolution, rollout waves, success metrics, media, and results.
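Keep benchmark setup, rollout execution, metrics, artifacts, and contracts inside the evaluator. Keep policy code in your own adapter. In the example below, a minimal all-zeros policy is wrapped in `LocalPolicy` and passed to `evaluate`, which runs the `libero` benchmark and returns the results: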
import numpy as np
from praxis_eval import EvalConfig, LocalPolicy, evaluate
class ZeroPolicy:
    """Minimal example policy that always emits all-zero actions.

    It keeps no state and ignores the content of the observations; it only
    shows the ``reset``/``act`` pair that the example hands to
    ``LocalPolicy``.
    """

    def reset(self, episode_ids=None) -> None:
        """Do nothing, because this policy holds no per-episode state.

        Args:
            episode_ids: Accepted so the signature matches the caller's
                keyword; ignored here.
        """
        pass

    def act(self, observations, *, action_spec=None, policy_kwargs=None, episode_ids=None):
        """Return one zero-filled action per observation.

        Args:
            observations: Sized collection of observations. Only its length
                is used, as the leading (batch) dimension of the output.
                NOTE(review): assumes ``len(observations)`` is the batch
                size; confirm for mapping-style observations.
            action_spec: Object exposing ``shape`` (an iterable of ints) and
                ``dtype``. Required despite the ``None`` default.
            policy_kwargs: Unused.
            episode_ids: Unused.

        Returns:
            A NumPy array of zeros with shape
            ``(len(observations), *action_spec.shape)`` and dtype
            ``action_spec.dtype``.

        Raises:
            ValueError: If ``action_spec`` or ``action_spec.shape`` is None,
                since the output shape cannot be determined.
        """
        if action_spec is None or action_spec.shape is None:
            raise ValueError("Expected a fixed-shape ActionSpec.")
        batch_size = len(observations)
        return np.zeros((batch_size, *action_spec.shape), dtype=action_spec.dtype)
# Wrap the in-process policy object before handing it to the evaluator.
policy = LocalPolicy(ZeroPolicy())

# Run settings passed to the evaluator: task name, selected task ids,
# evaluations per task, and the output directory.
config = EvalConfig(
    task="libero_10",
    task_ids=(0,),
    num_eval_per_task=5,
    output_dir="eval/libero",
)

# Run the "libero" benchmark with the policy and settings above.
result = evaluate("libero", policy=policy, config=config)

# Show the `overall` and `artifacts` attributes of the returned result.
print(result.overall)
print(result.artifacts)