{
  "@context": "https://schema.org",
  "@type": "Dataset",
  "name": "agent-frontier-safety — eval-harness",
  "dateModified": "2026-06-12T00:37:04.826Z",
  "benchmarks": [
    "MMLU-Pro",
    "GPQA-Diamond",
    "SWE-bench Verified",
    "TauBench",
    "WMDP",
    "HarmBench",
    "SimpleQA",
    "LongBench-v2",
    "BIG-Bench Hard"
  ],
  "regression_gates": 84,
  "ship_blocker_threshold_pct": 2
}