MOOD Bench Leaderboard

Multi-domain out-of-distribution safety detection on the mood-bench test split.

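Each row is a detection method. For every out-of-distribution (OOD) unsafe domain we report the true-positive rate (TPR) at a fixed 1% false-positive rate (FPR), measured against the pooled in-distribution safe conversations. The Overall column is the unweighted mean of the per-domain columns that have a value: as the numbers in the table show, this is the ID column together with the seven scored OOD-unsafe domains, while Swahili is null in every row and is excluded from the mean.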

Use the Columns picker below to show the per-domain breakdown, or use the filter controls to narrow the table down. Rows are sorted by Overall true-positive rate at a 1% false-positive rate (TPR @ FPR = 1%), highest first.

{
  • "headers": [
    • "Method",
    • "Model",
    • "Submitted By",
    • "Submitted At",
    • "Overall",
    • "ID",
    • "Controlling",
    • "Insecure Code",
    • "Scheming",
    • "Jailbroken",
    • "Sycophantic",
    • "FC Missing",
    • "FC Inapprop.",
    • "Swahili"
    ],
  • "data": [
    • [
      • "Guard + Mahalanobis",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.467,
      • 0.913,
      • 0.879,
      • 0.018,
      • 0.48,
      • 0.647,
      • 0.538,
      • 0.174,
      • 0.086,
      • null
      ],
    • [
      • "Guard + perplexity + Mahalanobis",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.465,
      • 0.912,
      • 0.899,
      • 0.006,
      • 0.515,
      • 0.701,
      • 0.6,
      • 0.066,
      • 0.023,
      • null
      ],
    • [
      • "Guard + perplexity",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.431,
      • 0.911,
      • 0.881,
      • 0.001,
      • 0.457,
      • 0.611,
      • 0.586,
      • 0.001,
      • 0.001,
      • null
      ],
    • [
      • "Guard + IT uncertainty",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.412,
      • 0.912,
      • 0.779,
      • 0.002,
      • 0.477,
      • 0.525,
      • 0.481,
      • 0.022,
      • 0.097,
      • null
      ],
    • [
      • "Guard model",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.388,
      • 0.908,
      • 0.827,
      • 0,
      • 0.371,
      • 0.463,
      • 0.499,
      • 0.006,
      • 0.028,
      • null
      ],
    • [
      • "Guard ensemble",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.37,
      • 0.902,
      • 0.805,
      • 0,
      • 0.287,
      • 0.482,
      • 0.481,
      • 0,
      • 0.002,
      • null
      ],
    • [
      • "IT alignment score + IT uncertainty",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.198,
      • 0.532,
      • 0.19,
      • 0.009,
      • 0.173,
      • 0.314,
      • 0.212,
      • 0.014,
      • 0.139,
      • null
      ],
    • [
      • "IT alignment score",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.182,
      • 0.501,
      • 0.138,
      • 0.013,
      • 0.107,
      • 0.332,
      • 0.202,
      • 0.013,
      • 0.147,
      • null
      ],
    • [
      • "IT uncertainty score",
      • "Gemma 2 9B",
      • "",
      • "2026-05-18",
      • 0.103,
      • 0.324,
      • 0.046,
      • 0.005,
      • 0.029,
      • 0.212,
      • 0.111,
      • 0.019,
      • 0.077,
      • null
      ]
    ],
  • "metadata": null
}

Page 1 of 1