<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://ai-eval.org/</loc>
    <changefreq>weekly</changefreq>
    <priority>0.9</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/reading-room</loc>
    <changefreq>daily</changefreq>
    <priority>0.8</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/deep-dive</loc>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/glossary</loc>
    <changefreq>monthly</changefreq>
    <priority>0.6</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/researchgate-holistic-evaluation-structured-criteria-rubrics</loc>
    <lastmod>2026-06-08</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/arxiv-proofagent-harness-open-infrastructure-for-adversarial-evaluation-of-ai-ag</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/arxiv-agentatlas-beyond-outcome-leaderboards-for-llm-agents</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/arxiv-open-world-evaluations-for-measuring-frontier-ai-capabilities</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/arxiv-code-as-agent-harness</loc>
    <lastmod>2026-05-31</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/openreview-agent-harness-engineering-survey</loc>
    <lastmod>2026-05-31</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-teaching-claude-why</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/adaline-evaluating-ai-agents-in-2026-benchmarks-for-teams</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/openai-gpt-5-5-system-card</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-an-update-on-recent-claude-code-quality-reports</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/google-research-evaluating-alignment-of-behavioral-dispositions-in-llms</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/google-research-building-better-ai-benchmarks-how-many-raters-are-enough</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/arxiv-meta-harness-end-to-end-optimization-of-model-harnesses</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-harness-design-for-long-running-application-development</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-eval-awareness-in-claude-opus-4-6-s-browsecomp-performance</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/openai-developers-run-long-horizon-tasks-with-codex</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/aws-evaluating-ai-agents-real-world-lessons-from-amazon</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-quantifying-infrastructure-noise-in-agentic-coding-evals</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/vercel-agents-md-outperforms-skills-in-our-agent-evals</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/microsoft-introducing-the-evals-for-agent-interop-starter-kit</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-designing-ai-resistant-technical-evaluations</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/post/anthropic-demystifying-evals-for-ai-agents</loc>
    <lastmod>2026-05-30</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/deep-dive/arxiv-open-world-evaluations-for-measuring-frontier-ai-capabilities</loc>
    <lastmod>2026-06-10</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/deep-dive/anthropic-demystifying-evals-for-ai-agents</loc>
    <lastmod>2026-06-10</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/deep-dive/openreview-agent-harness-engineering-survey</loc>
    <lastmod>2026-05-31</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
  <url>
    <loc>https://ai-eval.org/deep-dive/arxiv-code-as-agent-harness</loc>
    <lastmod>2026-05-31</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.7</priority>
  </url>
</urlset>
