{
  "_meta": {
    "schema_version": "1.0",
    "site_url": "https://calibrationledger.com",
    "feed_url": "https://calibrationledger.com/api/beta-findings.json",
    "human_readable_url": "https://calibrationledger.com/beta/",
    "license": "https://creativecommons.org/licenses/by/4.0/",
    "methodology_url": "https://calibrationledger.com/methodology/",
    "discipline": [
      "Each finding cites a public, third-party-published source.",
      "Calibration Ledger has not independently recomputed these scores.",
      "Phase 1 launch (Q3 2027) will recompute under data-licensing agreements."
    ],
    "schema": {
      "finding": {
        "source_name": "human-readable name of the predictive source being cited",
        "source_class": "one of: AI models, Human forecasters, Forecaster aggregator platform, Prediction market, Analyst firms, Scientific papers, Review platforms",
        "metric": "what was measured (Brier, ECE, replication rate, calibration curve, etc.)",
        "value": "reported value as published by the source",
        "context": "scope, time window, methodology summary",
        "citation_text": "full bibliographic citation",
        "citation_url": "public verifiable URL (DOI, arXiv, official page)",
        "measured_at": "ISO 8601 date the measurement applies to (often paper publication date)"
      }
    }
  },
  "findings": [
    {
      "source_name": "Good Judgment Project Superforecasters",
      "source_class": "Human forecasters",
      "metric": "Mean Brier score",
      "value": "≈ 0.25 (vs. 0.37 control group)",
      "context": "Across the IARPA Aggregative Contingent Estimation forecasting tournament (2011–2014); superforecasters were the top-2% of forecasters identified by year-1 accuracy and trained in probabilistic reasoning.",
      "citation_text": "Mellers, B., Stone, E., Atanasov, P., Rohrbaugh, N., Metz, S. E., Ungar, L., Bishop, M. M., Horowitz, M., Merkle, E., & Tetlock, P. (2015). The psychology of intelligence analysis: Drivers of prediction accuracy in world politics. Journal of Experimental Psychology: Applied, 21(1), 1–14.",
      "citation_url": "https://doi.org/10.1037/xap0000040",
      "measured_at": "2014-12-31"
    },
    {
      "source_name": "Metaculus community-prediction aggregate",
      "source_class": "Forecaster aggregator platform",
      "metric": "Brier score (binary questions, all-time)",
      "value": "public — reported on Metaculus track-record page",
      "context": "Metaculus's aggregated community prediction across all resolved binary questions on the platform. Metaculus publishes its own track record openly. Specific time-windowed Brier varies; the platform's methodology and live numbers are public at the citation URL.",
      "citation_text": "Metaculus, Track Record + Scoring Methodology (publicly maintained dashboard).",
      "citation_url": "https://www.metaculus.com/questions/track-record/",
      "measured_at": "2026-04-27"
    },
    {
      "source_name": "Manifold Markets — platform calibration",
      "source_class": "Prediction market",
      "metric": "Calibration curve (predicted prob vs. observed frequency)",
      "value": "public — Manifold publishes a live calibration plot of all resolved binary markets",
      "context": "Manifold Markets publishes a live calibration plot showing market closing-probability vs. observed YES-fraction across all resolved binary markets. Visually well-calibrated within ±~5 percentage points across the 10–90% probability range as of mid-2025.",
      "citation_text": "Manifold Markets, public Calibration Plot.",
      "citation_url": "https://manifold.markets/calibration",
      "measured_at": "2026-04-27"
    },
    {
      "source_name": "GPT-4 (OpenAI) — pre-RLHF vs post-RLHF calibration",
      "source_class": "AI models",
      "metric": "Expected Calibration Error (ECE) on multiple-choice benchmarks",
      "value": "pre-RLHF: well-calibrated; post-RLHF: degraded calibration (per OpenAI's own measurement)",
      "context": "OpenAI's GPT-4 System Card explicitly reports that the base GPT-4 model is well-calibrated on multiple-choice benchmarks (calibration plot in §3.2 of the system card), and that RLHF post-training degraded calibration. This is a rare publisher-acknowledged calibration finding for a frontier LLM.",
      "citation_text": "OpenAI (2023). GPT-4 Technical Report. arXiv:2303.08774. §3.2 \"Calibration\".",
      "citation_url": "https://arxiv.org/abs/2303.08774",
      "measured_at": "2023-03-15"
    },
    {
      "source_name": "Sell-side equity analysts — earnings forecast accuracy",
      "source_class": "Analyst firms",
      "metric": "Systematic optimism + analyst-disagreement-vs-error correlation (proper-scoring-rule analogue for point forecasts)",
      "value": "public — survey of decades of empirical work",
      "context": "A widely cited literature review of decades of empirical work on sell-side analyst earnings forecasts. Findings include: forecasts are systematically optimistic, optimism declines with horizon, recommendations have informational content for investors only when conditioned on forecast revision history, and consensus-disagreement among analysts is a useful proxy for forecast uncertainty (a calibration-adjacent property).",
      "citation_text": "Bradshaw, M. T. (2011). Analysts' Forecasts: What Do We Know After Decades of Work? Working paper, Boston College Carroll School of Management.",
      "citation_url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1880339",
      "measured_at": "2011-06-30"
    },
    {
      "source_name": "Open Science Collaboration — psychological science replication rate",
      "source_class": "Scientific papers",
      "metric": "Replication rate + effect-size shrinkage",
      "value": "36% of replications produced a statistically significant result (vs. 97% in originals); mean effect size halved on replication",
      "context": "Landmark large-scale replication of 100 psychology experiments published in three top journals. Findings provide a base rate against which any future per-paper or per-journal calibration claim must be evaluated. Comparable replication studies in economics (Camerer et al. 2016) and biomedical sciences are cited in the original paper for cross-discipline context.",
      "citation_text": "Open Science Collaboration (2015). Estimating the reproducibility of psychological science. Science 349 (6251), aac4716.",
      "citation_url": "https://doi.org/10.1126/science.aac4716",
      "measured_at": "2015-08-28"
    },
    {
      "source_name": "Anthropic — Claude / language model self-knowledge",
      "source_class": "AI models",
      "metric": "P(IK) — probability the model assigns to 'I know the answer'; P(True) — calibration of confidence in own answers",
      "value": "large language models are well-calibrated on their own knowledge, with calibration improving with model scale",
      "context": "Anthropic study finding that base language models are well-calibrated on whether they know the answer to a question (P(IK)) and on whether their answers are true (P(True)). This is a calibration-adjacent finding for AI models: not predictive forecasting per se, but the same proper-scoring-rule machinery applied to model self-confidence on factual questions.",
      "citation_text": "Kadavath, S., Conerly, T., Askell, A., et al. (2022). Language Models (Mostly) Know What They Know. arXiv:2207.05221.",
      "citation_url": "https://arxiv.org/abs/2207.05221",
      "measured_at": "2022-07-11"
    },
    {
      "source_name": "Camerer et al. — social science experiment replication (Nature/Science 2010-2015)",
      "source_class": "Scientific papers",
      "metric": "Replication rate + median effect-size shrinkage",
      "value": "13 of 21 social science experiments replicated (62%); average effect size 50% of original",
      "context": "Companion study to the Open Science Collaboration 2015 effort, focused on the 21 social-behavioral experiments published in Nature and Science 2010-2015 that met inclusion criteria. Higher replication rate than psychology overall (62% vs 36%), but effect sizes still systematically shrank — base rate for any per-paper Phase 1 scoring of social-science publications.",
      "citation_text": "Camerer, C. F., Dreber, A., Holzmeister, F., et al. (2018). Evaluating the replicability of social science experiments in Nature and Science between 2010 and 2015. Nature Human Behaviour 2, 637–644.",
      "citation_url": "https://doi.org/10.1038/s41562-018-0399-z",
      "measured_at": "2018-08-27"
    },
    {
      "source_name": "Federal Reserve Survey of Professional Forecasters — GDP / inflation accuracy",
      "source_class": "Analyst firms",
      "metric": "Real-time forecast error vs. final-revised outcome (RMSE per horizon; coverage of probability ranges)",
      "value": "public — Philadelphia Fed maintains historical SPF data + accuracy reports back to 1968",
      "context": "The Federal Reserve Bank of Philadelphia's Survey of Professional Forecasters is the longest-running quarterly survey of US macroeconomic forecasts. The Philadelphia Fed publishes per-horizon forecast accuracy statistics (RMSE for point forecasts; probability-range coverage for binned probability questions like recession in next 4 quarters). Cross-vertical Phase 1 reference for analyst-class calibration.",
      "citation_text": "Federal Reserve Bank of Philadelphia, Survey of Professional Forecasters — Documentation and Forecast Accuracy.",
      "citation_url": "https://www.philadelphiafed.org/surveys-and-data/real-time-data-research/survey-of-professional-forecasters",
      "measured_at": "2026-04-27"
    },
    {
      "source_name": "Hausfather et al. — climate model projections vs. observed warming",
      "source_class": "Scientific papers",
      "metric": "Implied transient climate response error; observed-vs-projected warming",
      "value": "14 of 17 surveyed climate models from 1970–2007 produced projections within natural-variability range of subsequent observed warming when adjusted for actual emissions",
      "context": "Evaluation of how well climate model projections published 1970-2007 actually tracked observed global mean surface temperature in the years following. Once corrected for actual greenhouse-gas emissions (which differed from modelers' assumed emissions), most models were skillful. A landmark finding for scoring scientific model projections — directly applicable to AI-model calibration analogues.",
      "citation_text": "Hausfather, Z., Drake, H. F., Abbott, T., & Schmidt, G. A. (2020). Evaluating the Performance of Past Climate Model Projections. Geophysical Research Letters 47(1), e2019GL085378.",
      "citation_url": "https://doi.org/10.1029/2019GL085378",
      "measured_at": "2020-01-04"
    }
  ],
  "deferred": [
    {
      "source_class": "Review platforms",
      "reason": "Calibration-specific public studies on aggregated review outcomes are sparse and methodologically heterogeneous (Yelp, Amazon, IMDb, Booking.com, etc. all measure different things). Coverage deferred to Phase 1 when independent recomputation under data-licensing agreements becomes possible."
    }
  ]
}
