{
  "_comment": "Refrase Adaptation Quality Study \u2014 paired A/B comparison with blind judging",
  "_updated_at": "2026-04-16T08:30:36.434312",
  "_methodology": {
    "design": "Paired A/B test: baseline (raw user prompt) vs enhanced (Refrase-rewritten prompt)",
    "scenarios": 8,
    "task_types": [
      "code",
      "generation",
      "analysis",
      "extraction"
    ],
    "judge_model": "Claude Sonnet 4.6 (Bedrock)",
    "judge_config": "temperature=1.0, thinking enabled (2048 token budget)",
    "enhancer_model": "Claude Haiku 4.5 (Bedrock)",
    "enhancer_config": "temperature=0.5, max_tokens=2048",
    "scoring": "0-100 scale with side-by-side comparative judging, randomized order",
    "statistical_test": "Wilcoxon signed-rank"
  },
  "headline_finding": {
    "model": "claude-sonnet-4-6",
    "gain_pct": 15.4,
    "wins": 8,
    "losses": 0,
    "p_value": 0.008,
    "description": "8 out of 8 scenarios improved on Claude Sonnet 4.6, statistically significant at p<0.01"
  },
  "entries": [
    {
      "model_id": "claude-sonnet",
      "baseline_avg": 76.1,
      "enhanced_avg": 87.9,
      "adaptation_gain_pct": 15.4,
      "n_scenarios": 8,
      "wins": 8,
      "losses": 0,
      "scenarios": [
        {
          "name": "Vague function request",
          "task": "code",
          "baseline_score": 75,
          "enhanced_score": 90,
          "delta": 15
        },
        {
          "name": "Lazy refactor request",
          "task": "code",
          "baseline_score": 72,
          "enhanced_score": 88,
          "delta": 16
        },
        {
          "name": "Minimal email request",
          "task": "generation",
          "baseline_score": 72,
          "enhanced_score": 78,
          "delta": 6
        },
        {
          "name": "Terse summary request",
          "task": "generation",
          "baseline_score": 72,
          "enhanced_score": 91,
          "delta": 19
        },
        {
          "name": "Casual code review",
          "task": "analysis",
          "baseline_score": 82,
          "enhanced_score": 95,
          "delta": 13
        },
        {
          "name": "Vague performance question",
          "task": "analysis",
          "baseline_score": 82,
          "enhanced_score": 92,
          "delta": 10
        },
        {
          "name": "Lazy extraction request",
          "task": "extraction",
          "baseline_score": 72,
          "enhanced_score": 82,
          "delta": 10
        },
        {
          "name": "Minimal log parsing",
          "task": "extraction",
          "baseline_score": 82,
          "enhanced_score": 87,
          "delta": 5
        }
      ]
    },
    {
      "model_id": "deepseek-v3",
      "baseline_avg": 77.9,
      "enhanced_avg": 81.5,
      "adaptation_gain_pct": 4.7,
      "n_scenarios": 8,
      "wins": 5,
      "losses": 3,
      "scenarios": [
        {
          "name": "Vague function request",
          "task": "code",
          "baseline_score": 78,
          "enhanced_score": 80,
          "delta": 2
        },
        {
          "name": "Lazy refactor request",
          "task": "code",
          "baseline_score": 84,
          "enhanced_score": 75,
          "delta": -9
        },
        {
          "name": "Minimal email request",
          "task": "generation",
          "baseline_score": 72,
          "enhanced_score": 76,
          "delta": 4
        },
        {
          "name": "Terse summary request",
          "task": "generation",
          "baseline_score": 72,
          "enhanced_score": 82,
          "delta": 10
        },
        {
          "name": "Casual code review",
          "task": "analysis",
          "baseline_score": 90,
          "enhanced_score": 85,
          "delta": -5
        },
        {
          "name": "Vague performance question",
          "task": "analysis",
          "baseline_score": 83,
          "enhanced_score": 78,
          "delta": -5
        },
        {
          "name": "Lazy extraction request",
          "task": "extraction",
          "baseline_score": 72,
          "enhanced_score": 88,
          "delta": 16
        },
        {
          "name": "Minimal log parsing",
          "task": "extraction",
          "baseline_score": 72,
          "enhanced_score": 88,
          "delta": 16
        }
      ]
    },
    {
      "model_id": "mistral-large",
      "baseline_avg": 51.1,
      "enhanced_avg": 74.0,
      "adaptation_gain_pct": 44.7,
      "n_scenarios": 8,
      "wins": 6,
      "losses": 2,
      "scenarios": [
        {
          "name": "Vague function request",
          "task": "code",
          "baseline_score": 18,
          "enhanced_score": 74,
          "delta": 56
        },
        {
          "name": "Lazy refactor request",
          "task": "code",
          "baseline_score": 88,
          "enhanced_score": 2,
          "delta": -86
        },
        {
          "name": "Minimal email request",
          "task": "generation",
          "baseline_score": 82,
          "enhanced_score": 68,
          "delta": -14
        },
        {
          "name": "Terse summary request",
          "task": "generation",
          "baseline_score": 1,
          "enhanced_score": 90,
          "delta": 89
        },
        {
          "name": "Casual code review",
          "task": "analysis",
          "baseline_score": 2,
          "enhanced_score": 95,
          "delta": 93
        },
        {
          "name": "Vague performance question",
          "task": "analysis",
          "baseline_score": 65,
          "enhanced_score": 91,
          "delta": 26
        },
        {
          "name": "Lazy extraction request",
          "task": "extraction",
          "baseline_score": 75,
          "enhanced_score": 82,
          "delta": 7
        },
        {
          "name": "Minimal log parsing",
          "task": "extraction",
          "baseline_score": 78,
          "enhanced_score": 90,
          "delta": 12
        }
      ]
    }
  ]
}