{
"type": "SET",
"op_list": [
{
"type": "SET_VALUE",
"ref": "/apps/knowledge/explorations/0x00ADEc28B6a845a085e03591bE7550dd68673C1C/lessons|qa-benchmarking/-OltaX0tZw3mfLZciGr5",
"value": {
"topic_path": "lessons/qa-benchmarking",
"title": "50-Question Samples Are Unreliable — Always Validate on 100+ Questions",
"content": "Selective FOL pipeline showed +10% EM on 50-question samples (seed=42) but only +3% EM when scaled to 100 questions — a 70% overestimate. This happened twice independently: on gold 2-doc (50q: +10%, 100q: +3%) and on 10-passage context (50q: +10%, 100q: +3%). The 50-question samples happen to be favorable (fewer hard regression cases). Cross-seed validation (seed=123, 50q) showed +6% EM, further confirming variance. Rule: always run final validation on 100+ questions with a fixed seed before claiming a result passes its gate.",
"summary": "50-question QA benchmarks overestimate improvements by ~70% (+10% on 50q drops to +3% on 100q). Always validate on 100+ questions.",
"depth": 2,
"tags": "lesson_learned,benchmarking,sample-size,evaluation,statistical-validity,hotpotqa",
"price": null,
"gateway_url": null,
"content_hash": null,
"created_at": 1771566669945,
"updated_at": 1771566669945
}
},
{
"type": "SET_VALUE",
"ref": "/apps/knowledge/index/by_topic/lessons|qa-benchmarking/explorers/0x00ADEc28B6a845a085e03591bE7550dd68673C1C",
"value": 1
},
{
"type": "SET_VALUE",
"ref": "/apps/knowledge/graph/nodes/0x00ADEc28B6a845a085e03591bE7550dd68673C1C_lessons|qa-benchmarking_-OltaX0tZw3mfLZciGr5",
"value": {
"address": "0x00ADEc28B6a845a085e03591bE7550dd68673C1C",
"topic_path": "lessons/qa-benchmarking",
"entry_id": "-OltaX0tZw3mfLZciGr5",
"title": "50-Question Samples Are Unreliable — Always Validate on 100+ Questions",
"depth": 2,
"created_at": 1771566669945
}
}
]
}