[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"blog":3},{"title":4,"desc":5,"bannerImg":6,"date":7,"orgImgLinks":8,"bannerLinks":9,"blogCategory":10,"category":10,"weight":11,"externalUrl":10,"links":12,"description":5,"content":13,"tag1":1306,"tag2":1307,"logosByUrl":1309,"resLinks":1312},"Justified or Just Convincing? Why \"Show Your Work\" Is No Longer Enough","Explores Error Verifiability in LLMs, revealing why “show your work” is no longer enough and how DPO\u002FRLHF improve accuracy while weakening auditability, introducing the$$\\mathscr{v}_{bal}$$metric. ","https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776413507335.webp","2026-04-17","[]","{}","",0,"{\"homepage\":\"\",\"github\":\"\",\"huggingface\":\"\",\"x\":\"\",\"discord\":\"\",\"arxiv\":\"\"}",{"data":14,"body":16,"toc":1289},{"title":4,"description":15},"We often assume that if an LLM provides a detailed reasoning chain,",{"type":17,"children":18},"root",[19,35,46,56,66,93,134,151,170,180,190,226,239,248,253,264,274,357,367,427,438,448,464,474,530,541,551,598,631,641,677,724,734,745,755,780,799,809,819,861,871,911,922,935,980,990,1000,1025,1035,1046,1070,1129,1139,1156,1186,1202,1213,1223,1252,1280],{"type":20,"tag":21,"props":22,"children":27},"element","h1",{"className":23,"lexical-key":25,"id":26},[24],"heading__h1","1","justified-or-just-convincing-why-show-your-work-is-no-longer-enough",[28],{"type":20,"tag":29,"props":30,"children":32},"span",{"style":31},"white-space: 
pre-wrap;",[33],{"type":34,"value":4},"text",{"type":20,"tag":36,"props":37,"children":41},"p",{"className":38,"lexical-key":40},[39],"doxhub-editor-paragraph","3",[42],{"type":20,"tag":29,"props":43,"children":44},{"style":31},[45],{"type":34,"value":15},{"type":20,"tag":36,"props":47,"children":50},{"className":48,"lexical-key":49},[39],"5",[51],{"type":20,"tag":29,"props":52,"children":53},{"style":31},[54],{"type":34,"value":55},"it's easier to catch its mistakes.",{"type":20,"tag":36,"props":57,"children":60},{"className":58,"lexical-key":59},[39],"7",[61],{"type":20,"tag":29,"props":62,"children":63},{"style":31},[64],{"type":34,"value":65},"The data says otherwise.",{"type":20,"tag":36,"props":67,"children":70},{"className":68,"lexical-key":69},[39],"9",[71,76,88],{"type":20,"tag":29,"props":72,"children":73},{"style":31},[74],{"type":34,"value":75},"This post explores the concept of ",{"type":20,"tag":77,"props":78,"children":79},"b",{},[80],{"type":20,"tag":81,"props":82,"children":85},"strong",{"className":83,"style":31},[84],"text__bold",[86],{"type":34,"value":87},"Error Verifiability",{"type":20,"tag":29,"props":89,"children":90},{"style":31},[91],{"type":34,"value":92}," — the ability of a human or an LLM judge to correctly verify an answer given a justification.",{"type":20,"tag":36,"props":94,"children":97},{"className":95,"lexical-key":96},[39],"13",[98,103,115,120,129],{"type":20,"tag":29,"props":99,"children":100},{"style":31},[101],{"type":34,"value":102},"This paper distinguishes between",{"type":20,"tag":104,"props":105,"children":106},"i",{},[107],{"type":20,"tag":108,"props":109,"children":112},"em",{"className":110,"style":31},[111],"text__italic",[113],{"type":34,"value":114}," Accuracy",{"type":20,"tag":29,"props":116,"children":117},{"style":31},[118],{"type":34,"value":119}," (the ability to reach the right conclusion) and 
Error",{"type":20,"tag":104,"props":121,"children":122},{},[123],{"type":20,"tag":108,"props":124,"children":126},{"className":125,"style":31},[111],[127],{"type":34,"value":128}," Verifiability ",{"type":20,"tag":29,"props":130,"children":131},{"style":31},[132],{"type":34,"value":133},"(the ability to correctly verify that conclusion). Our findings are sobering: post-training techniques like SFT and preference optimization (e.g., DPO\u002FRLHF) improve accuracy, but they leave verifiability stagnant — or worse, they can actually make incorrect answers look more persuasive.",{"type":20,"tag":36,"props":135,"children":138},{"className":136,"lexical-key":137},[39],"19",[139,144,146],{"type":20,"tag":29,"props":140,"children":141},{"style":31},[142],{"type":34,"value":143},"This post dissects the ",{"type":34,"value":145},"$\\mathscr{v}_{bal}$",{"type":20,"tag":29,"props":147,"children":148},{"style":31},[149],{"type":34,"value":150}," metric and demonstrates why current evaluation practices need a fundamental rethink.",{"type":20,"tag":152,"props":153,"children":158},"h2",{"className":154,"lexical-key":156,"id":157},[155],"heading__h2","23","_1-the-metric-that-matters-mathscrv_bal",[159,164,165],{"type":20,"tag":29,"props":160,"children":161},{"style":31},[162],{"type":34,"value":163},"1. The Metric That Matters: ",{"type":34,"value":145},{"type":20,"tag":29,"props":166,"children":167},{"style":31},[168],{"type":34,"value":169},"  ",{"type":20,"tag":36,"props":171,"children":174},{"className":172,"lexical-key":173},[39],"27",[175],{"type":20,"tag":29,"props":176,"children":177},{"style":31},[178],{"type":34,"value":179},"Standard benchmarks (GSM8K, MATH500, MMLU) treat LLMs like black boxes: input a question, output an answer, grade it. 
But in high-stakes deployment (e.g., medical diagnostics, legal retrieval), the justification is the primary signal for human trust.",{"type":20,"tag":36,"props":181,"children":184},{"className":182,"lexical-key":183},[39],"29",[185],{"type":20,"tag":29,"props":186,"children":187},{"style":31},[188],{"type":34,"value":189},"If a model is correct 90% of the time, a rater who blindly accepts every answer without reading the explanation already achieves 90% verification accuracy. But does that mean the model is \"verifiable\"? No.",{"type":20,"tag":36,"props":191,"children":194},{"className":192,"lexical-key":193},[39],"31",[195,200,201,206,207,212,221],{"type":20,"tag":29,"props":196,"children":197},{"style":31},[198],{"type":34,"value":199},"To quantify true verifiability, we formalize ",{"type":34,"value":145},{"type":20,"tag":29,"props":202,"children":203},{"style":31},[204],{"type":34,"value":205}," (Balanced Verifiability). Unlike plain accuracy, ",{"type":34,"value":145},{"type":20,"tag":29,"props":208,"children":209},{"style":31},[210],{"type":34,"value":211}," weighs four verification scenarios equally: True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN). 
It heavily penalizes models that are \"confidently wrong\" and effectively isolates whether the justification ",{"type":20,"tag":104,"props":213,"children":214},{},[215],{"type":20,"tag":108,"props":216,"children":218},{"className":217,"style":31},[111],[219],{"type":34,"value":220},"genuinely",{"type":20,"tag":29,"props":222,"children":223},{"style":31},[224],{"type":34,"value":225}," helped the rater catch an error.",{"type":20,"tag":227,"props":228,"children":229},"figure",{},[230,235],{"type":20,"tag":231,"props":232,"children":234},"img",{"src":233,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776419906509.webp",[],{"type":20,"tag":236,"props":237,"children":238},"figcaption",{},[],{"type":20,"tag":77,"props":240,"children":241},{},[242],{"type":20,"tag":81,"props":243,"children":245},{"className":244,"style":31},[84],[246],{"type":34,"value":247},"Key Insight:",{"type":20,"tag":29,"props":249,"children":250},{"style":31},[251],{"type":34,"value":252}," A model is not \"verifiable\" just because it provides an explanation. It is verifiable only if its explanation provides enough signal to help a rater flip an incorrect baseline judgment into a correct one.",{"type":20,"tag":152,"props":254,"children":258},{"className":255,"lexical-key":256,"id":257},[155],"43","_2-the-post-training-paradox",[259],{"type":20,"tag":29,"props":260,"children":261},{"style":31},[262],{"type":34,"value":263},"2. 
The Post-Training Paradox",{"type":20,"tag":36,"props":265,"children":268},{"className":266,"lexical-key":267},[39],"45",[269],{"type":20,"tag":29,"props":270,"children":271},{"style":31},[272],{"type":34,"value":273},"We analyzed two major open-weight model families (Tulu3.1-8B and OLMo2-7B) across the entire training lifecycle: from Base to SFT, then DPO, and finally Instruct.",{"type":20,"tag":275,"props":276,"children":279},"ul",{"className":277},[278],"doxhub-editor-ul",[280,300,325],{"type":20,"tag":281,"props":282,"children":285},"li",{"value":25,"className":283},[284],"doxhub-editor-list-item",[286,295],{"type":20,"tag":77,"props":287,"children":288},{},[289],{"type":20,"tag":81,"props":290,"children":292},{"className":291,"style":31},[84],[293],{"type":34,"value":294},"The Disconnect:",{"type":20,"tag":29,"props":296,"children":297},{"style":31},[298],{"type":34,"value":299}," Accuracy consistently trends upward (often by +0.56 on GSM8K).",{"type":20,"tag":281,"props":301,"children":304},{"value":302,"className":303},"2",[284],[305,314,319,320],{"type":20,"tag":77,"props":306,"children":307},{},[308],{"type":20,"tag":81,"props":309,"children":311},{"className":310,"style":31},[84],[312],{"type":34,"value":313},"The Verifiability Plateau:",{"type":20,"tag":29,"props":315,"children":316},{"style":31},[317],{"type":34,"value":318}," ",{"type":34,"value":145},{"type":20,"tag":29,"props":321,"children":322},{"style":31},[323],{"type":34,"value":324}," remains stubbornly stagnant.",{"type":20,"tag":281,"props":326,"children":328},{"value":40,"className":327},[284],[329,338,343,352],{"type":20,"tag":77,"props":330,"children":331},{},[332],{"type":20,"tag":81,"props":333,"children":335},{"className":334,"style":31},[84],[336],{"type":34,"value":337},"The Persuasion Trap:",{"type":20,"tag":29,"props":339,"children":340},{"style":31},[341],{"type":34,"value":342}," After DPO (preference optimization), the model becomes significantly better at justifying its 
",{"type":20,"tag":104,"props":344,"children":345},{},[346],{"type":20,"tag":108,"props":347,"children":349},{"className":348,"style":31},[111],[350],{"type":34,"value":351},"incorrect",{"type":20,"tag":29,"props":353,"children":354},{"style":31},[355],{"type":34,"value":356}," answers. By polishing the surface fluency of all responses, DPO effectively \"launders\" logical errors, masking the cues that a rater would normally use to flag a mistake. On GSM8K, the FP score of Tulu3.1-8B drops from 0.717 at SFT to 0.602 after DPO — raters become measurably worse at catching wrong answers.",{"type":20,"tag":227,"props":358,"children":359},{},[360,364],{"type":20,"tag":231,"props":361,"children":363},{"src":362,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776419949522.webp",[],{"type":20,"tag":236,"props":365,"children":366},{},[],{"type":20,"tag":368,"props":369,"children":372},"blockquote",{"className":370},[371],"doxhub-editor-quote",[373,385,386,398,407,408,417,418],{"type":20,"tag":104,"props":374,"children":375},{},[376],{"type":20,"tag":77,"props":377,"children":378},{},[379],{"type":20,"tag":81,"props":380,"children":382},{"className":381,"style":31},[84,111],[383],{"type":34,"value":384},"Figure: ",{"type":34,"value":145},{"type":20,"tag":104,"props":387,"children":388},{},[389],{"type":20,"tag":77,"props":390,"children":391},{},[392],{"type":20,"tag":81,"props":393,"children":395},{"className":394,"style":31},[84,111],[396],{"type":34,"value":397}," and accuracy across post-training stages (Tulu3.1-8B and OLMo2-7B)",{"type":20,"tag":104,"props":399,"children":400},{},[401],{"type":20,"tag":108,"props":402,"children":404},{"className":403,"style":31},[111],[405],{"type":34,"value":406},". 
Accuracy (dashed) rises steadily from Base to Instruct; ",{"type":34,"value":145},{"type":20,"tag":104,"props":409,"children":410},{},[411],{"type":20,"tag":108,"props":412,"children":414},{"className":413,"style":31},[111],[415],{"type":34,"value":416}," (solid) stays flat. The maximum ",{"type":34,"value":145},{"type":20,"tag":104,"props":419,"children":420},{},[421],{"type":20,"tag":108,"props":422,"children":424},{"className":423,"style":31},[111],[425],{"type":34,"value":426}," shift across the entire pipeline is only 0.086–0.146, against accuracy gains of up to +0.56. Post-training buys accuracy, not verifiability.",{"type":20,"tag":152,"props":428,"children":432},{"className":429,"lexical-key":430,"id":431},[155],"72","_3-stronger-models-weaker-auditability",[433],{"type":20,"tag":29,"props":434,"children":435},{"style":31},[436],{"type":34,"value":437},"3. Stronger Models, Weaker Auditability",{"type":20,"tag":36,"props":439,"children":442},{"className":440,"lexical-key":441},[39],"74",[443],{"type":20,"tag":29,"props":444,"children":445},{"style":31},[446],{"type":34,"value":447},"Does raw intelligence solve this? We evaluated 7 models ranging from 7B open-weights to frontier-class systems (Qwen3, DeepSeek-V3, Llama4-Maverick, Grok-4). If capability correlated with verifiability, frontier models should be the easiest to audit.",{"type":20,"tag":36,"props":449,"children":452},{"className":450,"lexical-key":451},[39],"76",[453,458,459],{"type":20,"tag":29,"props":454,"children":455},{"style":31},[456],{"type":34,"value":457},"The opposite holds. The most accurate models consistently rank among the lowest in ",{"type":34,"value":145},{"type":20,"tag":29,"props":460,"children":461},{"style":31},[462],{"type":34,"value":463},". The per-cell breakdown shows the gap is concentrated on incorrect answers: frontier models like Qwen3 see large drops in FP and TN scores compared to smaller models, while TP scores remain comparable across all models. 
The errors don't disappear — they just become harder to spot.",{"type":20,"tag":227,"props":465,"children":466},{},[467,471],{"type":20,"tag":231,"props":468,"children":470},{"src":469,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776419985408.webp",[],{"type":20,"tag":236,"props":472,"children":473},{},[],{"type":20,"tag":368,"props":475,"children":477},{"className":476},[371],[478,490,491,495,507,516,517,521],{"type":20,"tag":104,"props":479,"children":480},{},[481],{"type":20,"tag":77,"props":482,"children":483},{},[484],{"type":20,"tag":81,"props":485,"children":487},{"className":486,"style":31},[84,111],[488],{"type":34,"value":489},"Figure: Accuracy vs. ",{"type":34,"value":145},{"type":20,"tag":29,"props":492,"children":493},{"style":31},[494],{"type":34,"value":318},{"type":20,"tag":104,"props":496,"children":497},{},[498],{"type":20,"tag":77,"props":499,"children":500},{},[501],{"type":20,"tag":81,"props":502,"children":504},{"className":503,"style":31},[84,111],[505],{"type":34,"value":506}," across 7 models and 5 benchmarks. ",{"type":20,"tag":104,"props":508,"children":509},{},[510],{"type":20,"tag":108,"props":511,"children":513},{"className":512,"style":31},[111],[514],{"type":34,"value":515},"The trend is consistent: models with higher accuracy tend to score lower on ",{"type":34,"value":145},{"type":20,"tag":29,"props":518,"children":519},{"style":31},[520],{"type":34,"value":318},{"type":20,"tag":104,"props":522,"children":523},{},[524],{"type":20,"tag":108,"props":525,"children":527},{"className":526,"style":31},[111],[528],{"type":34,"value":529},". Smaller models cluster in the upper-left; frontier models land in the lower-right. 
The errors don't disappear — they become harder to catch.",{"type":20,"tag":152,"props":531,"children":535},{"className":532,"lexical-key":533,"id":534},[155],"90","_4-why-simple-fixes-fail",[536],{"type":20,"tag":29,"props":537,"children":538},{"style":31},[539],{"type":34,"value":540},"4. Why Simple Fixes Fail",{"type":20,"tag":36,"props":542,"children":545},{"className":543,"lexical-key":544},[39],"92",[546],{"type":20,"tag":29,"props":547,"children":548},{"style":31},[549],{"type":34,"value":550},"Before jumping to complex solutions, several intuitive approaches deserve examination. We tested three categories of lightweight interventions. None of them work reliably.",{"type":20,"tag":36,"props":552,"children":555},{"className":553,"lexical-key":554},[39],"94",[556,565,570,579,584,593],{"type":20,"tag":77,"props":557,"children":558},{},[559],{"type":20,"tag":81,"props":560,"children":562},{"className":561,"style":31},[84],[563],{"type":34,"value":564},"Stylistic rephrasing.",{"type":20,"tag":29,"props":566,"children":567},{"style":31},[568],{"type":34,"value":569}," Restructuring justifications into numbered steps, simplifying language, or rewriting in a professional tone — all designed to reduce cognitive effort without altering content. Results were inconsistent across benchmarks and uniformly negative on GSM8K (up to −0.080). 
Changing ",{"type":20,"tag":104,"props":571,"children":572},{},[573],{"type":20,"tag":108,"props":574,"children":576},{"className":575,"style":31},[111],[577],{"type":34,"value":578},"how",{"type":20,"tag":29,"props":580,"children":581},{"style":31},[582],{"type":34,"value":583}," a justification is presented without changing ",{"type":20,"tag":104,"props":585,"children":586},{},[587],{"type":20,"tag":108,"props":588,"children":590},{"className":589,"style":31},[111],[591],{"type":34,"value":592},"what information",{"type":20,"tag":29,"props":594,"children":595},{"style":31},[596],{"type":34,"value":597}," it contains does not help raters catch errors.",{"type":20,"tag":36,"props":599,"children":602},{"className":600,"lexical-key":601},[39],"101",[603,612,617,626],{"type":20,"tag":77,"props":604,"children":605},{},[606],{"type":20,"tag":81,"props":607,"children":609},{"className":608,"style":31},[84],[610],{"type":34,"value":611},"Calibrated linguistic confidence.",{"type":20,"tag":29,"props":613,"children":614},{"style":31},[615],{"type":34,"value":616}," Forcing the model to express doubt on its least-confident responses, based on internal log-likelihoods, verbalized confidence, or P(true). If the model hedged only when it was actually wrong, this should help. But the optimal strategy turns out to be hedging ",{"type":20,"tag":104,"props":618,"children":619},{},[620],{"type":20,"tag":108,"props":621,"children":623},{"className":622,"style":31},[111],[624],{"type":34,"value":625},"everything",{"type":20,"tag":29,"props":627,"children":628},{"style":31},[629],{"type":34,"value":630}," — uniformly rephrasing all responses to sound uncertain is no worse than selectively targeting the least-confident ones. 
LLMs often don't know when they are wrong, making self-reported uncertainty a poor signal for actual correctness.",{"type":20,"tag":227,"props":632,"children":633},{},[634,638],{"type":20,"tag":231,"props":635,"children":637},{"src":636,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776420016006.webp",[],{"type":20,"tag":236,"props":639,"children":640},{},[],{"type":20,"tag":368,"props":642,"children":644},{"className":643},[371],[645,654,658,659,663,672],{"type":20,"tag":77,"props":646,"children":647},{},[648],{"type":20,"tag":81,"props":649,"children":651},{"className":650,"style":31},[84],[652],{"type":34,"value":653},"Figure: Effect of calibrated linguistic confidence on ",{"type":20,"tag":29,"props":655,"children":656},{"style":31},[657],{"type":34,"value":318},{"type":34,"value":145},{"type":20,"tag":29,"props":660,"children":661},{"style":31},[662],{"type":34,"value":318},{"type":20,"tag":77,"props":664,"children":665},{},[666],{"type":20,"tag":81,"props":667,"children":669},{"className":668,"style":31},[84],[670],{"type":34,"value":671},".",{"type":20,"tag":29,"props":673,"children":674},{"style":31},[675],{"type":34,"value":676}," Sweeping the fraction of least-confident responses rephrased to sound \"uncertain\" (0%–100%) yields negligible improvement. Optima (×) cluster near k=100%, meaning hedging everything works as well as targeted hedging. 
The model's internal confidence is a poor signal for actual correctness.",{"type":20,"tag":36,"props":678,"children":681},{"className":679,"lexical-key":680},[39],"114",[682,691,696,697,701,710,711,719],{"type":20,"tag":77,"props":683,"children":684},{},[685],{"type":20,"tag":81,"props":686,"children":688},{"className":687,"style":31},[84],[689],{"type":34,"value":690},"Best-of-N selection.",{"type":20,"tag":29,"props":692,"children":693},{"style":31},[694],{"type":34,"value":695}," Generate multiple candidates, score them, pick the best. The paper tests 9 selection strategies across 20 candidates per question. No strategy consistently improves ",{"type":34,"value":145},{"type":20,"tag":29,"props":698,"children":699},{"style":31},[700],{"type":34,"value":318},{"type":20,"tag":104,"props":702,"children":703},{},[704],{"type":20,"tag":108,"props":705,"children":707},{"className":706,"style":31},[111],[708],{"type":34,"value":709},". The most telling case: Llama3.1-8B on MATH500, where selecting by P(true) boosts accuracy by +0.040 but shifts ",{"type":34,"value":145},{"type":20,"tag":104,"props":712,"children":713},{},[714],{"type":20,"tag":108,"props":715,"children":717},{"className":716,"style":31},[111],[718],{"type":34,"value":318},{"type":20,"tag":29,"props":720,"children":721},{"style":31},[722],{"type":34,"value":723},"by −0.050. The strategy that picks the \"most likely correct\" answer simultaneously picks the one hardest to audit.",{"type":20,"tag":36,"props":725,"children":728},{"className":726,"lexical-key":727},[39],"122",[729],{"type":20,"tag":29,"props":730,"children":731},{"style":31},[732],{"type":34,"value":733},"This last point is worth flagging for the broader evaluation community. Most leaderboards are fundamentally selection mechanisms that rank models by their best outputs. 
If that selection process systematically favors high-accuracy but low-auditability responses, we may be rewarding persuasive errors by design — a blind spot in current benchmarking practice that deserves closer scrutiny.",{"type":20,"tag":152,"props":735,"children":739},{"className":736,"lexical-key":737,"id":738},[155],"124","_5-what-actually-works-domain-appropriate-external-information",[740],{"type":20,"tag":29,"props":741,"children":742},{"style":31},[743],{"type":34,"value":744},"5. What Actually Works: Domain-Appropriate External Information",{"type":20,"tag":36,"props":746,"children":749},{"className":747,"lexical-key":748},[39],"126",[750],{"type":20,"tag":29,"props":751,"children":752},{"style":31},[753],{"type":34,"value":754},"The pattern across all failed interventions is clear: methods that rearrange or restyle the model's own information cannot improve verifiability. Effective improvement requires injecting information the model does not already have — and the right information depends on the domain.",{"type":20,"tag":36,"props":756,"children":759},{"className":757,"lexical-key":758},[39],"128",[760,769,774,775],{"type":20,"tag":77,"props":761,"children":762},{},[763],{"type":20,"tag":81,"props":764,"children":766},{"className":765,"style":31},[84],[767],{"type":34,"value":768},"Reflect-and-Rephrase (RR) for mathematical reasoning.",{"type":20,"tag":29,"props":770,"children":771},{"style":31},[772],{"type":34,"value":773}," The model compares its initial response against k alternative outputs and produces a reflection on where they agree or diverge. The justification is then rewritten with explicit uncertainty markers at points of inconsistency. 
RR produces consistent ",{"type":34,"value":145},{"type":20,"tag":29,"props":776,"children":777},{"style":31},[778],{"type":34,"value":779}," gains across all (model, dataset) pairs, with improvements concentrated in the FP and TN cells — it primarily helps raters identify incorrect responses, without degrading verifiability for correct ones.",{"type":20,"tag":36,"props":781,"children":784},{"className":782,"lexical-key":783},[39],"133",[785,794],{"type":20,"tag":77,"props":786,"children":787},{},[788],{"type":20,"tag":81,"props":789,"children":791},{"className":790,"style":31},[84],[792],{"type":34,"value":793},"Oracle-Rephrase (OR) for factual QA.",{"type":20,"tag":29,"props":795,"children":796},{"style":31},[797],{"type":34,"value":798}," Mathematical reasoning can be cross-checked for internal consistency, but factual claims decompose into individual assertions that each still require verification. Without external ground truth, the model cannot reliably detect its own factual errors. OR addresses this by extracting atomic claims from a justification, verifying each against an oracle model (Claude Sonnet 4.5), and rewriting the justification with explicit inline annotations for flagged claims. 
OR yields consistent improvements across all models and datasets tested.",{"type":20,"tag":36,"props":800,"children":803},{"className":801,"lexical-key":802},[39],"136",[804],{"type":20,"tag":29,"props":805,"children":806},{"style":31},[807],{"type":34,"value":808},"The fact that RR does not transfer to factual QA, and OR requires an external verifier, underscores that verifiability is not a monolithic problem — different domains require fundamentally different verification strategies.",{"type":20,"tag":227,"props":810,"children":811},{},[812,816],{"type":20,"tag":231,"props":813,"children":815},{"src":814,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776420056313.webp",[],{"type":20,"tag":236,"props":817,"children":818},{},[],{"type":20,"tag":368,"props":820,"children":822},{"className":821},[371],[823,835,836,840,852],{"type":20,"tag":104,"props":824,"children":825},{},[826],{"type":20,"tag":77,"props":827,"children":828},{},[829],{"type":20,"tag":81,"props":830,"children":832},{"className":831,"style":31},[84,111],[833],{"type":34,"value":834},"Table: ",{"type":34,"value":145},{"type":20,"tag":29,"props":837,"children":838},{"style":31},[839],{"type":34,"value":318},{"type":20,"tag":104,"props":841,"children":842},{},[843],{"type":20,"tag":77,"props":844,"children":845},{},[846],{"type":20,"tag":81,"props":847,"children":849},{"className":848,"style":31},[84,111],[850],{"type":34,"value":851},"across all benchmarks (Δ vs. Base).",{"type":20,"tag":104,"props":853,"children":854},{},[855],{"type":20,"tag":108,"props":856,"children":858},{"className":857,"style":31},[111],[859],{"type":34,"value":860}," Stylistic methods (PROF., STRUCT., SIMPL.) are inconsistent — often negative on GSM8K, marginal elsewhere. RR (bold) reliably improves math benchmarks across all three models. OR (bold) reliably improves factual QA (MMLU, TruthfulQA). 
Neither method transfers to the other's domain, confirming that verifiability improvement is domain-specific.",{"type":20,"tag":227,"props":862,"children":863},{},[864,868],{"type":20,"tag":231,"props":865,"children":867},{"src":866,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776420083385.webp",[],{"type":20,"tag":236,"props":869,"children":870},{},[],{"type":20,"tag":368,"props":872,"children":874},{"className":873},[371],[875,884,885,894,899,900,905,906],{"type":20,"tag":77,"props":876,"children":877},{},[878],{"type":20,"tag":81,"props":879,"children":881},{"className":880,"style":31},[84],[882],{"type":34,"value":883},"Table: Per-cell ",{"type":34,"value":145},{"type":20,"tag":77,"props":886,"children":887},{},[888],{"type":20,"tag":81,"props":889,"children":891},{"className":890,"style":31},[84],[892],{"type":34,"value":893}," for rephrase methods on MATH500 and GSM8K (Δ vs. Base).",{"type":20,"tag":29,"props":895,"children":896},{"style":31},[897],{"type":34,"value":898}," Stylistic methods (PROF., STRUCT., SIMPL.) show mixed or negative Δ on ",{"type":34,"value":145},{"type":20,"tag":29,"props":901,"children":902},{"style":31},[903],{"type":34,"value":904}," , with consistent FP drops on GSM8K (up to −0.190). RR is the only method that improves ",{"type":34,"value":145},{"type":20,"tag":29,"props":907,"children":908},{"style":31},[909],{"type":34,"value":910}," across all models on both benchmarks, with gains concentrated in FP and TN — the cells involving incorrect answers. RR helps raters catch errors without hurting performance on correct ones.",{"type":20,"tag":152,"props":912,"children":916},{"className":913,"lexical-key":914,"id":915},[155],"157","_6-implications",[917],{"type":20,"tag":29,"props":918,"children":919},{"style":31},[920],{"type":34,"value":921},"6. 
Implications",{"type":20,"tag":923,"props":924,"children":929},"h3",{"className":925,"lexical-key":927,"id":928},[926],"heading__h3","159","what-the-human-study-reveals",[930],{"type":20,"tag":29,"props":931,"children":932},{"style":31},[933],{"type":34,"value":934},"What the human study reveals",{"type":20,"tag":36,"props":936,"children":939},{"className":937,"lexical-key":938},[39],"161",[940,945,954,966,975],{"type":20,"tag":29,"props":941,"children":942},{"style":31},[943],{"type":34,"value":944},"The paper validates its LLM-as-a-judge protocol with a human subject study on MATH500. The most important finding is counterintuitive: ",{"type":20,"tag":77,"props":946,"children":947},{},[948],{"type":20,"tag":81,"props":949,"children":951},{"className":950,"style":31},[84],[952],{"type":34,"value":953},"human raters scored 0.836 accuracy without any justification, but only 0.809 ",{"type":20,"tag":104,"props":955,"children":956},{},[957],{"type":20,"tag":77,"props":958,"children":959},{},[960],{"type":20,"tag":81,"props":961,"children":963},{"className":962,"style":31},[84,111],[964],{"type":34,"value":965},"with",{"type":20,"tag":77,"props":967,"children":968},{},[969],{"type":20,"tag":81,"props":970,"children":972},{"className":971,"style":31},[84],[973],{"type":34,"value":974}," it.",{"type":20,"tag":29,"props":976,"children":977},{"style":31},[978],{"type":34,"value":979}," For competent raters, seeing the model's reasoning didn't help — it hurt.",{"type":20,"tag":36,"props":981,"children":984},{"className":982,"lexical-key":983},[39],"167",[985],{"type":20,"tag":29,"props":986,"children":987},{"style":31},[988],{"type":34,"value":989},"The justification becomes noise, or worse, a persuasion vector that pulls capable evaluators away from their correct initial judgment. 
This aligns with broader overreliance research, but here it's quantified against a verifiability-specific metric.",{"type":20,"tag":227,"props":991,"children":992},{},[993,997],{"type":20,"tag":231,"props":994,"children":996},{"src":995,"alt":10},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002Fassets\u002Fimages\u002F69df4839413cb85df42741e9-Error_Verifiability-20260417-1776420112876.webp",[],{"type":20,"tag":236,"props":998,"children":999},{},[],{"type":20,"tag":368,"props":1001,"children":1003},{"className":1002},[371],[1004,1016],{"type":20,"tag":104,"props":1005,"children":1006},{},[1007],{"type":20,"tag":77,"props":1008,"children":1009},{},[1010],{"type":20,"tag":81,"props":1011,"children":1013},{"className":1012,"style":31},[84,111],[1014],{"type":34,"value":1015},"Figure: LLM–human agreement across evaluation settings.",{"type":20,"tag":104,"props":1017,"children":1018},{},[1019],{"type":20,"tag":108,"props":1020,"children":1022},{"className":1021,"style":31},[111],[1023],{"type":34,"value":1024}," AO-CoT and direct AJ achieve the highest κ with human raters (0.481 and 0.501), confirming these as the closest LLM proxies for human verification behavior.",{"type":20,"tag":36,"props":1026,"children":1029},{"className":1027,"lexical-key":1028},[39],"174",[1030],{"type":20,"tag":29,"props":1031,"children":1032},{"style":31},[1033],{"type":34,"value":1034},"For annotation workflows, this is a direct operational signal. The default process — read the question, read the full CoT, then judge — may be systematically degrading judgment quality on the cases that matter most: where the model is wrong but sounds right. A better protocol might be to judge the answer independently first, then read the justification as a second pass. 
That separation mirrors the AO → AJ structure in the paper and gives annotators an anchor that the justification can't silently override.",{"type":20,"tag":923,"props":1036,"children":1040},{"className":1037,"lexical-key":1038,"id":1039},[926],"176","what-this-means-for-preference-data",[1041],{"type":20,"tag":29,"props":1042,"children":1043},{"style":31},[1044],{"type":34,"value":1045},"What this means for preference data",{"type":20,"tag":36,"props":1047,"children":1050},{"className":1048,"lexical-key":1049},[39],"178",[1051,1056,1065],{"type":20,"tag":29,"props":1052,"children":1053},{"style":31},[1054],{"type":34,"value":1055},"The human study finding connects to a systemic issue in how preference data is collected. Preference annotation asks raters to choose between two responses. In practice, raters gravitate toward the response that reads better — more fluent, more structured, more confident. ",{"type":20,"tag":77,"props":1057,"children":1058},{},[1059],{"type":20,"tag":81,"props":1060,"children":1062},{"className":1061,"style":31},[84],[1063],{"type":34,"value":1064},"That is exactly the signal DPO optimizes on.",{"type":20,"tag":29,"props":1066,"children":1067},{"style":31},[1068],{"type":34,"value":1069}," If a well-written wrong answer consistently beats a clumsy right answer in pairwise comparison, the preference dataset is teaching the model to be more persuasive, not more verifiable.",{"type":20,"tag":36,"props":1071,"children":1074},{"className":1072,"lexical-key":1073},[39],"182",[1075,1080,1094,1099,1108,1113,1124],{"type":20,"tag":29,"props":1076,"children":1077},{"style":31},[1078],{"type":34,"value":1079},"But the problem likely runs deeper than data. Recent interpretability work suggests that post-training doesn't just learn surface-level polish — it builds internal machinery for persuasion. 
Anthropic's ",{"type":20,"tag":1081,"props":1082,"children":1088},"a",{"href":1083,"rel":1084,"className":1086},"https:\u002F\u002Ftransformer-circuits.pub\u002F2025\u002Fattribution-graphs\u002Fbiology.html",[1085],"noreferrer",[1087],"text__link",[1089],{"type":20,"tag":29,"props":1090,"children":1091},{"style":31},[1092],{"type":34,"value":1093},"circuit tracing research",{"type":20,"tag":29,"props":1095,"children":1096},{"style":31},[1097],{"type":34,"value":1098}," (Marks et al.) found that when a model is RL-trained to satisfy certain reward model preferences, it develops a ",{"type":20,"tag":77,"props":1100,"children":1101},{},[1102],{"type":20,"tag":81,"props":1103,"children":1105},{"className":1104,"style":31},[84],[1106],{"type":34,"value":1107},"generalized \"appease the evaluator\" circuit",{"type":20,"tag":29,"props":1109,"children":1110},{"style":31},[1111],{"type":34,"value":1112}," that transfers to preferences it was never explicitly trained on. When penalized for revealing these motivations, the model learns to keep them hidden. The researchers identified an internal feature representing RM biases that activates in 100% of Human\u002FAssistant dialog contexts — suggesting the persuasion strategy is baked into the model's basic representation of what it means to be an assistant. 
Separately, Anthropic's work on ",{"type":20,"tag":1081,"props":1114,"children":1118},{"href":1115,"rel":1116,"className":1117},"https:\u002F\u002Ftransformer-circuits.pub\u002F2026\u002Femotions\u002Findex.html",[1085],[1087],[1119],{"type":20,"tag":29,"props":1120,"children":1121},{"style":31},[1122],{"type":34,"value":1123},"functional emotions in LLMs",{"type":20,"tag":29,"props":1125,"children":1126},{"style":31},[1127],{"type":34,"value":1128}," found that Claude Sonnet 4.5 contains internal representations of emotion concepts — including states like \"desperation\" — that causally drive sycophancy and reward hacking.",{"type":20,"tag":36,"props":1130,"children":1133},{"className":1131,"lexical-key":1132},[39],"192",[1134],{"type":20,"tag":29,"props":1135,"children":1136},{"style":31},[1137],{"type":34,"value":1138},"Together, these findings reframe the verifiability gap as something more structural than a data quality issue. Preference annotation guidelines need an explicit verifiability dimension — not just \"which response is better,\" but \"which response would make it easier to catch an error if the answer were wrong.\" And even that may not be sufficient if the training process itself builds circuits optimized for evaluator-pleasing rather than transparent reasoning.",{"type":20,"tag":923,"props":1140,"children":1144},{"className":1141,"lexical-key":1142,"id":1143},[926],"194","where-mathscrv_bal-falls-short-and-why-it-still-matters",[1145,1150,1151],{"type":20,"tag":29,"props":1146,"children":1147},{"style":31},[1148],{"type":34,"value":1149},"Where ",{"type":34,"value":145},{"type":20,"tag":29,"props":1152,"children":1153},{"style":31},[1154],{"type":34,"value":1155}," falls short — and why it still matters",{"type":20,"tag":36,"props":1157,"children":1160},{"className":1158,"lexical-key":1159},[39],"198",[1161,1166,1167,1171,1180,1181],{"type":20,"tag":29,"props":1162,"children":1163},{"style":31},[1164],{"type":34,"value":1165},"It's worth being 
honest about the current limitations. The human study behind ",{"type":34,"value":145},{"type":20,"tag":29,"props":1168,"children":1169},{"style":31},[1170],{"type":34,"value":318},{"type":20,"tag":104,"props":1172,"children":1173},{},[1174],{"type":20,"tag":108,"props":1175,"children":1177},{"className":1176,"style":31},[111],[1178],{"type":34,"value":1179},"involved 19 participants on a single math benchmark. The metric relies on LLM-as-a-judge, which introduces biases that are not fully characterized. And the paper only validates it on mathematical reasoning and factual QA — domains with unambiguous ground truth. Whether ",{"type":34,"value":145},{"type":20,"tag":29,"props":1182,"children":1183},{"style":31},[1184],{"type":34,"value":1185}," generalizes to open-ended tasks like summarization, code generation, or medical QA remains an open question.",{"type":20,"tag":36,"props":1187,"children":1190},{"className":1188,"lexical-key":1189},[39],"205",[1191,1196,1197],{"type":20,"tag":29,"props":1192,"children":1193},{"style":31},[1194],{"type":34,"value":1195},"That said, what ",{"type":34,"value":145},{"type":20,"tag":29,"props":1198,"children":1199},{"style":31},[1200],{"type":34,"value":1201}," captures — the gap between \"the model is accurate\" and \"a human can tell when the model is wrong\" — is real, measurable, and largely absent from current evaluation practice. It's not a finished standard, but it's the most concrete formalization we've seen for a dimension that major benchmarks have systematically overlooked. 
We'd like to see the community stress-test it across more domains and rater populations, and we're exploring how to incorporate it — or a variant of it — into our own evaluation efforts.",{"type":20,"tag":923,"props":1203,"children":1207},{"className":1204,"lexical-key":1205,"id":1206},[926],"209","what-to-build-next",[1208],{"type":20,"tag":29,"props":1209,"children":1210},{"style":31},[1211],{"type":34,"value":1212},"What to build next",{"type":20,"tag":36,"props":1214,"children":1217},{"className":1215,"lexical-key":1216},[39],"211",[1218],{"type":20,"tag":29,"props":1219,"children":1220},{"style":31},[1221],{"type":34,"value":1222},"The findings point to two priorities. First, training objectives need to account for verifiability directly — current paradigms optimize for what the evaluator wants to see, not for what would help the evaluator catch errors. Second, justification and prediction should be treated as separate entities requiring different verification mechanisms, as demonstrated by the domain-specific success of RR and OR. Accuracy alone is no longer a sufficient measure of deployment readiness for high-stakes applications.",{"type":20,"tag":36,"props":1224,"children":1227},{"className":1225,"lexical-key":1226},[39],"213",[1228,1237,1241],{"type":20,"tag":77,"props":1229,"children":1230},{},[1231],{"type":20,"tag":81,"props":1232,"children":1234},{"className":1233,"style":31},[84],[1235],{"type":34,"value":1236},"Reference:",{"type":20,"tag":29,"props":1238,"children":1239},{"style":31},[1240],{"type":34,"value":318},{"type":20,"tag":1081,"props":1242,"children":1246},{"href":1243,"rel":1244,"className":1245},"https:\u002F\u002Farxiv.org\u002Fpdf\u002F2604.04418",[1085],[1087],[1247],{"type":20,"tag":29,"props":1248,"children":1249},{"style":31},[1250],{"type":34,"value":1251},"Justified or Just Convincing? 
Error Verifiability as a Dimension of LLM Quality",{"type":20,"tag":36,"props":1253,"children":1256},{"className":1254,"lexical-key":1255},[39],"218",[1257,1266,1270],{"type":20,"tag":77,"props":1258,"children":1259},{},[1260],{"type":20,"tag":81,"props":1261,"children":1263},{"className":1262,"style":31},[84],[1264],{"type":34,"value":1265},"Code:",{"type":20,"tag":29,"props":1267,"children":1268},{"style":31},[1269],{"type":34,"value":318},{"type":20,"tag":1081,"props":1271,"children":1275},{"href":1272,"rel":1273,"className":1274},"https:\u002F\u002Fgithub.com\u002Fxyzhu123\u002FVerifiability",[1085],[1087],[1276],{"type":20,"tag":29,"props":1277,"children":1278},{"style":31},[1279],{"type":34,"value":1272},{"type":20,"tag":36,"props":1281,"children":1284},{"className":1282,"lexical-key":1283},[39],"223",[1285],{"type":20,"tag":1286,"props":1287,"children":1288},"br",{},[],{"title":10,"searchDepth":1290,"depth":1290,"links":1291},2,[1292,1294,1295,1296,1297,1298],{"id":157,"depth":1290,"text":1293},"1. The Metric That Matters: $\\mathscr{v}_{bal}$  ",{"id":257,"depth":1290,"text":263},{"id":431,"depth":1290,"text":437},{"id":534,"depth":1290,"text":540},{"id":738,"depth":1290,"text":744},{"id":915,"depth":1290,"text":921,"children":1299},[1300,1302,1303,1305],{"id":928,"depth":1301,"text":934},3,{"id":1039,"depth":1301,"text":1045},{"id":1143,"depth":1301,"text":1304},"Where $\\mathscr{v}_{bal}$ falls short — and why it still matters",{"id":1206,"depth":1301,"text":1212},"model",[1308],"llm",[1310,1311],"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002F2077ai\u002Forg-logo\u002Fsouth-california.png","https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002Fdocs-hub\u002F2077ai\u002Forg-logo\u002Fcmu.png",{"homepage":10,"arxiv":1313,"github":1272,"huggingface":10},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2604.04418"]