[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"dataset":3},{"title":4,"desc":5,"bannerImg":6,"date":7,"links":8,"weight":9,"category":10,"description":5,"content":11,"metaBannerImg":203},"CriticLeanBench: A Benchmark for Evaluating Mathematical Formalization Critics","CriticLeanBench is a specialized benchmark designed to evaluate the critical reasoning of AI models, specifically on the task of validating the translation of natural language mathematics into formal Lean 4 theorem statements.","\u002Fdatasets-banner-images\u002Fcriticleanbench-banner.jpg","2025-07-08","https:\u002F\u002Fdataset.data4o.xyz\u002Fshare\u002Fdataset\u002Fpreview?datasetId=68c7d75d00652cb5b51c3315",2,"Reasoning",{"data":12,"body":14,"toc":199},{"title":4,"description":13},"",{"type":15,"children":16},"root",[17,25,32,150,155,185,191],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"criticleanbench-a-benchmark-for-evaluating-mathematical-formalization-critics",[23],{"type":24,"value":4},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"introduction",[30],{"type":24,"value":31},"Introduction",{"type":18,"tag":33,"props":34,"children":35},"table",{},[36,55],{"type":18,"tag":37,"props":38,"children":39},"thead",{},[40],{"type":18,"tag":41,"props":42,"children":43},"tr",{},[44,50],{"type":18,"tag":45,"props":46,"children":47},"th",{},[48],{"type":24,"value":49},"Dataset",{"type":18,"tag":45,"props":51,"children":52},{},[53],{"type":24,"value":54},"CriticLeanBench",{"type":18,"tag":56,"props":57,"children":58},"tbody",{},[59,73,86,99,112,124,137],{"type":18,"tag":41,"props":60,"children":61},{},[62,68],{"type":18,"tag":63,"props":64,"children":65},"td",{},[66],{"type":24,"value":67},"Modalities",{"type":18,"tag":63,"props":69,"children":70},{},[71],{"type":24,"value":72},"Text, Image",{"type":18,"tag":41,"props":74,"children":75},{},[76,81],{"type":18,"tag":63,"props":77,"children":78},{},[79],{"type":24,"value":80},"Formats",{"type":18,"tag":63,"props":82,"children":83},{},[84],{"type":24,"value":85},"parquet",{"type":18,"tag":41,"props":87,"children":88},{},[89,94],{"type":18,"tag":63,"props":90,"children":91},{},[92],{"type":24,"value":93},"Languages",{"type":18,"tag":63,"props":95,"children":96},{},[97],{"type":24,"value":98},"English",{"type":18,"tag":41,"props":100,"children":101},{},[102,107],{"type":18,"tag":63,"props":103,"children":104},{},[105],{"type":24,"value":106},"Size",{"type":18,"tag":63,"props":108,"children":109},{},[110],{"type":24,"value":111},"393kB",{"type":18,"tag":41,"props":113,"children":114},{},[115,120],{"type":18,"tag":63,"props":116,"children":117},{},[118],{"type":24,"value":119},"Release Date",{"type":18,"tag":63,"props":121,"children":122},{},[123],{"type":24,"value":7},{"type":18,"tag":41,"props":125,"children":126},{},[127,132],{"type":18,"tag":63,"props":128,"children":129},{},[130],{"type":24,"value":131},"Domain",{"type":18,"tag":63,"props":133,"children":134},{},[135],{"type":24,"value":136},"Mathematical Reasoning, Formal Verification",{"type":18,"tag":41,"props":138,"children":139},{},[140,145],{"type":18,"tag":63,"props":141,"children":142},{},[143],{"type":24,"value":144},"License",{"type":18,"tag":63,"props":146,"children":147},{},[148],{"type":24,"value":149},"Apache license 2.0",{"type":18,"tag":151,"props":152,"children":153},"p",{},[154],{"type":24,"value":5},{"type":18,"tag":156,"props":157,"children":158},"ul",{},[159,173],{"type":18,"tag":160,"props":161,"children":162},"li",{},[163,165,171],{"type":24,"value":164},"Its core purpose is to assess a model's ability to act as a \"critic,\" determining whether a piece of Lean 4 code faithfully captures the ",{"type":18,"tag":166,"props":167,"children":168},"strong",{},[169],{"type":24,"value":170},"semantic and logical intent",{"type":24,"value":172}," of the original mathematical problem, going beyond simple syntactic correctness.",{"type":18,"tag":160,"props":174,"children":175},{},[176,178,183],{"type":24,"value":177},"The dataset consists of 500 human-verified pairs, critically balanced between ",{"type":18,"tag":166,"props":179,"children":180},{},[181],{"type":24,"value":182},"250 correct formalizations and 250 incorrect ones",{"type":24,"value":184}," that feature common and representative error patterns.",{"type":18,"tag":26,"props":186,"children":188},{"id":187},"data-sample",[189],{"type":24,"value":190},"Data Sample",{"type":18,"tag":192,"props":193,"children":198},"iframe",{"src":8,"style":194,"frameBorder":195,"allowFullScreen":196,"loading":197},"width: 100%; height: 800px;","0",true,"lazy",[],{"title":13,"searchDepth":9,"depth":9,"links":200},[201,202],{"id":28,"depth":9,"text":31},{"id":187,"depth":9,"text":190},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002FBanner_dataset\u002Fdataset_criticlean.png"]