[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"dataset":3},{"title":4,"desc":5,"bannerImg":6,"date":7,"links":8,"weight":9,"category":10,"description":5,"content":11,"metaBannerImg":218},"FormalMATH: A Large-Scale Benchmark for Formal Mathematical Reasoning in Lean4","FormalMATH is a large-scale benchmark designed to evaluate and advance the capabilities of Large Language Models in the challenging domain of formal mathematical reasoning.","\u002Fdatasets-banner-images\u002Fformalmath-banner.jpg","2025-05-05","https:\u002F\u002Fdataset.data4o.xyz\u002Fshare\u002Fdataset\u002Fpreview?datasetId=68c7d79400652cb5b51c3407",2,"Reasoning",{"data":12,"body":14,"toc":214},{"title":4,"description":13},"",{"type":15,"children":16},"root",[17,25,32,164,185,200,206],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"formalmath-a-large-scale-benchmark-for-formal-mathematical-reasoning-in-lean4",[23],{"type":24,"value":4},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"introduction",[30],{"type":24,"value":31},"Introduction",{"type":18,"tag":33,"props":34,"children":36},"table",{"style":35},"width: 100%",[37],{"type":18,"tag":38,"props":39,"children":40},"tbody",{},[41,60,75,90,105,120,134,149],{"type":18,"tag":42,"props":43,"children":44},"tr",{},[45,47,54,55],{"type":24,"value":46},"\n  ",{"type":18,"tag":48,"props":49,"children":51},"td",{"style":50},"width: 240px;",[52],{"type":24,"value":53},"Dataset",{"type":24,"value":46},{"type":18,"tag":48,"props":56,"children":57},{},[58],{"type":24,"value":59},"FormalMATH",{"type":18,"tag":42,"props":61,"children":62},{},[63,64,69,70],{"type":24,"value":46},{"type":18,"tag":48,"props":65,"children":66},{},[67],{"type":24,"value":68},"Modalities",{"type":24,"value":46},{"type":18,"tag":48,"props":71,"children":72},{},[73],{"type":24,"value":74},"Text",{"type":18,"tag":42,"props":76,"children":77},{},[78,79,84,85],{"type":24,"value":46},{"type":18,"tag":48,"props":80,"children":81},{},[82],{"type":24,"value":83},"Formats",{"type":24,"value":46},{"type":18,"tag":48,"props":86,"children":87},{},[88],{"type":24,"value":89},"json",{"type":18,"tag":42,"props":91,"children":92},{},[93,94,99,100],{"type":24,"value":46},{"type":18,"tag":48,"props":95,"children":96},{},[97],{"type":24,"value":98},"Languages",{"type":24,"value":46},{"type":18,"tag":48,"props":101,"children":102},{},[103],{"type":24,"value":104},"English, Chinese",{"type":18,"tag":42,"props":106,"children":107},{},[108,109,114,115],{"type":24,"value":46},{"type":18,"tag":48,"props":110,"children":111},{},[112],{"type":24,"value":113},"Size",{"type":24,"value":46},{"type":18,"tag":48,"props":116,"children":117},{},[118],{"type":24,"value":119},"5.26MB",{"type":18,"tag":42,"props":121,"children":122},{},[123,124,129,130],{"type":24,"value":46},{"type":18,"tag":48,"props":125,"children":126},{},[127],{"type":24,"value":128},"Release Date",{"type":24,"value":46},{"type":18,"tag":48,"props":131,"children":132},{},[133],{"type":24,"value":7},{"type":18,"tag":42,"props":135,"children":136},{},[137,138,143,144],{"type":24,"value":46},{"type":18,"tag":48,"props":139,"children":140},{},[141],{"type":24,"value":142},"Domain",{"type":24,"value":46},{"type":18,"tag":48,"props":145,"children":146},{},[147],{"type":24,"value":148},"Mathematics",{"type":18,"tag":42,"props":150,"children":151},{},[152,153,158,159],{"type":24,"value":46},{"type":18,"tag":48,"props":154,"children":155},{},[156],{"type":24,"value":157},"License",{"type":24,"value":46},{"type":18,"tag":48,"props":160,"children":161},{},[162],{"type":24,"value":163},"mit",{"type":18,"tag":165,"props":166,"children":167},"p",{},[168,170,176,178,183],{"type":24,"value":169},"FormalMATH is a large-scale benchmark designed to evaluate and advance the capabilities of Large Language Models in the challenging domain of ",{"type":18,"tag":171,"props":172,"children":173},"strong",{},[174],{"type":24,"value":175},"formal mathematical reasoning",{"type":24,"value":177},". It contains 5,560 formally verified problems, all expressed as theorem statements in the ",{"type":18,"tag":171,"props":179,"children":180},{},[181],{"type":24,"value":182},"Lean4",{"type":24,"value":184}," proof assistant.",{"type":18,"tag":186,"props":187,"children":188},"ul",{},[189,195],{"type":18,"tag":190,"props":191,"children":192},"li",{},[193],{"type":24,"value":194},"The dataset is comprehensive in scope, featuring problems that range from high-school Olympiad challenges to undergraduate-level theorems. It covers diverse mathematical fields, including algebra, applied mathematics, calculus, number theory, and discrete mathematics. Each entry consists of a natural language problem paired with its corresponding formal Lean4 statement, along with solutions and other metadata.",{"type":18,"tag":190,"props":196,"children":197},{},[198],{"type":24,"value":199},"Created using a novel human-in-the-loop autoformalization pipeline, FormalMATH serves as a robust and difficult testbed for current and future LLM-based theorem provers. Evaluations show that even state-of-the-art models struggle with this benchmark, highlighting significant room for improvement in automated reasoning.",{"type":18,"tag":26,"props":201,"children":203},{"id":202},"data-sample",[204],{"type":24,"value":205},"Data Sample",{"type":18,"tag":207,"props":208,"children":213},"iframe",{"src":8,"style":209,"frameBorder":210,"allowFullScreen":211,"loading":212},"width: 100%; height: 800px;","0",true,"lazy",[],{"title":13,"searchDepth":9,"depth":9,"links":215},[216,217],{"id":28,"depth":9,"text":31},{"id":202,"depth":9,"text":205},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002FBanner_dataset\u002Fdataset_farmalmath.png"]