[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"blog":3},{"title":4,"desc":5,"bannerImg":6,"date":7,"orgImgLinks":8,"bannerLinks":9,"blogCategory":10,"category":11,"weight":12,"externalUrl":13,"links":14,"description":5,"content":15,"tag1":597,"tag2":598,"resLinks":600},"GPT-5 Series vs. Gemini 3 Pro: The Verdict from SuperGPQA","Detailed benchmark results from SuperGPQA revealing how Google's Gemini 3 Pro compares to OpenAI's GPT-5.2 and GPT-5.1-Thinking across 285 graduate-level disciplines.","https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002FBanner_blog\u002Fbanner_20251216.png","2025-12-16","[]","{}","Benchmark","undefined",0,"","{\"homepage\":\"\",\"github\":\"\",\"huggingface\":\"\",\"x\":\"\",\"discord\":\"\",\"arxiv\":\"\"}",{"data":16,"body":18,"toc":587},{"title":4,"description":17},"The release of OpenAI’s GPT-5.2 Pro has reignited the race for AI supremacy, promising significant leaps in reasoning and professional capabilities. But how does it actually perform when tested against the world's hardest domain-specific questions?",{"type":19,"children":20},"root",[21,36,64,128,137,145,160,172,195,202,214,237,247,256,268,354,364,415,422,434,444,453,517,539,561,580],{"type":22,"tag":23,"props":24,"children":28},"element","h1",{"className":25,"id":27},[26],"heading__h1","gpt-5-series-vs-gemini-3-pro-the-verdict-from-supergpqa",[29],{"type":22,"tag":30,"props":31,"children":33},"span",{"style":32},"white-space: pre-wrap;",[34],{"type":35,"value":4},"text",{"type":22,"tag":37,"props":38,"children":41},"p",{"className":39},[40],"doxhub-editor-paragraph",[42,47,59],{"type":22,"tag":30,"props":43,"children":44},{"style":32},[45],{"type":35,"value":46},"The release of OpenAI’s ",{"type":22,"tag":48,"props":49,"children":50},"b",{},[51],{"type":22,"tag":52,"props":53,"children":56},"strong",{"className":54,"style":32},[55],"text__bold",[57],{"type":35,"value":58},"GPT-5.2 
Pro",{"type":22,"tag":30,"props":60,"children":61},{"style":32},[62],{"type":35,"value":63}," has reignited the race for AI supremacy, promising significant leaps in reasoning and professional capabilities. But how does it actually perform when tested against the world's hardest domain-specific questions?",{"type":22,"tag":37,"props":65,"children":67},{"className":66},[40],[68,73,82,87,95,100,109,114,123],{"type":22,"tag":30,"props":69,"children":70},{"style":32},[71],{"type":35,"value":72},"We put the leading frontier models—including ",{"type":22,"tag":48,"props":74,"children":75},{},[76],{"type":22,"tag":52,"props":77,"children":79},{"className":78,"style":32},[55],[80],{"type":35,"value":81},"Google’s Gemini 3 Pro Preview",{"type":22,"tag":30,"props":83,"children":84},{"style":32},[85],{"type":35,"value":86},", ",{"type":22,"tag":48,"props":88,"children":89},{},[90],{"type":22,"tag":52,"props":91,"children":93},{"className":92,"style":32},[55],[94],{"type":35,"value":58},{"type":22,"tag":30,"props":96,"children":97},{"style":32},[98],{"type":35,"value":99},", and ",{"type":22,"tag":48,"props":101,"children":102},{},[103],{"type":22,"tag":52,"props":104,"children":106},{"className":105,"style":32},[55],[107],{"type":35,"value":108},"GPT-5.1-Thinking",{"type":22,"tag":30,"props":110,"children":111},{"style":32},[112],{"type":35,"value":113},"—to the test on ",{"type":22,"tag":48,"props":115,"children":116},{},[117],{"type":22,"tag":52,"props":118,"children":120},{"className":119,"style":32},[55],[121],{"type":35,"value":122},"SuperGPQA",{"type":22,"tag":30,"props":124,"children":125},{"style":32},[126],{"type":35,"value":127},", our gold-standard benchmark for graduate-level knowledge. Covering 285 specialized disciplines from Quantum Mechanics to Agronomy, SuperGPQA bypasses surface-level internet knowledge to evaluate deep 
reasoning.",{"type":22,"tag":37,"props":129,"children":131},{"className":130},[40],[132],{"type":22,"tag":30,"props":133,"children":134},{"style":32},[135],{"type":35,"value":136},"The results are in, and they signal a shift in the hierarchy of \"hard science\" capabilities.",{"type":22,"tag":37,"props":138,"children":140},{"className":139},[40],[141],{"type":22,"tag":142,"props":143,"children":144},"br",{},[],{"type":22,"tag":146,"props":147,"children":148},"figure",{},[149,155],{"type":22,"tag":150,"props":151,"children":154},"img",{"src":152,"alt":153},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002F20251216\u002F1.webp","Discipline accuracy distribution by model",[],{"type":22,"tag":156,"props":157,"children":158},"figcaption",{},[159],{"type":35,"value":153},{"type":22,"tag":161,"props":162,"children":166},"h2",{"className":163,"id":165},[164],"heading__h2","the-verdict-gemini-3-pro-leads-in-specialized-knowledge",[167],{"type":22,"tag":30,"props":168,"children":169},{"style":32},[170],{"type":35,"value":171},"The Verdict: Gemini 3 Pro Leads in Specialized Knowledge",{"type":22,"tag":37,"props":173,"children":175},{"className":174},[40],[176,181,190],{"type":22,"tag":30,"props":177,"children":178},{"style":32},[179],{"type":35,"value":180},"Contrary to the expectation that newer is always better, our data shows that ",{"type":22,"tag":48,"props":182,"children":183},{},[184],{"type":22,"tag":52,"props":185,"children":187},{"className":186,"style":32},[55],[188],{"type":35,"value":189},"Gemini 3 Pro Preview",{"type":22,"tag":30,"props":191,"children":192},{"style":32},[193],{"type":35,"value":194}," currently holds the edge in complex, high-stakes scientific domains. 
While the GPT-5 series demonstrates impressive reasoning, Gemini's underlying knowledge density in specialized fields appears superior.",{"type":22,"tag":37,"props":196,"children":198},{"className":197},[40],[199],{"type":22,"tag":142,"props":200,"children":201},{},[],{"type":22,"tag":146,"props":203,"children":204},{},[205,210],{"type":22,"tag":150,"props":206,"children":209},{"src":207,"alt":208},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002F20251216\u002F2.webp","Gemini 3 Pro Preview outperforms GPT-5 variants in overall graduate-level accuracy on SuperGPQA",[],{"type":22,"tag":156,"props":211,"children":212},{},[213],{"type":35,"value":208},{"type":22,"tag":37,"props":215,"children":217},{"className":216},[40],[218,223,232],{"type":22,"tag":30,"props":219,"children":220},{"style":32},[221],{"type":35,"value":222},"In the ",{"type":22,"tag":48,"props":224,"children":225},{},[226],{"type":22,"tag":52,"props":227,"children":229},{"className":228,"style":32},[55],[230],{"type":35,"value":231},"Physics",{"type":22,"tag":30,"props":233,"children":234},{"style":32},[235],{"type":35,"value":236}," domain alone, which aggregates over 2,000 graduate-level questions, the performance gap is distinct. Gemini 3 Pro consistently ranks at the top, outperforming the GPT-5 series in subfields that require precise physical intuition and calculation.",{"type":22,"tag":161,"props":238,"children":241},{"className":239,"id":240},[164],"discipline-deep-dive-where-the-models-diverge",[242],{"type":22,"tag":30,"props":243,"children":244},{"style":32},[245],{"type":35,"value":246},"Discipline Deep Dive: Where the Models Diverge",{"type":22,"tag":37,"props":248,"children":250},{"className":249},[40],[251],{"type":22,"tag":30,"props":252,"children":253},{"style":32},[254],{"type":35,"value":255},"The aggregate scores tell only half the story. 
The true test of an expert model is its performance in \"long-tail\" disciplines—subjects that aren't just reasoning puzzles, but require deep, memorized professional knowledge.",{"type":22,"tag":257,"props":258,"children":262},"h3",{"className":259,"id":261},[260],"heading__h3","hard-physics-the-reasoning-test",[263],{"type":22,"tag":30,"props":264,"children":265},{"style":32},[266],{"type":35,"value":267},"Hard Physics: The Reasoning Test",{"type":22,"tag":37,"props":269,"children":271},{"className":270},[40],[272,281,286,295,300,308,313,322,327,335,340,349],{"type":22,"tag":48,"props":273,"children":274},{},[275],{"type":22,"tag":52,"props":276,"children":278},{"className":277,"style":32},[55],[279],{"type":35,"value":280},"Relativity",{"type":22,"tag":30,"props":282,"children":283},{"style":32},[284],{"type":35,"value":285}," is one of the most conceptually demanding subfields in our benchmark. Here, Gemini 3 Pro achieved a commanding ",{"type":22,"tag":48,"props":287,"children":288},{},[289],{"type":22,"tag":52,"props":290,"children":292},{"className":291,"style":32},[55],[293],{"type":35,"value":294},"79.75%",{"type":22,"tag":30,"props":296,"children":297},{"style":32},[298],{"type":35,"value":299}," accuracy. 
In comparison, OpenAI's specialized reasoning model, ",{"type":22,"tag":48,"props":301,"children":302},{},[303],{"type":22,"tag":52,"props":304,"children":306},{"className":305,"style":32},[55],[307],{"type":35,"value":108},{"type":22,"tag":30,"props":309,"children":310},{"style":32},[311],{"type":35,"value":312},", scored ",{"type":22,"tag":48,"props":314,"children":315},{},[316],{"type":22,"tag":52,"props":317,"children":319},{"className":318,"style":32},[55],[320],{"type":35,"value":321},"74.68%",{"type":22,"tag":30,"props":323,"children":324},{"style":32},[325],{"type":35,"value":326},", while the new ",{"type":22,"tag":48,"props":328,"children":329},{},[330],{"type":22,"tag":52,"props":331,"children":333},{"className":332,"style":32},[55],[334],{"type":35,"value":58},{"type":22,"tag":30,"props":336,"children":337},{"style":32},[338],{"type":35,"value":339}," trailed at ",{"type":22,"tag":48,"props":341,"children":342},{},[343],{"type":22,"tag":52,"props":344,"children":346},{"className":345,"style":32},[55],[347],{"type":35,"value":348},"70.89%",{"type":22,"tag":30,"props":350,"children":351},{"style":32},[352],{"type":35,"value":353},". 
This suggests that for theoretical physics, Gemini's internal world model is more robust.",{"type":22,"tag":257,"props":355,"children":358},{"className":356,"id":357},[260],"specialized-agriculture-the-knowledge-test",[359],{"type":22,"tag":30,"props":360,"children":361},{"style":32},[362],{"type":35,"value":363},"Specialized Agriculture: The Knowledge Test",{"type":22,"tag":37,"props":365,"children":367},{"className":366},[40],[368,373,382,387,396,401,410],{"type":22,"tag":30,"props":369,"children":370},{"style":32},[371],{"type":35,"value":372},"In ",{"type":22,"tag":48,"props":374,"children":375},{},[376],{"type":22,"tag":52,"props":377,"children":379},{"className":378,"style":32},[55],[380],{"type":35,"value":381},"Aquaculture",{"type":22,"tag":30,"props":383,"children":384},{"style":32},[385],{"type":35,"value":386},", a niche field often overlooked by general benchmarks, the difference is even more stark. Gemini 3 Pro maintained a robust ",{"type":22,"tag":48,"props":388,"children":389},{},[390],{"type":22,"tag":52,"props":391,"children":393},{"className":392,"style":32},[55],[394],{"type":35,"value":395},"62.50%",{"type":22,"tag":30,"props":397,"children":398},{"style":32},[399],{"type":35,"value":400}," accuracy, proving its versatility. 
In contrast, GPT-5.2 Pro struggled significantly, achieving only ",{"type":22,"tag":48,"props":402,"children":403},{},[404],{"type":22,"tag":52,"props":405,"children":407},{"className":406,"style":32},[55],[408],{"type":35,"value":409},"48.21%",{"type":22,"tag":30,"props":411,"children":412},{"style":32},[413],{"type":35,"value":414},"—a gap of over 14 percentage points.",{"type":22,"tag":37,"props":416,"children":418},{"className":417},[40],[419],{"type":22,"tag":142,"props":420,"children":421},{},[],{"type":22,"tag":146,"props":423,"children":424},{},[425,430],{"type":22,"tag":150,"props":426,"children":429},{"src":427,"alt":428},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002F20251216\u002F3.webp","Gemini 3 Pro demonstrates superior breadth, encompassing GPT-5.2 Pro across diverse scientific disciplines",[],{"type":22,"tag":156,"props":431,"children":432},{},[433],{"type":35,"value":428},{"type":22,"tag":161,"props":435,"children":438},{"className":436,"id":437},[164],"conclusion",[439],{"type":22,"tag":30,"props":440,"children":441},{"style":32},[442],{"type":35,"value":443},"Conclusion",{"type":22,"tag":37,"props":445,"children":447},{"className":446},[40],[448],{"type":22,"tag":30,"props":449,"children":450},{"style":32},[451],{"type":35,"value":452},"For developers and enterprises choosing between these frontier models, the SuperGPQA verdict is clear:",{"type":22,"tag":454,"props":455,"children":458},"ul",{"className":456},[457],"doxhub-editor-ul",[459,479],{"type":22,"tag":460,"props":461,"children":465},"li",{"value":462,"className":463},"1",[464],"doxhub-editor-list-item",[466,474],{"type":22,"tag":48,"props":467,"children":468},{},[469],{"type":22,"tag":52,"props":470,"children":472},{"className":471,"style":32},[55],[473],{"type":35,"value":108},{"type":22,"tag":30,"props":475,"children":476},{"style":32},[477],{"type":35,"value":478}," is a powerful tool for logic-heavy tasks, showing strong improvements over base models in 
reasoning-intensive questions.",{"type":22,"tag":460,"props":480,"children":483},{"value":481,"className":482},"2",[464],[484,489,498,503,512],{"type":22,"tag":30,"props":485,"children":486},{"style":32},[487],{"type":35,"value":488},"However, ",{"type":22,"tag":48,"props":490,"children":491},{},[492],{"type":22,"tag":52,"props":493,"children":495},{"className":494,"style":32},[55],[496],{"type":35,"value":497},"Gemini 3 Pro",{"type":22,"tag":30,"props":499,"children":500},{"style":32},[501],{"type":35,"value":502}," currently reigns supreme in ",{"type":22,"tag":48,"props":504,"children":505},{},[506],{"type":22,"tag":52,"props":507,"children":509},{"className":508,"style":32},[55],[510],{"type":35,"value":511},"domain expertise",{"type":22,"tag":30,"props":513,"children":514},{"style":32},[515],{"type":35,"value":516},". If your application requires handling specialized, graduate-level knowledge—from theoretical physics to agricultural science—Gemini 3 Pro is the statistical leader.",{"type":22,"tag":37,"props":518,"children":520},{"className":519},[40],[521,526,534],{"type":22,"tag":30,"props":522,"children":523},{"style":32},[524],{"type":35,"value":525},"As the AI landscape evolves, ",{"type":22,"tag":48,"props":527,"children":528},{},[529],{"type":22,"tag":52,"props":530,"children":532},{"className":531,"style":32},[55],[533],{"type":35,"value":122},{"type":22,"tag":30,"props":535,"children":536},{"style":32},[537],{"type":35,"value":538}," will continue to serve as the unbiased arena for measuring true machine intelligence.",{"type":22,"tag":37,"props":540,"children":542},{"className":541},[40],[543],{"type":22,"tag":544,"props":545,"children":551},"a",{"href":546,"rel":547,"className":549},"https:\u002F\u002Fsupergpqa.github.io\u002F",[548],"noreferrer",[550],"text__link",[552],{"type":22,"tag":48,"props":553,"children":554},{},[555],{"type":22,"tag":52,"props":556,"children":558},{"className":557,"style":32},[55],[559],{"type":35,"value":560},"Explore the 
Full Leaderboard ->",{"type":22,"tag":37,"props":562,"children":564},{"className":563},[40],[565],{"type":22,"tag":544,"props":566,"children":570},{"href":567,"rel":568,"className":569},"https:\u002F\u002Fwww.2077ai.com\u002Fblog\u002F2077AI-SuperGPQA",[548],[550],[571],{"type":22,"tag":48,"props":572,"children":573},{},[574],{"type":22,"tag":52,"props":575,"children":577},{"className":576,"style":32},[55],[578],{"type":35,"value":579},"Learn more about SuperGPQA ->",{"type":22,"tag":37,"props":581,"children":583},{"className":582},[40],[584],{"type":22,"tag":142,"props":585,"children":586},{},[],{"title":13,"searchDepth":588,"depth":588,"links":589},2,[590,591,596],{"id":165,"depth":588,"text":171},{"id":240,"depth":588,"text":246,"children":592},[593,595],{"id":261,"depth":594,"text":267},3,{"id":357,"depth":594,"text":363},{"id":437,"depth":588,"text":443},"benchmark",[599],"llm",{"homepage":546,"arxiv":601,"github":602,"huggingface":603},"https:\u002F\u002Farxiv.org\u002Fabs\u002F2502.14739","https:\u002F\u002Fgithub.com\u002FSuperGPQA\u002FSuperGPQA","https:\u002F\u002Fhuggingface.co\u002Fdatasets\u002Fm-a-p\u002FSuperGPQA"]