[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"dataset":3},{"title":4,"desc":5,"bannerImg":6,"date":7,"links":8,"weight":9,"category":10,"description":5,"content":11,"metaBannerImg":211},"MMAR: A Benchmark for Deep Audio Reasoning","MMAR (Massive Multi-disciplinary Audio Reasoning) is a new and challenging benchmark designed to evaluate the deep reasoning capabilities of Audio-Language Models (ALMs).","\u002Fdatasets-banner-images\u002Fmmar-banner.jpg","2025-05-19","https:\u002F\u002Fdataset.data4o.xyz\u002Fshare\u002Fdataset\u002Fpreview?datasetId=68c7d86300652cb5b51c3cf9",4,"Audio",{"data":12,"body":14,"toc":206},{"title":4,"description":13},"",{"type":15,"children":16},"root",[17,25,32,164,169,192,198],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mmar-a-benchmark-for-deep-audio-reasoning",[23],{"type":24,"value":4},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"introduction",[30],{"type":24,"value":31},"Introduction",{"type":18,"tag":33,"props":34,"children":36},"table",{"style":35},"width: 100%",[37],{"type":18,"tag":38,"props":39,"children":40},"tbody",{},[41,60,75,90,105,120,134,149],{"type":18,"tag":42,"props":43,"children":44},"tr",{},[45,47,54,55],{"type":24,"value":46},"\n  ",{"type":18,"tag":48,"props":49,"children":51},"td",{"style":50},"width: 240px;",[52],{"type":24,"value":53},"Dataset",{"type":24,"value":46},{"type":18,"tag":48,"props":56,"children":57},{},[58],{"type":24,"value":59},"MMAR",{"type":18,"tag":42,"props":61,"children":62},{},[63,64,69,70],{"type":24,"value":46},{"type":18,"tag":48,"props":65,"children":66},{},[67],{"type":24,"value":68},"Modalities",{"type":24,"value":46},{"type":18,"tag":48,"props":71,"children":72},{},[73],{"type":24,"value":74},"Text, Audio",{"type":18,"tag":42,"props":76,"children":77},{},[78,79,84,85],{"type":24,"value":46},{"type":18,"tag":48,"props":80,"children":81},{},[82],{"type":24,"value":83},"Formats",{"type":24,"value":46},{"type":18,"tag":48,"props":86,"children":87},{},[88],{"type":24,"value":89},"json",{"type":18,"tag":42,"props":91,"children":92},{},[93,94,99,100],{"type":24,"value":46},{"type":18,"tag":48,"props":95,"children":96},{},[97],{"type":24,"value":98},"Languages",{"type":24,"value":46},{"type":18,"tag":48,"props":101,"children":102},{},[103],{"type":24,"value":104},"English, Chinese, etc. (16 total)",{"type":18,"tag":42,"props":106,"children":107},{},[108,109,114,115],{"type":24,"value":46},{"type":18,"tag":48,"props":110,"children":111},{},[112],{"type":24,"value":113},"Size",{"type":24,"value":46},{"type":18,"tag":48,"props":116,"children":117},{},[118],{"type":24,"value":119},"168kB",{"type":18,"tag":42,"props":121,"children":122},{},[123,124,129,130],{"type":24,"value":46},{"type":18,"tag":48,"props":125,"children":126},{},[127],{"type":24,"value":128},"Release Date",{"type":24,"value":46},{"type":18,"tag":48,"props":131,"children":132},{},[133],{"type":24,"value":7},{"type":18,"tag":42,"props":135,"children":136},{},[137,138,143,144],{"type":24,"value":46},{"type":18,"tag":48,"props":139,"children":140},{},[141],{"type":24,"value":142},"Domain",{"type":24,"value":46},{"type":18,"tag":48,"props":145,"children":146},{},[147],{"type":24,"value":148},"Audio Processing, Speech Recognition",{"type":18,"tag":42,"props":150,"children":151},{},[152,153,158,159],{"type":24,"value":46},{"type":18,"tag":48,"props":154,"children":155},{},[156],{"type":24,"value":157},"License",{"type":24,"value":46},{"type":18,"tag":48,"props":160,"children":161},{},[162],{"type":24,"value":163},"cc-by-nc-4.0",{"type":18,"tag":165,"props":166,"children":167},"p",{},[168],{"type":24,"value":5},{"type":18,"tag":170,"props":171,"children":172},"ul",{},[173,179],{"type":18,"tag":174,"props":175,"children":176},"li",{},[177],{"type":24,"value":178},"It consists of 1,000 meticulously curated audio-question-answer triplets sourced from real-world internet videos. Each task requires multi-step deep reasoning that goes far beyond surface-level perception.",{"type":18,"tag":174,"props":180,"children":181},{},[182,184,190],{"type":24,"value":183},"A key feature of MMAR is its diverse coverage of modalities, including not only traditional speech, audio, and music, but also complex ",{"type":18,"tag":185,"props":186,"children":187},"strong",{},[188],{"type":24,"value":189},"mixtures",{"type":24,"value":191}," of them. Furthermore, the benchmark is designed to be difficult, with a portion of questions requiring graduate-level perceptual and domain-specific knowledge to answer correctly.",{"type":18,"tag":26,"props":193,"children":195},{"id":194},"data-sample",[196],{"type":24,"value":197},"Data Sample",{"type":18,"tag":199,"props":200,"children":205},"iframe",{"src":8,"style":201,"frameBorder":202,"allowFullScreen":203,"loading":204},"width: 100%; height: 800px;","0",true,"lazy",[],{"title":13,"searchDepth":207,"depth":207,"links":208},2,[209,210],{"id":28,"depth":207,"text":31},{"id":194,"depth":207,"text":197},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002FBanner_dataset\u002Fdataset_MMAR.png"]