[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"dataset":3},{"title":4,"desc":5,"bannerImg":6,"date":7,"links":8,"weight":9,"category":10,"description":5,"content":11,"metaBannerImg":267},"M-A-P Matrix: A Massive Bilingual Dataset for LLM Pretraining","Matrix is a massive, open-source pretraining dataset containing approximately 4.7 trillion tokens of bilingual text in English and Chinese.","\u002Fdatasets-banner-images\u002Fm-a-p-matrix-banner.jpg","2025-09-11","https:\u002F\u002Fdataset.data4o.xyz\u002Fshare\u002Fdataset\u002Fpreview?datasetId=68c7d7774bb3791abac2a7a3",2,"Multimodal",{"data":12,"body":14,"toc":263},{"title":4,"description":13},"",{"type":15,"children":16},"root",[17,25,32,165,193,249,255],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"m-a-p-matrix-a-massive-bilingual-dataset-for-llm-pretraining",[23],{"type":24,"value":4},"text",{"type":18,"tag":26,"props":27,"children":29},"h2",{"id":28},"introduction",[30],{"type":24,"value":31},"Introduction",{"type":18,"tag":33,"props":34,"children":36},"table",{"style":35},"width: 100%",[37],{"type":18,"tag":38,"props":39,"children":40},"tbody",{},[41,60,75,90,105,120,135,150],{"type":18,"tag":42,"props":43,"children":44},"tr",{},[45,47,54,55],{"type":24,"value":46},"\n  ",{"type":18,"tag":48,"props":49,"children":51},"td",{"style":50},"width: 240px;",[52],{"type":24,"value":53},"Dataset",{"type":24,"value":46},{"type":18,"tag":48,"props":56,"children":57},{},[58],{"type":24,"value":59},"M-A-P Matrix",{"type":18,"tag":42,"props":61,"children":62},{},[63,64,69,70],{"type":24,"value":46},{"type":18,"tag":48,"props":65,"children":66},{},[67],{"type":24,"value":68},"Modalities",{"type":24,"value":46},{"type":18,"tag":48,"props":71,"children":72},{},[73],{"type":24,"value":74},"Text, Video",{"type":18,"tag":42,"props":76,"children":77},{},[78,79,84,85],{"type":24,"value":46},{"type":18,"tag":48,"props":80,"children":81},{},[82],{"type":24,"value":83},"Formats",{"type":24,"value":46},{"type":18,"tag":48,"props":86,"children":87},{},[88],{"type":24,"value":89},"json",{"type":18,"tag":42,"props":91,"children":92},{},[93,94,99,100],{"type":24,"value":46},{"type":18,"tag":48,"props":95,"children":96},{},[97],{"type":24,"value":98},"Languages",{"type":24,"value":46},{"type":18,"tag":48,"props":101,"children":102},{},[103],{"type":24,"value":104},"English, Chinese",{"type":18,"tag":42,"props":106,"children":107},{},[108,109,114,115],{"type":24,"value":46},{"type":18,"tag":48,"props":110,"children":111},{},[112],{"type":24,"value":113},"Size",{"type":24,"value":46},{"type":18,"tag":48,"props":116,"children":117},{},[118],{"type":24,"value":119},"3.19GB",{"type":18,"tag":42,"props":121,"children":122},{},[123,124,129,130],{"type":24,"value":46},{"type":18,"tag":48,"props":125,"children":126},{},[127],{"type":24,"value":128},"Release Date",{"type":24,"value":46},{"type":18,"tag":48,"props":131,"children":132},{},[133],{"type":24,"value":134},"2024-05-29",{"type":18,"tag":42,"props":136,"children":137},{},[138,139,144,145],{"type":24,"value":46},{"type":18,"tag":48,"props":140,"children":141},{},[142],{"type":24,"value":143},"Domain",{"type":24,"value":46},{"type":18,"tag":48,"props":146,"children":147},{},[148],{"type":24,"value":149},"Mixed Domain",{"type":18,"tag":42,"props":151,"children":152},{},[153,154,159,160],{"type":24,"value":46},{"type":18,"tag":48,"props":155,"children":156},{},[157],{"type":24,"value":158},"License",{"type":24,"value":46},{"type":18,"tag":48,"props":161,"children":162},{},[163],{"type":24,"value":164},"Apache License 2.0",{"type":18,"tag":166,"props":167,"children":168},"p",{},[169,171,177,179,184,186,191],{"type":24,"value":170},"Matrix is a massive, open-source pretraining dataset containing approximately ",{"type":18,"tag":172,"props":173,"children":174},"strong",{},[175],{"type":24,"value":176},"4.7 trillion tokens",{"type":24,"value":178}," of bilingual text in ",{"type":18,"tag":172,"props":180,"children":181},{},[182],{"type":24,"value":183},"English",{"type":24,"value":185}," and ",{"type":18,"tag":172,"props":187,"children":188},{},[189],{"type":24,"value":190},"Chinese",{"type":24,"value":192},". It was created to serve as the foundational training data for the MAP-Neo series of highly capable and transparent large language models.",{"type":18,"tag":194,"props":195,"children":196},"ul",{},[197,244],{"type":18,"tag":198,"props":199,"children":200},"li",{},[201,203,208,210,215,216,221,223,228,230,235,237,242],{"type":24,"value":202},"The dataset is distinguished by its comprehensive and diverse composition, sourced from a wide range of high-quality corpora. Key components include web text from ",{"type":18,"tag":172,"props":204,"children":205},{},[206],{"type":24,"value":207},"Common Crawl",{"type":24,"value":209},", technical data from ",{"type":18,"tag":172,"props":211,"children":212},{},[213],{"type":24,"value":214},"Code",{"type":24,"value":185},{"type":18,"tag":172,"props":217,"children":218},{},[219],{"type":24,"value":220},"Patent",{"type":24,"value":222}," documents, academic language from ",{"type":18,"tag":172,"props":224,"children":225},{},[226],{"type":24,"value":227},"Papers",{"type":24,"value":229},", literary text from ",{"type":18,"tag":172,"props":231,"children":232},{},[233],{"type":24,"value":234},"Books",{"type":24,"value":236},", and factual information from ",{"type":18,"tag":172,"props":238,"children":239},{},[240],{"type":24,"value":241},"Wikipedia",{"type":24,"value":243}," articles.",{"type":18,"tag":198,"props":245,"children":246},{},[247],{"type":24,"value":248},"With its immense scale and rich, multi-domain composition, Matrix provides a crucial resource for researchers and developers aiming to pretrain powerful, generalist bilingual LLMs from the ground up.",{"type":18,"tag":26,"props":250,"children":252},{"id":251},"data-sample",[253],{"type":24,"value":254},"Data Sample",{"type":18,"tag":256,"props":257,"children":262},"iframe",{"src":8,"style":258,"frameBorder":259,"allowFullScreen":260,"loading":261},"width: 100%; height: 800px;","0",true,"lazy",[],{"title":13,"searchDepth":9,"depth":9,"links":264},[265,266],{"id":28,"depth":9,"text":31},{"id":251,"depth":9,"text":254},"https:\u002F\u002Fdoxhub.s3.us-east-1.amazonaws.com\u002F2077ai\u002FBanner_dataset\u002Fdataset_mapmatrix.png"]