| | --- |
| | license: mit |
| | datasets: |
| | - jhu-clsp/mmbert-decay |
| | - jhu-clsp/mmbert-midtraining |
| | - jhu-clsp/mmbert-pretrain-p1-fineweb2-langs |
| | - jhu-clsp/mmbert-pretrain-p2-fineweb2-remaining |
| | - jhu-clsp/mmbert-pretrain-p3-others |
| | pipeline_tag: fill-mask |
| | library_name: transformers |
| | language: |
| | - aai |
| | - aak |
| | - aau |
| | - aaz |
| | - aba |
| | - abi |
| | - abk |
| | - abn |
| | - abq |
| | - abs |
| | - abt |
| | - abx |
| | - aby |
| | - abz |
| | - aca |
| | - acd |
| | - ace |
| | - acf |
| | - ach |
| | - acm |
| | - acn |
| | - acr |
| | - acu |
| | - ada |
| | - ade |
| | - adh |
| | - adi |
| | - adj |
| | - adl |
| | - ady |
| | - adz |
| | - aeb |
| | - aer |
| | - aeu |
| | - aey |
| | - afr |
| | - agd |
| | - agg |
| | - agm |
| | - agn |
| | - agr |
| | - agt |
| | - agu |
| | - agw |
| | - agx |
| | - aha |
| | - ahk |
| | - aia |
| | - aii |
| | - aim |
| | - ain |
| | - ajg |
| | - aji |
| | - ajz |
| | - akb |
| | - ake |
| | - akh |
| | - akp |
| | - alj |
| | - aln |
| | - alp |
| | - alq |
| | - als |
| | - alt |
| | - aly |
| | - alz |
| | - ame |
| | - amf |
| | - amh |
| | - ami |
| | - amk |
| | - amm |
| | - amn |
| | - amp |
| | - amr |
| | - amu |
| | - amx |
| | - ang |
| | - anm |
| | - ann |
| | - anp |
| | - anv |
| | - any |
| | - aoi |
| | - aoj |
| | - aom |
| | - aoz |
| | - apb |
| | - apc |
| | - ape |
| | - apn |
| | - apr |
| | - apt |
| | - apu |
| | - apw |
| | - apy |
| | - apz |
| | - arb |
| | - are |
| | - arg |
| | - arl |
| | - arn |
| | - arp |
| | - arq |
| | - ars |
| | - ary |
| | - arz |
| | - asg |
| | - asm |
| | - aso |
| | - ast |
| | - ata |
| | - atb |
| | - atd |
| | - atg |
| | - ati |
| | - atj |
| | - atq |
| | - att |
| | - auc |
| | - aui |
| | - auy |
| | - ava |
| | - avk |
| | - avn |
| | - avt |
| | - avu |
| | - awa |
| | - awb |
| | - awx |
| | - ayo |
| | - ayp |
| | - ayr |
| | - azb |
| | - azg |
| | - azj |
| | - azz |
| | - bak |
| | - bam |
| | - ban |
| | - bao |
| | - bar |
| | - bas |
| | - bav |
| | - bba |
| | - bbb |
| | - bbc |
| | - bbj |
| | - bbk |
| | - bbo |
| | - bbr |
| | - bch |
| | - bci |
| | - bcl |
| | - bco |
| | - bcw |
| | - bdd |
| | - bdh |
| | - bdq |
| | - bea |
| | - bef |
| | - bel |
| | - bem |
| | - ben |
| | - beq |
| | - bew |
| | - bex |
| | - bfd |
| | - bfo |
| | - bgr |
| | - bgs |
| | - bgt |
| | - bgz |
| | - bhg |
| | - bhl |
| | - bho |
| | - bhp |
| | - bhw |
| | - bhz |
| | - bib |
| | - big |
| | - bim |
| | - bin |
| | - bis |
| | - biu |
| | - biv |
| | - bjn |
| | - bjp |
| | - bjr |
| | - bjv |
| | - bkd |
| | - bkl |
| | - bkq |
| | - bku |
| | - bkv |
| | - bla |
| | - blh |
| | - blk |
| | - blw |
| | - blz |
| | - bmh |
| | - bmk |
| | - bmq |
| | - bmr |
| | - bmu |
| | - bmv |
| | - bno |
| | - bnp |
| | - boa |
| | - bod |
| | - boj |
| | - bom |
| | - bon |
| | - bos |
| | - bov |
| | - box |
| | - bpr |
| | - bps |
| | - bpy |
| | - bqc |
| | - bqj |
| | - bqp |
| | - bre |
| | - brh |
| | - bru |
| | - brx |
| | - bsc |
| | - bsn |
| | - bsp |
| | - bsq |
| | - bss |
| | - btd |
| | - bth |
| | - bts |
| | - btt |
| | - btx |
| | - bud |
| | - bug |
| | - buk |
| | - bul |
| | - bum |
| | - bus |
| | - bvc |
| | - bvd |
| | - bvr |
| | - bvz |
| | - bwd |
| | - bwi |
| | - bwq |
| | - bwu |
| | - bxh |
| | - bxr |
| | - byr |
| | - byv |
| | - byx |
| | - bzd |
| | - bzh |
| | - bzi |
| | - bzj |
| | - caa |
| | - cab |
| | - cac |
| | - caf |
| | - cag |
| | - cak |
| | - cao |
| | - cap |
| | - caq |
| | - car |
| | - cas |
| | - cat |
| | - cav |
| | - cax |
| | - cbc |
| | - cbi |
| | - cbk |
| | - cbr |
| | - cbs |
| | - cbt |
| | - cbu |
| | - cbv |
| | - cce |
| | - cco |
| | - ccp |
| | - ceb |
| | - ceg |
| | - cek |
| | - ces |
| | - cfm |
| | - cgc |
| | - cgg |
| | - cha |
| | - chd |
| | - che |
| | - chf |
| | - chj |
| | - chk |
| | - cho |
| | - chq |
| | - chr |
| | - chu |
| | - chv |
| | - chw |
| | - chz |
| | - cjk |
| | - cjo |
| | - cjp |
| | - cjs |
| | - cjv |
| | - ckb |
| | - cko |
| | - ckt |
| | - cle |
| | - clu |
| | - cly |
| | - cme |
| | - cmn |
| | - cmo |
| | - cmr |
| | - cnh |
| | - cni |
| | - cnk |
| | - cnl |
| | - cnt |
| | - cnw |
| | - coe |
| | - cof |
| | - cok |
| | - con |
| | - cop |
| | - cor |
| | - cos |
| | - cot |
| | - cou |
| | - cpa |
| | - cpb |
| | - cpc |
| | - cpu |
| | - cpy |
| | - crh |
| | - crj |
| | - crk |
| | - crl |
| | - crm |
| | - crn |
| | - crs |
| | - crt |
| | - crx |
| | - csb |
| | - csk |
| | - cso |
| | - csw |
| | - csy |
| | - cta |
| | - ctd |
| | - cto |
| | - ctp |
| | - ctu |
| | - cub |
| | - cuc |
| | - cui |
| | - cuk |
| | - cul |
| | - cut |
| | - cux |
| | - cwe |
| | - cwt |
| | - cya |
| | - cym |
| | - czt |
| | - daa |
| | - dad |
| | - daf |
| | - dag |
| | - dah |
| | - dak |
| | - dan |
| | - dar |
| | - ddg |
| | - ddn |
| | - ded |
| | - des |
| | - deu |
| | - dga |
| | - dgc |
| | - dgi |
| | - dgr |
| | - dgz |
| | - dhg |
| | - dhm |
| | - dhv |
| | - did |
| | - dig |
| | - dik |
| | - diq |
| | - dis |
| | - diu |
| | - div |
| | - dje |
| | - djk |
| | - djr |
| | - dks |
| | - dln |
| | - dng |
| | - dnj |
| | - dnw |
| | - dob |
| | - doi |
| | - dop |
| | - dos |
| | - dow |
| | - drg |
| | - dru |
| | - dsb |
| | - dtb |
| | - dtp |
| | - dts |
| | - dty |
| | - dua |
| | - due |
| | - dug |
| | - duo |
| | - dur |
| | - dwr |
| | - dww |
| | - dyi |
| | - dyo |
| | - dyu |
| | - dzo |
| | - ebk |
| | - efi |
| | - eka |
| | - ekk |
| | - eko |
| | - ell |
| | - emi |
| | - eml |
| | - emp |
| | - enb |
| | - enl |
| | - enm |
| | - enq |
| | - enx |
| | - epo |
| | - eri |
| | - ese |
| | - esi |
| | - esk |
| | - ess |
| | - esu |
| | - eto |
| | - etr |
| | - etu |
| | - eus |
| | - eve |
| | - ewe |
| | - ewo |
| | - ext |
| | - eza |
| | - faa |
| | - fad |
| | - fai |
| | - fal |
| | - fan |
| | - fao |
| | - far |
| | - fas |
| | - fat |
| | - ffm |
| | - fij |
| | - fil |
| | - fin |
| | - fit |
| | - fkv |
| | - fmu |
| | - fon |
| | - for |
| | - fra |
| | - frd |
| | - fro |
| | - frp |
| | - frr |
| | - fry |
| | - fub |
| | - fud |
| | - fue |
| | - fuf |
| | - fuh |
| | - fuq |
| | - fur |
| | - fuv |
| | - gaa |
| | - gag |
| | - gah |
| | - gai |
| | - gam |
| | - gaw |
| | - gaz |
| | - gbi |
| | - gbo |
| | - gbr |
| | - gcf |
| | - gcr |
| | - gde |
| | - gdg |
| | - gdn |
| | - gdr |
| | - geb |
| | - gej |
| | - gfk |
| | - ghs |
| | - gid |
| | - gil |
| | - giz |
| | - gjn |
| | - gkn |
| | - gla |
| | - gle |
| | - glg |
| | - glk |
| | - glv |
| | - gmh |
| | - gmv |
| | - gna |
| | - gnb |
| | - gnd |
| | - gng |
| | - gnn |
| | - gnw |
| | - goa |
| | - gof |
| | - gog |
| | - goh |
| | - gom |
| | - gor |
| | - gos |
| | - got |
| | - gqr |
| | - grc |
| | - grt |
| | - gso |
| | - gsw |
| | - gub |
| | - guc |
| | - gud |
| | - gug |
| | - guh |
| | - gui |
| | - guj |
| | - guk |
| | - gul |
| | - gum |
| | - gun |
| | - guo |
| | - guq |
| | - gur |
| | - guu |
| | - guw |
| | - gux |
| | - guz |
| | - gvc |
| | - gvf |
| | - gvl |
| | - gvn |
| | - gwi |
| | - gwr |
| | - gya |
| | - gym |
| | - gyr |
| | - hac |
| | - hae |
| | - hag |
| | - hak |
| | - hat |
| | - hav |
| | - haw |
| | - hay |
| | - hbo |
| | - hch |
| | - heb |
| | - heg |
| | - heh |
| | - her |
| | - hif |
| | - hig |
| | - hil |
| | - hin |
| | - hix |
| | - hla |
| | - hmo |
| | - hmr |
| | - hne |
| | - hnj |
| | - hnn |
| | - hns |
| | - hop |
| | - hot |
| | - hra |
| | - hrv |
| | - hrx |
| | - hsb |
| | - hto |
| | - hub |
| | - hui |
| | - hun |
| | - hus |
| | - huu |
| | - huv |
| | - hvn |
| | - hwc |
| | - hye |
| | - hyw |
| | - ian |
| | - iba |
| | - ibg |
| | - ibo |
| | - icr |
| | - ido |
| | - idu |
| | - ifa |
| | - ifb |
| | - ife |
| | - ifk |
| | - ifu |
| | - ify |
| | - ige |
| | - ign |
| | - ike |
| | - ikk |
| | - ikt |
| | - ikw |
| | - ilb |
| | - ile |
| | - ilo |
| | - imo |
| | - ina |
| | - inb |
| | - ind |
| | - inh |
| | - ino |
| | - iou |
| | - ipi |
| | - iqw |
| | - iri |
| | - irk |
| | - iry |
| | - isd |
| | - ish |
| | - isl |
| | - iso |
| | - ita |
| | - itv |
| | - ium |
| | - ivb |
| | - ivv |
| | - iws |
| | - ixl |
| | - izr |
| | - izz |
| | - jaa |
| | - jac |
| | - jae |
| | - jam |
| | - jav |
| | - jbo |
| | - jbu |
| | - jic |
| | - jiv |
| | - jmc |
| | - jpn |
| | - jra |
| | - jun |
| | - jvn |
| | - kaa |
| | - kab |
| | - kac |
| | - kak |
| | - kal |
| | - kam |
| | - kan |
| | - kao |
| | - kaq |
| | - kas |
| | - kat |
| | - kaz |
| | - kbc |
| | - kbd |
| | - kbh |
| | - kbm |
| | - kbo |
| | - kbp |
| | - kbq |
| | - kbr |
| | - kby |
| | - kca |
| | - kcg |
| | - kck |
| | - kdc |
| | - kde |
| | - kdh |
| | - kdi |
| | - kdj |
| | - kdl |
| | - kdr |
| | - kea |
| | - kei |
| | - kek |
| | - ken |
| | - keo |
| | - ker |
| | - kew |
| | - kez |
| | - kff |
| | - kgf |
| | - kgk |
| | - kgp |
| | - kgr |
| | - kha |
| | - khk |
| | - khm |
| | - khs |
| | - khz |
| | - kia |
| | - kij |
| | - kik |
| | - kin |
| | - kir |
| | - kiu |
| | - kix |
| | - kjb |
| | - kje |
| | - kjh |
| | - kjs |
| | - kkc |
| | - kki |
| | - kkj |
| | - kkl |
| | - kle |
| | - klt |
| | - klv |
| | - kmb |
| | - kmg |
| | - kmh |
| | - kmk |
| | - kmm |
| | - kmo |
| | - kmr |
| | - kms |
| | - kmu |
| | - kmy |
| | - knc |
| | - kne |
| | - knf |
| | - kng |
| | - knj |
| | - knk |
| | - kno |
| | - knv |
| | - knx |
| | - kny |
| | - kog |
| | - koi |
| | - koo |
| | - kor |
| | - kos |
| | - kpe |
| | - kpf |
| | - kpg |
| | - kpj |
| | - kpq |
| | - kpr |
| | - kpv |
| | - kpw |
| | - kpx |
| | - kpz |
| | - kqc |
| | - kqe |
| | - kqf |
| | - kql |
| | - kqn |
| | - kqo |
| | - kqp |
| | - kqs |
| | - kqw |
| | - kqy |
| | - krc |
| | - kri |
| | - krj |
| | - krl |
| | - kru |
| | - krx |
| | - ksb |
| | - ksc |
| | - ksd |
| | - ksf |
| | - ksh |
| | - ksj |
| | - ksp |
| | - ksr |
| | - kss |
| | - ksw |
| | - ktb |
| | - ktj |
| | - ktm |
| | - kto |
| | - ktu |
| | - ktz |
| | - kua |
| | - kub |
| | - kud |
| | - kue |
| | - kuj |
| | - kum |
| | - kup |
| | - kus |
| | - kvg |
| | - kvj |
| | - kvn |
| | - kwd |
| | - kwf |
| | - kwi |
| | - kwj |
| | - kwn |
| | - kwy |
| | - kxc |
| | - kxm |
| | - kxw |
| | - kyc |
| | - kyf |
| | - kyg |
| | - kyq |
| | - kyu |
| | - kyz |
| | - kze |
| | - kzf |
| | - kzj |
| | - lac |
| | - lad |
| | - lai |
| | - laj |
| | - lam |
| | - lao |
| | - lap |
| | - lat |
| | - lbb |
| | - lbe |
| | - lbj |
| | - lbk |
| | - lcm |
| | - lcp |
| | - ldi |
| | - ldn |
| | - lee |
| | - lef |
| | - leh |
| | - lem |
| | - leu |
| | - lew |
| | - lex |
| | - lez |
| | - lfn |
| | - lgg |
| | - lgl |
| | - lgm |
| | - lhi |
| | - lhu |
| | - lia |
| | - lid |
| | - lif |
| | - lij |
| | - lim |
| | - lin |
| | - lip |
| | - lis |
| | - lit |
| | - liv |
| | - ljp |
| | - lki |
| | - llb |
| | - lld |
| | - llg |
| | - lln |
| | - lmk |
| | - lmo |
| | - lmp |
| | - lnd |
| | - lob |
| | - loe |
| | - log |
| | - lok |
| | - lol |
| | - lom |
| | - loq |
| | - loz |
| | - lrc |
| | - lsi |
| | - lsm |
| | - ltg |
| | - ltz |
| | - lua |
| | - lub |
| | - luc |
| | - lud |
| | - lue |
| | - lug |
| | - lun |
| | - luo |
| | - lus |
| | - lvs |
| | - lwg |
| | - lwo |
| | - lww |
| | - lzh |
| | - maa |
| | - mad |
| | - maf |
| | - mag |
| | - mah |
| | - mai |
| | - maj |
| | - mak |
| | - mal |
| | - mam |
| | - maq |
| | - mar |
| | - mas |
| | - mau |
| | - mav |
| | - maw |
| | - maz |
| | - mbb |
| | - mbc |
| | - mbd |
| | - mbf |
| | - mbh |
| | - mbi |
| | - mbj |
| | - mbl |
| | - mbs |
| | - mbt |
| | - mca |
| | - mcb |
| | - mcd |
| | - mcf |
| | - mck |
| | - mcn |
| | - mco |
| | - mcp |
| | - mcq |
| | - mcu |
| | - mda |
| | - mdf |
| | - mdy |
| | - med |
| | - mee |
| | - mej |
| | - mek |
| | - men |
| | - meq |
| | - mer |
| | - met |
| | - meu |
| | - mev |
| | - mfe |
| | - mfg |
| | - mfh |
| | - mfi |
| | - mfk |
| | - mfq |
| | - mfy |
| | - mfz |
| | - mgc |
| | - mgh |
| | - mgo |
| | - mgr |
| | - mhi |
| | - mhl |
| | - mhr |
| | - mhw |
| | - mhx |
| | - mhy |
| | - mib |
| | - mic |
| | - mie |
| | - mif |
| | - mig |
| | - mih |
| | - mil |
| | - mim |
| | - min |
| | - mio |
| | - mip |
| | - miq |
| | - mir |
| | - mit |
| | - miy |
| | - miz |
| | - mjc |
| | - mjw |
| | - mkd |
| | - mkl |
| | - mkn |
| | - mks |
| | - mkz |
| | - mlh |
| | - mlp |
| | - mlt |
| | - mlu |
| | - mmn |
| | - mmo |
| | - mmx |
| | - mna |
| | - mnb |
| | - mnf |
| | - mni |
| | - mnk |
| | - mns |
| | - mnw |
| | - mnx |
| | - mny |
| | - moa |
| | - moc |
| | - mog |
| | - moh |
| | - mop |
| | - mor |
| | - mos |
| | - mox |
| | - mpg |
| | - mph |
| | - mpm |
| | - mpp |
| | - mps |
| | - mpt |
| | - mpx |
| | - mqb |
| | - mqj |
| | - mqy |
| | - mrg |
| | - mri |
| | - mrj |
| | - mrq |
| | - mrv |
| | - mrw |
| | - msb |
| | - msc |
| | - mse |
| | - msk |
| | - msy |
| | - mta |
| | - mtg |
| | - mti |
| | - mto |
| | - mtp |
| | - mua |
| | - mug |
| | - muh |
| | - mui |
| | - mup |
| | - mur |
| | - mus |
| | - mux |
| | - muy |
| | - mva |
| | - mvn |
| | - mvp |
| | - mwc |
| | - mwf |
| | - mwl |
| | - mwm |
| | - mwn |
| | - mwp |
| | - mwq |
| | - mwv |
| | - mww |
| | - mxb |
| | - mxp |
| | - mxq |
| | - mxt |
| | - mxv |
| | - mya |
| | - myb |
| | - myk |
| | - myu |
| | - myv |
| | - myw |
| | - myx |
| | - myy |
| | - mza |
| | - mzh |
| | - mzk |
| | - mzl |
| | - mzm |
| | - mzn |
| | - mzw |
| | - mzz |
| | - nab |
| | - naf |
| | - nah |
| | - nak |
| | - nap |
| | - naq |
| | - nas |
| | - nav |
| | - naw |
| | - nba |
| | - nbc |
| | - nbe |
| | - nbl |
| | - nbq |
| | - nbu |
| | - nca |
| | - nch |
| | - ncj |
| | - ncl |
| | - ncq |
| | - nct |
| | - ncu |
| | - ncx |
| | - ndc |
| | - nde |
| | - ndh |
| | - ndi |
| | - ndj |
| | - ndo |
| | - nds |
| | - ndz |
| | - neb |
| | - new |
| | - nfa |
| | - nfr |
| | - ngb |
| | - ngc |
| | - ngl |
| | - ngp |
| | - ngu |
| | - nhd |
| | - nhe |
| | - nhg |
| | - nhi |
| | - nhk |
| | - nho |
| | - nhr |
| | - nhu |
| | - nhw |
| | - nhx |
| | - nhy |
| | - nia |
| | - nif |
| | - nii |
| | - nij |
| | - nim |
| | - nin |
| | - nio |
| | - niu |
| | - niy |
| | - njb |
| | - njm |
| | - njn |
| | - njo |
| | - njz |
| | - nkf |
| | - nko |
| | - nld |
| | - nlg |
| | - nma |
| | - nmf |
| | - nmh |
| | - nmo |
| | - nmw |
| | - nmz |
| | - nnb |
| | - nng |
| | - nnh |
| | - nnl |
| | - nno |
| | - nnp |
| | - nnq |
| | - nnw |
| | - noa |
| | - nob |
| | - nod |
| | - nog |
| | - non |
| | - nop |
| | - not |
| | - nou |
| | - nov |
| | - nph |
| | - npi |
| | - npl |
| | - npo |
| | - npy |
| | - nqo |
| | - nre |
| | - nrf |
| | - nri |
| | - nrm |
| | - nsa |
| | - nse |
| | - nsm |
| | - nsn |
| | - nso |
| | - nss |
| | - nst |
| | - nsu |
| | - ntp |
| | - ntr |
| | - ntu |
| | - nuj |
| | - nus |
| | - nuy |
| | - nvm |
| | - nwb |
| | - nwi |
| | - nwx |
| | - nxd |
| | - nya |
| | - nyf |
| | - nyk |
| | - nyn |
| | - nyo |
| | - nyu |
| | - nyy |
| | - nza |
| | - nzi |
| | - nzm |
| | - obo |
| | - oci |
| | - ogo |
| | - ojb |
| | - oke |
| | - oku |
| | - okv |
| | - old |
| | - olo |
| | - omb |
| | - omw |
| | - ong |
| | - ons |
| | - ood |
| | - opm |
| | - orv |
| | - ory |
| | - oss |
| | - ota |
| | - otd |
| | - ote |
| | - otm |
| | - otn |
| | - oto |
| | - otq |
| | - ots |
| | - otw |
| | - oym |
| | - ozm |
| | - pab |
| | - pad |
| | - pag |
| | - pah |
| | - pam |
| | - pan |
| | - pao |
| | - pap |
| | - pau |
| | - pbb |
| | - pbc |
| | - pbi |
| | - pbt |
| | - pcd |
| | - pck |
| | - pcm |
| | - pdc |
| | - pdt |
| | - pem |
| | - pfe |
| | - pfl |
| | - phm |
| | - pib |
| | - pio |
| | - pir |
| | - pis |
| | - pjt |
| | - pkb |
| | - plg |
| | - pls |
| | - plt |
| | - plu |
| | - plw |
| | - pma |
| | - pmf |
| | - pmq |
| | - pms |
| | - pmx |
| | - pnb |
| | - pne |
| | - pnt |
| | - pny |
| | - poe |
| | - poh |
| | - poi |
| | - pol |
| | - pon |
| | - por |
| | - pos |
| | - pot |
| | - pov |
| | - poy |
| | - ppk |
| | - ppo |
| | - pps |
| | - prf |
| | - prg |
| | - pri |
| | - prq |
| | - pse |
| | - pss |
| | - ptp |
| | - ptu |
| | - pui |
| | - pwg |
| | - pwn |
| | - pww |
| | - pxm |
| | - qub |
| | - quc |
| | - quf |
| | - qug |
| | - quh |
| | - qul |
| | - qup |
| | - qus |
| | - quw |
| | - quy |
| | - quz |
| | - qva |
| | - qvc |
| | - qve |
| | - qvh |
| | - qvi |
| | - qvm |
| | - qvn |
| | - qvo |
| | - qvs |
| | - qvw |
| | - qvz |
| | - qwh |
| | - qxh |
| | - qxl |
| | - qxn |
| | - qxo |
| | - qxr |
| | - rad |
| | - rai |
| | - rap |
| | - rar |
| | - rav |
| | - raw |
| | - rcf |
| | - rej |
| | - rel |
| | - rgu |
| | - rhg |
| | - ria |
| | - rim |
| | - rjs |
| | - rkb |
| | - rmc |
| | - rme |
| | - rml |
| | - rmn |
| | - rmo |
| | - rmq |
| | - rmy |
| | - rnd |
| | - rng |
| | - rnl |
| | - roh |
| | - ron |
| | - roo |
| | - rop |
| | - row |
| | - rro |
| | - rtm |
| | - rub |
| | - rue |
| | - ruf |
| | - rug |
| | - run |
| | - rup |
| | - rus |
| | - rwo |
| | - sab |
| | - sag |
| | - sah |
| | - san |
| | - sas |
| | - sat |
| | - sba |
| | - sbd |
| | - sbe |
| | - sbl |
| | - sbs |
| | - sby |
| | - sck |
| | - scn |
| | - sco |
| | - sda |
| | - sdc |
| | - sdh |
| | - sdo |
| | - sdq |
| | - seh |
| | - ses |
| | - sey |
| | - sfw |
| | - sgb |
| | - sgc |
| | - sgh |
| | - sgs |
| | - sgw |
| | - sgz |
| | - shi |
| | - shk |
| | - shn |
| | - shp |
| | - shu |
| | - sid |
| | - sig |
| | - sil |
| | - sim |
| | - sin |
| | - sja |
| | - sjo |
| | - sju |
| | - skg |
| | - skr |
| | - sld |
| | - slk |
| | - sll |
| | - slv |
| | - sma |
| | - sme |
| | - smj |
| | - smk |
| | - sml |
| | - smn |
| | - smo |
| | - sms |
| | - smt |
| | - sna |
| | - snc |
| | - snd |
| | - snf |
| | - snn |
| | - snp |
| | - snw |
| | - sny |
| | - soe |
| | - som |
| | - sop |
| | - soq |
| | - sot |
| | - soy |
| | - spa |
| | - spl |
| | - spm |
| | - spp |
| | - sps |
| | - spy |
| | - srd |
| | - sri |
| | - srm |
| | - srn |
| | - srp |
| | - srq |
| | - srr |
| | - ssd |
| | - ssg |
| | - ssw |
| | - ssx |
| | - stn |
| | - stp |
| | - stq |
| | - sua |
| | - suc |
| | - sue |
| | - suk |
| | - sun |
| | - sur |
| | - sus |
| | - suz |
| | - swb |
| | - swc |
| | - swe |
| | - swg |
| | - swh |
| | - swk |
| | - swp |
| | - sxb |
| | - sxn |
| | - syb |
| | - syc |
| | - syl |
| | - szl |
| | - szy |
| | - tab |
| | - tac |
| | - tah |
| | - taj |
| | - tam |
| | - tap |
| | - taq |
| | - tar |
| | - tat |
| | - tav |
| | - taw |
| | - tay |
| | - tbc |
| | - tbg |
| | - tbk |
| | - tbl |
| | - tbo |
| | - tbw |
| | - tby |
| | - tbz |
| | - tca |
| | - tcc |
| | - tcf |
| | - tcs |
| | - tcy |
| | - tcz |
| | - ted |
| | - tee |
| | - tel |
| | - tem |
| | - teo |
| | - ter |
| | - tet |
| | - tew |
| | - tfr |
| | - tgk |
| | - tgo |
| | - tgp |
| | - tha |
| | - thk |
| | - thl |
| | - tif |
| | - tig |
| | - tih |
| | - tik |
| | - tim |
| | - tir |
| | - tiv |
| | - tiy |
| | - tke |
| | - tkl |
| | - tkr |
| | - tku |
| | - tlb |
| | - tlf |
| | - tlh |
| | - tlj |
| | - tll |
| | - tly |
| | - tmc |
| | - tmd |
| | - tna |
| | - tnc |
| | - tnk |
| | - tnn |
| | - tnp |
| | - tnr |
| | - tob |
| | - toc |
| | - tod |
| | - tog |
| | - toh |
| | - toi |
| | - toj |
| | - tok |
| | - ton |
| | - too |
| | - top |
| | - tos |
| | - tpa |
| | - tpi |
| | - tpm |
| | - tpp |
| | - tpt |
| | - tpw |
| | - tpz |
| | - tqo |
| | - trc |
| | - trn |
| | - tro |
| | - trp |
| | - trq |
| | - trs |
| | - trv |
| | - tsc |
| | - tsg |
| | - tsn |
| | - tso |
| | - tsw |
| | - tsz |
| | - ttc |
| | - tte |
| | - ttj |
| | - ttq |
| | - tuc |
| | - tue |
| | - tuf |
| | - tui |
| | - tuk |
| | - tul |
| | - tum |
| | - tuo |
| | - tur |
| | - tuv |
| | - tvk |
| | - tvl |
| | - twi |
| | - twu |
| | - twx |
| | - txq |
| | - txu |
| | - tyv |
| | - tzh |
| | - tzj |
| | - tzl |
| | - tzm |
| | - tzo |
| | - ubr |
| | - ubu |
| | - udm |
| | - udu |
| | - uig |
| | - ukr |
| | - umb |
| | - upv |
| | - ura |
| | - urb |
| | - urd |
| | - urh |
| | - uri |
| | - urk |
| | - urt |
| | - urw |
| | - ury |
| | - usa |
| | - usp |
| | - uth |
| | - uvh |
| | - uvl |
| | - uzn |
| | - uzs |
| | - vag |
| | - vap |
| | - var |
| | - vec |
| | - ven |
| | - vep |
| | - vid |
| | - vie |
| | - viv |
| | - vls |
| | - vmk |
| | - vmw |
| | - vmy |
| | - vol |
| | - vot |
| | - vro |
| | - vun |
| | - vut |
| | - waj |
| | - wal |
| | - wap |
| | - war |
| | - wat |
| | - way |
| | - wba |
| | - wbm |
| | - wbp |
| | - wed |
| | - wer |
| | - wes |
| | - wew |
| | - whg |
| | - whk |
| | - wib |
| | - wim |
| | - wiu |
| | - wln |
| | - wls |
| | - wlv |
| | - wlx |
| | - wmt |
| | - wmw |
| | - wnc |
| | - wnu |
| | - wob |
| | - wol |
| | - wos |
| | - wrk |
| | - wrs |
| | - wsg |
| | - wsk |
| | - wuu |
| | - wuv |
| | - wwa |
| | - xal |
| | - xav |
| | - xbi |
| | - xbr |
| | - xed |
| | - xho |
| | - xla |
| | - xmf |
| | - xmm |
| | - xmv |
| | - xnn |
| | - xog |
| | - xon |
| | - xrb |
| | - xsb |
| | - xsi |
| | - xsm |
| | - xsr |
| | - xsu |
| | - xtd |
| | - xtm |
| | - xtn |
| | - xuo |
| | - yaa |
| | - yad |
| | - yal |
| | - yam |
| | - yan |
| | - yao |
| | - yap |
| | - yaq |
| | - yat |
| | - yaz |
| | - ybb |
| | - yby |
| | - ycn |
| | - ydd |
| | - yim |
| | - yka |
| | - yle |
| | - yli |
| | - yml |
| | - yom |
| | - yon |
| | - yor |
| | - yrb |
| | - yre |
| | - yrk |
| | - yrl |
| | - yss |
| | - yua |
| | - yue |
| | - yuj |
| | - yup |
| | - yut |
| | - yuw |
| | - yuz |
| | - yva |
| | - zaa |
| | - zab |
| | - zac |
| | - zad |
| | - zae |
| | - zai |
| | - zam |
| | - zao |
| | - zar |
| | - zas |
| | - zat |
| | - zav |
| | - zaw |
| | - zca |
| | - zdj |
| | - zea |
| | - zgh |
| | - zia |
| | - ziw |
| | - zne |
| | - zom |
| | - zos |
| | - zpa |
| | - zpc |
| | - zpg |
| | - zpi |
| | - zpj |
| | - zpl |
| | - zpm |
| | - zpo |
| | - zpq |
| | - zpt |
| | - zpu |
| | - zpv |
| | - zpz |
| | - zsm |
| | - zsr |
| | - ztq |
| | - zty |
| | - zul |
| | - zyb |
| | - zyp |
| | --- |
| | |
| | # mmBERT: A Modern Multilingual Encoder |
| |
|
| | [](https://opensource.org/licenses/MIT) |
| | [](https://arxiv.org/abs/2509.06888) |
| | [](https://huggingface.co/jhu-clsp/mmBERT-base) |
| | [](https://huggingface.co/collections/jhu-clsp/mmbert-a-modern-multilingual-encoder-68b725831d7c6e3acc435ed4) |
| | [](https://github.com/jhu-clsp/mmBERT) |
| |
|
| | > TL;DR: A state-of-the-art multilingual encoder trained on 3T+ tokens across 1800+ languages, introducing novel techniques for learning low-resource languages during the decay phase. |
| |
|
| | mmBERT is a modern multilingual encoder that significantly outperforms previous generation models like XLM-R on classification, embedding, and retrieval tasks. Built on the ModernBERT architecture with novel multilingual training innovations, mmBERT demonstrates that low-resource languages can be effectively learned during the decay phase of training. It is also significantly faster than any previous multilingual encoder. |
| |
|
| | ## Table of Contents |
| | - [Highlights](#highlights) |
| | - [Quick Start](#quick-start) |
| | - [Model Description](#model-description) |
| | - [Novel Training Innovations](#novel-training-innovations) |
| | - [Model Family](#model-family) |
| | - [Training Data](#training-data) |
| | - [Usage Examples](#usage-examples) |
| | - [Fine-tuning Examples](#fine-tuning-examples) |
| | - [Model Architecture](#model-architecture) |
| | - [Citation](#citation) |
| |
|
| |
|
| | ## Quick Start |
| |
|
| | ### Installation |
| | ```bash |
| | pip install "torch>=1.9.0" |
| | pip install "transformers>=4.21.0" |
| | ``` |
| |
|
| | ### Usage |
| |
|
| | ```python |
| | from transformers import AutoTokenizer, AutoModel |
| | |
| | tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base") |
| | model = AutoModel.from_pretrained("jhu-clsp/mmBERT-base") |
| | |
| | inputs = tokenizer("Hello world", return_tensors="pt") |
| | outputs = model(**inputs) |
| | ``` |
| |
|
| | ## Model Description |
| |
|
| | mmBERT represents the first significant advancement over XLM-R for massively multilingual encoder models. Key features include: |
| |
|
| | 1. **Massive Language Coverage** - Trained on over 1800 languages with progressive inclusion strategy |
| | 2. **Modern Architecture** - Built on ModernBERT foundation with Flash Attention 2 and unpadding techniques |
| | 3. **Novel Training Recipe** - Introduces inverse mask scheduling and temperature sampling |
| | 4. **Open Training Data** - Complete 3T+ token dataset publicly available |
| | 5. **Decay Phase Innovation** - Demonstrates effective learning of low-resource languages in final training phase |
| |
|
| | The model uses bidirectional attention with masked language modeling objectives, optimized specifically for multilingual understanding and cross-lingual transfer. |
| |
|
| | ## Novel Training Innovations |
| |
|
| | **Progressive Language Addition**: Start with 60 high-resource languages, expand to 110 mid-resource languages, then include all 1833 languages in decay phase. |
| |
|
| | **Inverse Mask Schedule**: Reduce mask ratio from 30% → 15% → 5% across training phases for progressively refined learning. |
| |
|
| | **Inverse Temperature Sampling**: Adjust multilingual sampling from high-resource bias (τ=0.7) to uniform sampling (τ=0.3). |
| |
|
| | **Model Merging**: Combine English-focused, high-resource, and all-language decay variants using TIES merging. |
| |
|
| | ## Model Family |
| |
|
| | | Model | Total Params | Non-embed Params | Languages | Download | |
| | |:------|:-------------|:------------------|:----------|:---------| |
| | | [mmBERT-small](https://huggingface.co/jhu-clsp/mmBERT-small) | 140M | 42M | 1800+ | [](https://huggingface.co/jhu-clsp/mmBERT-small) | |
| | | [mmBERT-base](https://huggingface.co/jhu-clsp/mmBERT-base) | 307M | 110M | 1800+ | [](https://huggingface.co/jhu-clsp/mmBERT-base) | |
| |
|
| | ## Training Data |
| |
|
| | mmBERT training data is publicly available across different phases: |
| |
|
| | | Phase | Dataset | Tokens | Description | |
| | |:------|:--------|:-------|:------------| |
| | | Pre-training P1 | [mmbert-pretrain-p1](https://huggingface.co/datasets/jhu-clsp/mmbert-pretrain-p1-fineweb2-langs) | 2.3T | 60 languages, foundational training | |
| | | Pre-training P2 | [mmbert-pretrain-p2](https://huggingface.co/datasets/jhu-clsp/mmbert-pretrain-p2-fineweb2-remaining) | - | Extension data for pre-training phase | |
| | | Pre-training P3 | [mmbert-pretrain-p3](https://huggingface.co/datasets/jhu-clsp/mmbert-pretrain-p3-others) | - | Final pre-training data | |
| | | Mid-training | [mmbert-midtraining](https://huggingface.co/datasets/jhu-clsp/mmbert-midtraining) | 600B | 110 languages, context extension to 8K | |
| | | Decay Phase | [mmbert-decay](https://huggingface.co/datasets/jhu-clsp/mmbert-decay) | 100B | 1833 languages, premium quality | |
| |
|
| | **Data Sources**: Filtered DCLM (English), FineWeb2 (multilingual), FineWeb2-HQ (20 high-resource languages), Wikipedia (MegaWika), code repositories (StarCoder, ProLong), academic papers (ArXiv, PeS2o), and community discussions (StackExchange). |
| |
|
| | ## Model Architecture |
| |
|
| | | Parameter | mmBERT-small | mmBERT-base | |
| | |:----------|:-------------|:------------| |
| | | Layers | 22 | 22 | |
| | | Hidden Size | 384 | 768 | |
| | | Intermediate Size | 1152 | 1152 | |
| | | Attention Heads | 6 | 12 | |
| | | Total Parameters | 140M | 307M | |
| | | Non-embedding Parameters | 42M | 110M | |
| | | Max Sequence Length | 8192 | 8192 | |
| | | Vocabulary Size | 256,000 | 256,000 | |
| | | Tokenizer | Gemma 2 | Gemma 2 | |
| |
|
| | ## Usage Examples |
| |
|
| | ### Masked Language Modeling |
| |
|
| | ```python |
| | from transformers import AutoTokenizer, AutoModelForMaskedLM |
| | import torch |
| | |
| | tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base") |
| | model = AutoModelForMaskedLM.from_pretrained("jhu-clsp/mmBERT-base") |
| | |
| | def predict_masked_token(text): |
| | inputs = tokenizer(text, return_tensors="pt") |
| | with torch.no_grad(): |
| | outputs = model(**inputs) |
| | |
| | mask_indices = torch.where(inputs["input_ids"] == tokenizer.mask_token_id) |
| | predictions = outputs.logits[mask_indices] |
| | top_tokens = torch.topk(predictions, 5, dim=-1) |
| | |
| | return [tokenizer.decode(token) for token in top_tokens.indices[0]] |
| | |
| | # Works across languages |
| | texts = [ |
| | "The capital of France is <mask>.", |
| | "La capital de España es <mask>.", |
| | "Die Hauptstadt von Deutschland ist <mask>." |
| | ] |
| | |
| | for text in texts: |
| | predictions = predict_masked_token(text) |
| | print(f"Text: {text}") |
| | print(f"Predictions: {predictions}") |
| | ``` |
| |
|
| | ### Cross-lingual Embeddings |
| |
|
| | ```python |
| | from transformers import AutoTokenizer, AutoModel |
| | import torch |
| | from sklearn.metrics.pairwise import cosine_similarity |
| | |
| | tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base") |
| | model = AutoModel.from_pretrained("jhu-clsp/mmBERT-base") |
| | |
| | def get_embeddings(texts): |
| | inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt") |
| | |
| | with torch.no_grad(): |
| | outputs = model(**inputs) |
| | embeddings = outputs.last_hidden_state.mean(dim=1) |
| | |
| | return embeddings.numpy() |
| | |
| | multilingual_texts = [ |
| | "Artificial intelligence is transforming technology", |
| | "La inteligencia artificial está transformando la tecnología", |
| | "L'intelligence artificielle transforme la technologie", |
| | "人工智能正在改变技术" |
| | ] |
| | |
| | embeddings = get_embeddings(multilingual_texts) |
| | similarities = cosine_similarity(embeddings) |
| | print("Cross-lingual similarity matrix:") |
| | print(similarities) |
| | ``` |
| |
|
| | ## Fine-tuning Examples |
| |
|
| | ### Dense Retrieval with Sentence Transformers |
| |
|
| | <details> |
| | <summary>Click to expand dense retrieval fine-tuning example</summary> |
| |
|
| | ```python |
| | import argparse |
| | from datasets import load_dataset |
| | from sentence_transformers import ( |
| | SentenceTransformer, |
| | SentenceTransformerTrainer, |
| | SentenceTransformerTrainingArguments, |
| | ) |
| | from sentence_transformers.evaluation import TripletEvaluator |
| | from sentence_transformers.losses import CachedMultipleNegativesRankingLoss |
| | from sentence_transformers.training_args import BatchSamplers |
| | |
def main():
    """Fine-tune mmBERT as a dense (bi-encoder) retriever on MS MARCO triplets.

    Trains with a cached multiple-negatives ranking loss and reports triplet
    accuracy on a held-out split before and after training.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", type=float, default=8e-5)
    parser.add_argument("--model_name", type=str, default="jhu-clsp/mmBERT-base")
    args = parser.parse_args()

    lr = args.lr
    model_name = args.model_name
    # Short name (e.g. "mmBERT-base") used in run names and output paths below.
    model_shortname = model_name.split("/")[-1]

    model = SentenceTransformer(model_name)

    # (query, positive, hard-negative) triplets mined from MS MARCO.
    dataset = load_dataset(
        "sentence-transformers/msmarco-co-condenser-margin-mse-sym-mnrl-mean-v1",
        "triplet-hard",
        split="train",
    )
    dataset_dict = dataset.train_test_split(test_size=1_000, seed=12)
    train_dataset = dataset_dict["train"].select(range(1_250_000))
    eval_dataset = dataset_dict["test"]

    # The cached variant keeps the large effective batch (512) for in-batch
    # negatives while only encoding mini_batch_size examples at a time.
    loss = CachedMultipleNegativesRankingLoss(model, mini_batch_size=16)
    run_name = f"{model_shortname}-DPR-{lr}"

    training_args = SentenceTransformerTrainingArguments(
        output_dir=f"output/{model_shortname}/{run_name}",
        num_train_epochs=1,
        per_device_train_batch_size=512,
        per_device_eval_batch_size=512,
        warmup_ratio=0.05,
        fp16=False,  # bfloat16 is used instead of float16
        bf16=True,
        # Keeps duplicate texts out of a batch so they cannot act as false negatives.
        batch_sampler=BatchSamplers.NO_DUPLICATES,
        learning_rate=lr,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        logging_steps=500,
        run_name=run_name,
    )

    dev_evaluator = TripletEvaluator(
        anchors=eval_dataset["query"],
        positives=eval_dataset["positive"],
        negatives=eval_dataset["negative"],
        name="msmarco-co-condenser-dev",
    )
    # Baseline evaluation of the un-fine-tuned model for comparison.
    dev_evaluator(model)

    trainer = SentenceTransformerTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=loss,
        evaluator=dev_evaluator,
    )
    trainer.train()

    model.save_pretrained(f"output/{model_shortname}/{run_name}/final")
    model.push_to_hub(run_name, private=False)

if __name__ == "__main__":
    main()
| | ``` |
| |
|
| | </details> |
| |
|
| | ### Cross-lingual Classification |
| |
|
| | <details> |
| | <summary>Click to expand multilingual classification fine-tuning example</summary> |
| |
|
| | ```python |
| | from transformers import ( |
| | AutoTokenizer, |
| | AutoModelForSequenceClassification, |
| | TrainingArguments, |
| | Trainer |
| | ) |
| | from datasets import load_dataset |
| | import numpy as np |
| | from sklearn.metrics import accuracy_score, f1_score |
| | |
def compute_metrics(eval_pred):
    """Return accuracy and weighted-average F1 for a Trainer evaluation step.

    `eval_pred` is a (logits, labels) pair; the predicted class is the
    argmax over the logit dimension.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    metrics = {}
    metrics['accuracy'] = accuracy_score(labels, predicted_classes)
    metrics['f1'] = f1_score(labels, predicted_classes, average='weighted')
    return metrics
| | |
def main():
    """Fine-tune mmBERT on XNLI ("all_languages") for cross-lingual NLI.

    In this XNLI config each `premise` is a {language: text} dict and each
    `hypothesis` holds parallel "language"/"translation" lists, so rows are
    flattened into one (premise, hypothesis) pair per language before
    tokenization. Labels (0/1/2 = entailment/neutral/contradiction) are
    shared across translations of the same pair.
    """
    from transformers import DataCollatorWithPadding

    model_name = "jhu-clsp/mmBERT-base"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3  # XNLI has three NLI classes
    )

    dataset = load_dataset("xnli", "all_languages")

    def tokenize_function(examples):
        # Flatten: one example per language of each premise/hypothesis pair.
        texts = []
        labels = []
        for premise, hypothesis, label in zip(
            examples["premise"], examples["hypothesis"], examples["label"]
        ):
            for lang, translation in zip(
                hypothesis["language"], hypothesis["translation"]
            ):
                texts.append(f"{premise[lang]} {tokenizer.sep_token} {translation}")
                labels.append(label)

        # No padding here: the data collator pads dynamically per batch,
        # which avoids mismatched lengths across map() batches.
        encoded = tokenizer(texts, truncation=True, max_length=512)
        encoded["label"] = labels
        return encoded

    # remove_columns is required because flattening changes the row count.
    train_dataset = dataset["train"].map(
        tokenize_function, batched=True, remove_columns=dataset["train"].column_names
    )
    eval_dataset = dataset["validation"].map(
        tokenize_function, batched=True, remove_columns=dataset["validation"].column_names
    )

    training_args = TrainingArguments(
        output_dir="./mmbert-xnli",
        learning_rate=3e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )

    trainer.train()

if __name__ == "__main__":
    main()
| | ``` |
| |
|
| | </details> |
| |
|
| | ### Multilingual Reranking |
| |
|
| | <details> |
| | <summary>Click to expand multilingual reranking fine-tuning example</summary> |
| |
|
| | ```python |
| | import logging |
| | from datasets import load_dataset |
| | from sentence_transformers.cross_encoder import ( |
| | CrossEncoder, |
| | CrossEncoderModelCardData, |
| | CrossEncoderTrainer, |
| | CrossEncoderTrainingArguments, |
| | ) |
| | from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator |
| | from sentence_transformers.cross_encoder.losses import BinaryCrossEntropyLoss |
| | from sentence_transformers.util import mine_hard_negatives |
| | from sentence_transformers import SentenceTransformer |
| | import torch |
| | |
def main():
    """Fine-tune mmBERT as a cross-encoder reranker on GooAQ with mined hard negatives."""
    model_name = "jhu-clsp/mmBERT-base"
    train_batch_size = 32
    num_epochs = 2
    num_hard_negatives = 7  # negatives mined per (question, answer) pair

    model = CrossEncoder(
        model_name,
        model_card_data=CrossEncoderModelCardData(
            language="multilingual",
            license="mit",
        ),
    )

    # 50k GooAQ (question, answer) pairs; 1k held out for evaluation.
    full_dataset = load_dataset("sentence-transformers/gooaq", split="train").select(range(50_000))
    dataset_dict = full_dataset.train_test_split(test_size=1_000, seed=42)
    train_dataset = dataset_dict["train"]
    eval_dataset = dataset_dict["test"]

    # A lightweight bi-encoder (on CPU) retrieves candidates so that
    # top-ranked non-gold answers can be used as hard negatives.
    embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
    hard_train_dataset = mine_hard_negatives(
        train_dataset,
        embedding_model,
        num_negatives=num_hard_negatives,
        margin=0,
        range_min=0,
        range_max=100,  # only consider the top-100 retrieved candidates
        sampling_strategy="top",
        batch_size=2048,
        output_format="labeled-pair",  # emits (query, passage, 0/1 label) pairs for BCE loss
        use_faiss=True,
    )

    # pos_weight compensates for the 1:num_hard_negatives class imbalance.
    loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(num_hard_negatives))

    # Reranking evaluation on small NanoBEIR subsets during training.
    nano_beir_evaluator = CrossEncoderNanoBEIREvaluator(
        dataset_names=["msmarco", "nfcorpus", "nq"],
        batch_size=train_batch_size,
    )

    args = CrossEncoderTrainingArguments(
        output_dir="./mmbert-reranker",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=train_batch_size,
        learning_rate=2e-5,
        warmup_ratio=0.1,
        fp16=False,  # bfloat16 is used instead of float16
        bf16=True,
        dataloader_num_workers=4,
        load_best_model_at_end=True,
        metric_for_best_model="eval_msmarco_ndcg@10",  # nDCG@10 on the NanoMSMARCO subset
        eval_strategy="steps",
        eval_steps=1000,
        save_strategy="steps",
        save_steps=1000,
        save_total_limit=2,
        logging_steps=200,
        seed=42,
    )

    trainer = CrossEncoderTrainer(
        model=model,
        args=args,
        train_dataset=hard_train_dataset,
        loss=loss,
        evaluator=nano_beir_evaluator,
    )
    trainer.train()

    model.save_pretrained("./mmbert-reranker/final")

if __name__ == "__main__":
    main()
| | ``` |
| |
|
| | </details> |
| |
|
| | ## Training Data |
| |
|
| | mmBERT was trained on a carefully curated 3T+ token multilingual dataset: |
| |
|
| Phase | Tokens | Description |
|:------|:-------|:------------|
| [Pre-training P1](https://huggingface.co/datasets/jhu-clsp/mmbert-pretrain-p1-fineweb2-langs) | 2.3T tokens | 60 languages, diverse data mixture |
| [Pre-training P2](https://huggingface.co/datasets/jhu-clsp/mmbert-pretrain-p2-fineweb2-remaining) | - | Extension data for pre-training |
| [Pre-training P3](https://huggingface.co/datasets/jhu-clsp/mmbert-pretrain-p3-others) | - | Final pre-training data |
| [Mid-training](https://huggingface.co/datasets/jhu-clsp/mmbert-midtraining) | 600B tokens | 110 languages, context extension |
| [Decay Phase](https://huggingface.co/datasets/jhu-clsp/mmbert-decay) | 100B tokens | 1833 languages, premium quality |
| |
|
| | **Primary Sources:** |
| | - **Filtered DCLM**: High-quality English content |
| | - **FineWeb2**: Broad multilingual web coverage (1800+ languages) |
| | - **FineWeb2-HQ**: Filtered subset of 20 high-resource languages |
| | - **Code**: StarCoder and ProLong repositories |
| | - **Academic**: ArXiv papers and PeS2o scientific content |
| | - **Reference**: Wikipedia (MegaWika) and textbooks |
| | - **Community**: StackExchange discussions |
| |
|
| |
|
| | ## Citation |
| |
|
| | If you use mmBERT in your research, please cite our work: |
| |
|
| | ```bibtex |
| | @misc{marone2025mmbertmodernmultilingualencoder, |
| | title={mmBERT: A Modern Multilingual Encoder with Annealed Language Learning}, |
| | author={Marc Marone and Orion Weller and William Fleshman and Eugene Yang and Dawn Lawrie and Benjamin Van Durme}, |
| | year={2025}, |
| | eprint={2509.06888}, |
| | archivePrefix={arXiv}, |
| | primaryClass={cs.CL}, |
| | url={https://arxiv.org/abs/2509.06888}, |
| | } |
| | ``` |