--- license: cc-by-nc-4.0 language: - ace - acm - acq - aeb - af - ajp - ak - am - apc - ar - ars - ary - arz - as - ast - awa - ay - azb - azj - ba - bm - ban - be - bem - bn - bho - bjn - bo - bs - bug - bg - ca - ceb - cs - cjk - ckb - crh - cy - da - de - dik - dyu - dz - el - en - eo - et - eu - ee - fo - fa - fj - fi - fon - fr - fur - ff - gd - ga - gl - gn - gu - ht - ha - he - hi - hne - hr - hu - hy - ig - ilo - id - is - it - jv - ja - kab - kac - kam - kn - ks - ka - kr - kk - kbp - kea - km - ki - rw - ky - kmb - kg - ko - kmr - lo - lv - lij - li - ln - lt - lmo - ltg - lb - lua - lg - luo - lus - mag - mai - ml - mr - min - mk - plt - mt - mni - mn - mos - mi - ms - my - nl - nn - nb - ne - nso - nus - ny - oc - gaz - ory - pag - pa - pap - pl - pt - prs - pbt - qu - ro - rn - ru - sg - sa - sat - scn - shn - si - sk - sl - sm - sn - sd - so - st - es - als - sc - sr - ss - su - sv - sw - szl - ta - tt - te - tg - tl - th - ti - taq - tpi - tn - ts - tk - tum - tr - tw - tzm - ug - uk - umb - ur - uz - vec - vi - war - wo - xh - yi - yo - yue - zh - zu language_details: >- ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab, aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab, asm_Beng, ast_Latn, awa_Deva, ayr_Latn, azb_Arab, azj_Latn, bak_Cyrl, bam_Latn, ban_Latn, bel_Cyrl, bem_Latn, ben_Beng, bho_Deva, bjn_Arab, bod_Tibt, bos_Latn, bug_Latn, bul_Cyrl, cat_Latn, ceb_Latn, ces_Latn, cjk_Latn, ckb_Arab, crh_Latn, cym_Latn, dan_Latn, deu_Latn, dik_Latn, dyu_Latn, dzo_Tibt, ell_Grek, eng_Latn, epo_Latn, est_Latn, eus_Latn, ewe_Latn, fao_Latn, pes_Arab, fij_Latn, fin_Latn, fon_Latn, fra_Latn, fur_Latn, fuv_Latn, gla_Latn, gle_Latn, glg_Latn, grn_Latn, guj_Gujr, hat_Latn, hau_Latn, heb_Hebr, hin_Deva, hne_Deva, hrv_Latn, hun_Latn, hye_Armn, ibo_Latn, ilo_Latn, ind_Latn, isl_Latn, ita_Latn, jav_Latn, jpn_Jpan, kab_Latn, kac_Latn, kam_Latn, kan_Knda, kas_Arab, kas_Deva, kat_Geor, knc_Arab, knc_Latn, kaz_Cyrl, kbp_Latn, 
kea_Latn, khm_Khmr, kik_Latn, kin_Latn, kir_Cyrl, kmb_Latn, kon_Latn, kor_Hang, kmr_Latn, lao_Laoo, lvs_Latn, lij_Latn, lim_Latn, lin_Latn, lit_Latn, lmo_Latn, ltg_Latn, ltz_Latn, lua_Latn, lug_Latn, luo_Latn, lus_Latn, mag_Deva, mai_Deva, mal_Mlym, mar_Deva, min_Latn, mkd_Cyrl, plt_Latn, mlt_Latn, mni_Beng, khk_Cyrl, mos_Latn, mri_Latn, zsm_Latn, mya_Mymr, nld_Latn, nno_Latn, nob_Latn, npi_Deva, nso_Latn, nus_Latn, nya_Latn, oci_Latn, gaz_Latn, ory_Orya, pag_Latn, pan_Guru, pap_Latn, pol_Latn, por_Latn, prs_Arab, pbt_Arab, quy_Latn, ron_Latn, run_Latn, rus_Cyrl, sag_Latn, san_Deva, sat_Beng, scn_Latn, shn_Mymr, sin_Sinh, slk_Latn, slv_Latn, smo_Latn, sna_Latn, snd_Arab, som_Latn, sot_Latn, spa_Latn, als_Latn, srd_Latn, srp_Cyrl, ssw_Latn, sun_Latn, swe_Latn, swh_Latn, szl_Latn, tam_Taml, tat_Cyrl, tel_Telu, tgk_Cyrl, tgl_Latn, tha_Thai, tir_Ethi, taq_Latn, taq_Tfng, tpi_Latn, tsn_Latn, tso_Latn, tuk_Latn, tum_Latn, tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab, uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr, yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn pipeline_tag: sentence-similarity --- # Multilingual & Multimodal NLI (MMNLI) The full details of the MMNLI model, including architecture, training, and evaluation, are described in the paper [Beyond Similarity Scoring: Detecting Entailment and Contradiction in Multilingual and Multimodal Contexts](https://www.isca-speech.org/archive/Interspeech_2025/paper286.pdf) by Istaiteh, O., Mdhaffar, S., & Estève, Y. (Interspeech 2025). Please cite this paper if you use the MMNLI model in your research. This repository provides the **MMNLI model**, a multilingual and multimodal Natural Language Inference classifier. It extends the BLASER architecture into **multiclass NLI**, supporting entailment, contradiction, and neutrality across text-text, text-speech, speech-text, and speech-speech input pairs. 
The model is trained on the [oist/multimodal_nli_dataset](https://huggingface.co/datasets/oist/multimodal_nli_dataset). Please refer to that dataset card for details. ### Results On the test set of the dataset, the MMNLI model achieves an **F1-micro score of 0.749**. --- ## Usage The model depends on **SONAR embeddings**. You can use the official SONAR encoders (for text and speech) [from GitHub](https://github.com/facebookresearch/SONAR/tree/main) or the **ported SONAR text encoder** [`cointegrated/SONAR_200_text_encoder`](https://huggingface.co/cointegrated/SONAR_200_text_encoder). --- ### Example 1: Speech–Text Inference ```python import torch from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline from transformers import AutoModel # 1. Load SONAR encoders speech_encoder = SpeechToEmbeddingModelPipeline(encoder="sonar_speech_encoder_eng") text_encoder = TextToEmbeddingModelPipeline(encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder") # 2. Encode premise (speech) and hypothesis (text) premise_embs = speech_encoder.predict(["audio.wav"]) hypothesis_embs = text_encoder.predict(["The cat sat on the mat."], source_lang="eng_Latn") # 3. Load MMNLI model mmnli_model_name = "oist/multimodal_nli_model" mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True) mmnli_model.eval() # 4. Run inference with torch.inference_mode(): logits = mmnli_model(premise_embs, hypothesis_embs) # returns [batch_size, 3] pred_class = torch.argmax(logits, dim=-1).item() print("Prediction:", pred_class) # 0 = Entailment, 1 = Neutral, 2 = Contradiction ``` ### Example 2: Text–Text Inference (Official SONAR) ```python import torch from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline from transformers import AutoModel # 1. 
Load official SONAR text encoder text_encoder = TextToEmbeddingModelPipeline( encoder="text_sonar_basic_encoder", tokenizer="text_sonar_basic_encoder" ) # 2. Encode premise and hypothesis premise_texts = ["Le chat s'assit sur le tapis."] hypothesis_texts = ["The cat sat on the mat."] premise_embs = text_encoder.predict(premise_texts, source_lang="fra_Latn") hypothesis_embs = text_encoder.predict(hypothesis_texts, source_lang="eng_Latn") # 3. Load MMNLI model mmnli_model = AutoModel.from_pretrained("oist/multimodal_nli_model", trust_remote_code=True) mmnli_model.eval() # 4. Run inference with torch.inference_mode(): logits = mmnli_model(premise_embs, hypothesis_embs) pred_class = torch.argmax(logits, dim=-1).item() print("Prediction:", pred_class) # 0 = Entailment, 1 = Neutral, 2 = Contradiction ``` ### Example 3: Text–Text Inference (Ported SONAR) ```python # !pip install transformers sentencepiece torch -q import torch from transformers import AutoTokenizer, AutoModel from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder # 1. Load ported SONAR text encoder sonar_model_name = "cointegrated/SONAR_200_text_encoder" encoder = M2M100Encoder.from_pretrained(sonar_model_name) tokenizer = AutoTokenizer.from_pretrained(sonar_model_name) def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False): tokenizer.src_lang = lang with torch.inference_mode(): batch = tokenizer(texts, return_tensors='pt', padding=True) seq_embs = encoder(**batch).last_hidden_state mask = batch.attention_mask mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1) if norm: mean_emb = torch.nn.functional.normalize(mean_emb) return mean_emb # Example sentences premise_sentences = ["Le chat s'assit sur le tapis."] hypothesis_sentences = ["The cat sat on the mat."] # 2. 
Encode premise and hypothesis premise_embs = encode_mean_pool(premise_sentences, tokenizer, encoder, lang="fra_Latn") hypothesis_embs = encode_mean_pool(hypothesis_sentences, tokenizer, encoder, lang="eng_Latn") # 3. Load MMNLI model mmnli_model_name = "oist/multimodal_nli_model" mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True) mmnli_model.eval() # 4. Run inference with torch.inference_mode(): logits = mmnli_model(premise_embs, hypothesis_embs) # returns [batch_size, 3] pred_class = torch.argmax(logits, dim=-1).item() print("Prediction:", pred_class) # 0 = Entailment, 1 = Neutral, 2 = Contradiction ``` ### Example 4: Using BLASER Semantic Score with MMNLI You can combine the BLASER semantic score with the class predicted by MMNLI to get a **better understanding of the relationship** between a source sentence and its candidate translations. The NLI class gives the entailment/contradiction/neutral label, while the BLASER score provides a fine-grained semantic similarity score. ```python # !pip install transformers sentencepiece torch -q import torch from transformers import AutoTokenizer, AutoModel from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder # ------------------------- # 1️⃣ Load ported SONAR text encoder # ------------------------- sonar_model_name = "cointegrated/SONAR_200_text_encoder" encoder = M2M100Encoder.from_pretrained(sonar_model_name) tokenizer = AutoTokenizer.from_pretrained(sonar_model_name) def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False): tokenizer.src_lang = lang with torch.inference_mode(): batch = tokenizer(texts, return_tensors='pt', padding=True) seq_embs = encoder(**batch).last_hidden_state mask = batch.attention_mask mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1) if norm: mean_emb = torch.nn.functional.normalize(mean_emb) return mean_emb # ------------------------- # 2️⃣ Example sentences # ------------------------- src_sentence = ["He is happy."] mt_sentences = [ "Il 
est content.", # entailment (BLASER: 4.515) "Il est malheureux." # contradiction (BLASER: 4.41) ] # Encode source and MT sentences src_embs = encode_mean_pool(src_sentence, tokenizer, encoder, lang="eng_Latn") mt_embs = encode_mean_pool(mt_sentences, tokenizer, encoder, lang="fra_Latn") # ------------------------- # 3️⃣ Load MMNLI model # ------------------------- mmnli_model_name = "oist/multimodal_nli_model" mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True) mmnli_model.eval() # ------------------------- # 4️⃣ Load BLASER QE model # ------------------------- qe_model_name = "oist/blaser_2_0_qe_ported" qe_model = AutoModel.from_pretrained(qe_model_name, trust_remote_code=True) qe_model.eval() # ------------------------- # 5️⃣ Run inference # ------------------------- for i, mt_sentence in enumerate(mt_sentences): mt_emb = mt_embs[i].unsqueeze(0) # keep batch dimension # NLI prediction with torch.inference_mode(): logits = mmnli_model(src_embs, mt_emb) pred_class = torch.argmax(logits, dim=-1).item() # BLASER semantic score with torch.inference_mode(): qe_score = qe_model(src_embs, mt_emb) # shape [1, 1] print(f"\nMT sentence: '{mt_sentence}'") print("NLI prediction:", ["Entailment", "Neutral", "Contradiction"][pred_class]) print("BLASER semantic score:", qe_score.item()) ``` --- ## Labels - 0 = Entailment - 1 = Neutral - 2 = Contradiction --- ## Citation If you use this model, please cite: ```bibtex @inproceedings{istaiteh2025beyond, title={Beyond Similarity Scoring: Detecting Entailment and Contradiction in Multilingual and Multimodal Contexts}, author={Istaiteh, Othman and Mdhaffar, Salima and Est{\`e}ve, Yannick}, booktitle={Proc. Interspeech 2025}, pages={286--290}, year={2025} }