|
|
---
license: cc-by-nc-4.0
language:
- ace
- acm
- acq
- aeb
- af
- ajp
- ak
- am
- apc
- ar
- ars
- ary
- arz
- as
- ast
- awa
- ay
- azb
- azj
- ba
- bm
- ban
- be
- bem
- bn
- bho
- bjn
- bo
- bs
- bug
- bg
- ca
- ceb
- cs
- cjk
- ckb
- crh
- cy
- da
- de
- dik
- dyu
- dz
- el
- en
- eo
- et
- eu
- ee
- fo
- fa
- fj
- fi
- fon
- fr
- fur
- ff
- gd
- ga
- gl
- gn
- gu
- ht
- ha
- he
- hi
- hne
- hr
- hu
- hy
- ig
- ilo
- id
- is
- it
- jv
- ja
- kab
- kac
- kam
- kn
- ks
- ka
- kr
- kk
- kbp
- kea
- km
- ki
- rw
- ky
- kmb
- kg
- ko
- kmr
- lo
- lv
- lij
- li
- ln
- lt
- lmo
- ltg
- lb
- lua
- lg
- luo
- lus
- mag
- mai
- ml
- mr
- min
- mk
- plt
- mt
- mni
- mn
- mos
- mi
- ms
- my
- nl
- nn
- nb
- ne
- nso
- nus
- ny
- oc
- gaz
- ory
- pag
- pa
- pap
- pl
- pt
- prs
- pbt
- qu
- ro
- rn
- ru
- sg
- sa
- sat
- scn
- shn
- si
- sk
- sl
- sm
- sn
- sd
- so
- st
- es
- als
- sc
- sr
- ss
- su
- sv
- sw
- szl
- ta
- tt
- te
- tg
- tl
- th
- ti
- taq
- tpi
- tn
- ts
- tk
- tum
- tr
- tw
- tzm
- ug
- uk
- umb
- ur
- uz
- vec
- vi
- war
- wo
- xh
- yi
- yo
- yue
- zh
- zu
language_details: >-
  ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab,
  aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab,
  asm_Beng, ast_Latn, awa_Deva, ayr_Latn, azb_Arab, azj_Latn, bak_Cyrl,
  bam_Latn, ban_Latn, bel_Cyrl, bem_Latn, ben_Beng, bho_Deva, bjn_Arab,
  bod_Tibt, bos_Latn, bug_Latn, bul_Cyrl, cat_Latn, ceb_Latn, ces_Latn,
  cjk_Latn, ckb_Arab, crh_Latn, cym_Latn, dan_Latn, deu_Latn, dik_Latn,
  dyu_Latn, dzo_Tibt, ell_Grek, eng_Latn, epo_Latn, est_Latn, eus_Latn,
  ewe_Latn, fao_Latn, pes_Arab, fij_Latn, fin_Latn, fon_Latn, fra_Latn,
  fur_Latn, fuv_Latn, gla_Latn, gle_Latn, glg_Latn, grn_Latn, guj_Gujr,
  hat_Latn, hau_Latn, heb_Hebr, hin_Deva, hne_Deva, hrv_Latn, hun_Latn,
  hye_Armn, ibo_Latn, ilo_Latn, ind_Latn, isl_Latn, ita_Latn, jav_Latn,
  jpn_Jpan, kab_Latn, kac_Latn, kam_Latn, kan_Knda, kas_Arab, kas_Deva,
  kat_Geor, knc_Arab, knc_Latn, kaz_Cyrl, kbp_Latn, kea_Latn, khm_Khmr,
  kik_Latn, kin_Latn, kir_Cyrl, kmb_Latn, kon_Latn, kor_Hang, kmr_Latn,
  lao_Laoo, lvs_Latn, lij_Latn, lim_Latn, lin_Latn, lit_Latn, lmo_Latn,
  ltg_Latn, ltz_Latn, lua_Latn, lug_Latn, luo_Latn, lus_Latn, mag_Deva,
  mai_Deva, mal_Mlym, mar_Deva, min_Latn, mkd_Cyrl, plt_Latn, mlt_Latn,
  mni_Beng, khk_Cyrl, mos_Latn, mri_Latn, zsm_Latn, mya_Mymr, nld_Latn,
  nno_Latn, nob_Latn, npi_Deva, nso_Latn, nus_Latn, nya_Latn, oci_Latn,
  gaz_Latn, ory_Orya, pag_Latn, pan_Guru, pap_Latn, pol_Latn, por_Latn,
  prs_Arab, pbt_Arab, quy_Latn, ron_Latn, run_Latn, rus_Cyrl, sag_Latn,
  san_Deva, sat_Beng, scn_Latn, shn_Mymr, sin_Sinh, slk_Latn, slv_Latn,
  smo_Latn, sna_Latn, snd_Arab, som_Latn, sot_Latn, spa_Latn, als_Latn,
  srd_Latn, srp_Cyrl, ssw_Latn, sun_Latn, swe_Latn, swh_Latn, szl_Latn,
  tam_Taml, tat_Cyrl, tel_Telu, tgk_Cyrl, tgl_Latn, tha_Thai, tir_Ethi,
  taq_Latn, taq_Tfng, tpi_Latn, tsn_Latn, tso_Latn, tuk_Latn, tum_Latn,
  tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab,
  uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr,
  yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn
pipeline_tag: sentence-similarity
---
|
|
|
|
|
# Multilingual & Multimodal NLI (MMNLI)

The full details of the MMNLI model, including architecture, training, and evaluation, are described in the paper [Beyond Similarity Scoring: Detecting Entailment and Contradiction in Multilingual and Multimodal Contexts](https://www.isca-speech.org/archive/Interspeech_2025/paper286.pdf) by Istaiteh, O., Mdhaffar, S., & Estève, Y. (Interspeech 2025). Please cite this paper if you use the MMNLI model in your research.

This repository provides the **MMNLI model**, a multilingual and multimodal Natural Language Inference (NLI) classifier.
It extends the BLASER architecture to **multiclass NLI**, classifying premise–hypothesis pairs as entailment, neutral, or contradiction across text–text, text–speech, speech–text, and speech–speech inputs.

The model is trained on the [oist/multimodal_nli_dataset](https://huggingface.co/datasets/oist/multimodal_nli_dataset).
Please refer to that dataset card for details.
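
For a quick look at the training data, you can load it with the `datasets` library. The sketch below assumes the default configuration and a `train` split; check the dataset card for the authoritative layout if loading fails.

```python
# Minimal sketch: inspect the MMNLI training data (split/config names are assumptions).
from datasets import load_dataset

ds = load_dataset("oist/multimodal_nli_dataset")
print(ds)              # available splits and columns
print(ds["train"][0])  # first example, assuming a "train" split exists
```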
|
|
|
|
|
### Results

On the test set of this dataset, the MMNLI model achieves a **micro-averaged F1 score of 0.749**.
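
For this single-label, three-class setup, micro-averaged F1 coincides with plain accuracy. A minimal sketch of how such a score can be computed from predictions (the label lists below are purely illustrative):

```python
# Illustrative only: micro-averaged F1 from gold vs. predicted class indices.
from sklearn.metrics import f1_score

gold = [0, 2, 1, 0, 2]  # hypothetical gold labels (0/1/2 as defined in the Labels section)
pred = [0, 2, 2, 0, 1]  # hypothetical model predictions
print(f1_score(gold, pred, average="micro"))  # equals accuracy in this single-label setting
```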
|
|
|
|
|
---

## Usage

The model takes **SONAR embeddings** as input. You can produce them with the official SONAR encoders for text and speech [from GitHub](https://github.com/facebookresearch/SONAR/tree/main), or with the **ported SONAR text encoder** [`cointegrated/SONAR_200_text_encoder`](https://huggingface.co/cointegrated/SONAR_200_text_encoder).
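
The package names below are assumptions drawn from each project's published install instructions; verify them against the linked repositories before running the examples.

```python
# Official SONAR encoders (Examples 1-2); SONAR also requires fairseq2 -- see its README.
# !pip install sonar-space torch -q
# Ported SONAR text encoder (Examples 3-4)
# !pip install transformers sentencepiece torch -q
```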
|
|
|
|
|
--- |
|
|
|
|
|
### Example 1: Speech–Text Inference |
|
|
|
|
|
```python
import torch
from sonar.inference_pipelines.speech import SpeechToEmbeddingModelPipeline
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
from transformers import AutoModel

# 1. Load SONAR encoders
speech_encoder = SpeechToEmbeddingModelPipeline(encoder="sonar_speech_encoder_eng")
text_encoder = TextToEmbeddingModelPipeline(
    encoder="text_sonar_basic_encoder",
    tokenizer="text_sonar_basic_encoder",
)

# 2. Encode premise (speech) and hypothesis (text)
premise_embs = speech_encoder.predict(["audio.wav"])
hypothesis_embs = text_encoder.predict(["The cat sat on the mat."], source_lang="eng_Latn")

# 3. Load MMNLI model
mmnli_model_name = "oist/multimodal_nli_model"
mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True)
mmnli_model.eval()

# 4. Run inference
with torch.inference_mode():
    logits = mmnli_model(premise_embs, hypothesis_embs)  # returns [batch_size, 3]
    pred_class = torch.argmax(logits, dim=-1).item()

print("Prediction:", pred_class)
# 0 = Entailment, 1 = Neutral, 2 = Contradiction
```
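
The same call covers a speech–speech pair. A minimal sketch continuing from Example 1 (it reuses `speech_encoder` and `mmnli_model` from above; both recordings are assumed to be English, and the file names are placeholders):

```python
# Sketch: speech premise vs. speech hypothesis, continuing from Example 1.
premise_embs = speech_encoder.predict(["premise.wav"])      # placeholder path
hypothesis_embs = speech_encoder.predict(["hypothesis.wav"])  # placeholder path

with torch.inference_mode():
    logits = mmnli_model(premise_embs, hypothesis_embs)
    pred_class = torch.argmax(logits, dim=-1).item()

print("Prediction:", pred_class)  # 0 = Entailment, 1 = Neutral, 2 = Contradiction
```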
|
|
|
|
|
### Example 2: Text–Text Inference (Official SONAR) |
|
|
|
|
|
```python
import torch
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
from transformers import AutoModel

# 1. Load the official SONAR text encoder
text_encoder = TextToEmbeddingModelPipeline(
    encoder="text_sonar_basic_encoder",
    tokenizer="text_sonar_basic_encoder"
)

# 2. Encode premise and hypothesis
premise_texts = ["Le chat s'assit sur le tapis."]
hypothesis_texts = ["The cat sat on the mat."]

premise_embs = text_encoder.predict(premise_texts, source_lang="fra_Latn")
hypothesis_embs = text_encoder.predict(hypothesis_texts, source_lang="eng_Latn")

# 3. Load MMNLI model
mmnli_model = AutoModel.from_pretrained("oist/multimodal_nli_model", trust_remote_code=True)
mmnli_model.eval()

# 4. Run inference
with torch.inference_mode():
    logits = mmnli_model(premise_embs, hypothesis_embs)
    pred_class = torch.argmax(logits, dim=-1).item()

print("Prediction:", pred_class)
# 0 = Entailment, 1 = Neutral, 2 = Contradiction
```
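
To score several premise–hypothesis pairs at once, pass lists of equal length and drop the `.item()` call; the model returns one row of logits per pair. A minimal sketch continuing from Example 2 (the second sentence pair is illustrative):

```python
# Sketch: batched text-text inference, continuing from Example 2.
premise_texts = ["Le chat s'assit sur le tapis.", "Il pleut beaucoup."]
hypothesis_texts = ["The cat sat on the mat.", "The weather is dry."]

premise_embs = text_encoder.predict(premise_texts, source_lang="fra_Latn")
hypothesis_embs = text_encoder.predict(hypothesis_texts, source_lang="eng_Latn")

with torch.inference_mode():
    logits = mmnli_model(premise_embs, hypothesis_embs)    # [batch_size, 3]
    pred_classes = torch.argmax(logits, dim=-1).tolist()   # one class index per pair

print(pred_classes)  # 0 = Entailment, 1 = Neutral, 2 = Contradiction
```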
|
|
|
|
|
### Example 3: Text–Text Inference (Ported SONAR) |
|
|
|
|
|
```python
# !pip install transformers sentencepiece torch -q
import torch
from transformers import AutoTokenizer, AutoModel
from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder

# 1. Load the ported SONAR text encoder
sonar_model_name = "cointegrated/SONAR_200_text_encoder"
encoder = M2M100Encoder.from_pretrained(sonar_model_name)
tokenizer = AutoTokenizer.from_pretrained(sonar_model_name)

def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False):
    """Mean-pool the encoder's last hidden states into one embedding per text."""
    tokenizer.src_lang = lang
    with torch.inference_mode():
        batch = tokenizer(texts, return_tensors='pt', padding=True)
        seq_embs = encoder(**batch).last_hidden_state
        mask = batch.attention_mask
        mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1)
        if norm:
            mean_emb = torch.nn.functional.normalize(mean_emb)
    return mean_emb

# Example sentences
premise_sentences = ["Le chat s'assit sur le tapis."]
hypothesis_sentences = ["The cat sat on the mat."]

# 2. Encode premise and hypothesis
premise_embs = encode_mean_pool(premise_sentences, tokenizer, encoder, lang="fra_Latn")
hypothesis_embs = encode_mean_pool(hypothesis_sentences, tokenizer, encoder, lang="eng_Latn")

# 3. Load MMNLI model
mmnli_model_name = "oist/multimodal_nli_model"
mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True)
mmnli_model.eval()

# 4. Run inference
with torch.inference_mode():
    logits = mmnli_model(premise_embs, hypothesis_embs)  # returns [batch_size, 3]
    pred_class = torch.argmax(logits, dim=-1).item()

print("Prediction:", pred_class)
# 0 = Entailment, 1 = Neutral, 2 = Contradiction
```
|
|
### Example 4: Using BLASER Semantic Score with MMNLI |
|
|
|
|
|
You can combine the BLASER semantic score with the MMNLI prediction to get a **better understanding of the relationship** between a source sentence and a candidate translation: the NLI label identifies entailment, neutrality, or contradiction, while the BLASER score provides a fine-grained measure of semantic similarity.
|
|
|
|
|
```python
# !pip install transformers sentencepiece torch -q
import torch
from transformers import AutoTokenizer, AutoModel
from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder

# 1. Load the ported SONAR text encoder
sonar_model_name = "cointegrated/SONAR_200_text_encoder"
encoder = M2M100Encoder.from_pretrained(sonar_model_name)
tokenizer = AutoTokenizer.from_pretrained(sonar_model_name)

def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False):
    """Mean-pool the encoder's last hidden states into one embedding per text."""
    tokenizer.src_lang = lang
    with torch.inference_mode():
        batch = tokenizer(texts, return_tensors='pt', padding=True)
        seq_embs = encoder(**batch).last_hidden_state
        mask = batch.attention_mask
        mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1)
        if norm:
            mean_emb = torch.nn.functional.normalize(mean_emb)
    return mean_emb

# 2. Example sentences
src_sentence = ["He is happy."]
mt_sentences = [
    "Il est content.",    # entailment, BLASER score: 4.515
    "Il est malheureux."  # contradiction, BLASER score: 4.41
]

# Encode source and MT sentences
src_embs = encode_mean_pool(src_sentence, tokenizer, encoder, lang="eng_Latn")
mt_embs = encode_mean_pool(mt_sentences, tokenizer, encoder, lang="fra_Latn")

# 3. Load MMNLI model
mmnli_model_name = "oist/multimodal_nli_model"
mmnli_model = AutoModel.from_pretrained(mmnli_model_name, trust_remote_code=True)
mmnli_model.eval()

# 4. Load BLASER QE model
qe_model_name = "oist/blaser_2_0_qe_ported"
qe_model = AutoModel.from_pretrained(qe_model_name, trust_remote_code=True)
qe_model.eval()

# 5. Run inference
for i, mt_sentence in enumerate(mt_sentences):
    mt_emb = mt_embs[i].unsqueeze(0)  # keep the batch dimension

    # NLI prediction
    with torch.inference_mode():
        logits = mmnli_model(src_embs, mt_emb)
        pred_class = torch.argmax(logits, dim=-1).item()

    # BLASER semantic score
    with torch.inference_mode():
        qe_score = qe_model(src_embs, mt_emb)  # shape [1, 1]

    print(f"\nMT sentence: '{mt_sentence}'")
    print("NLI prediction:", ["Entailment", "Neutral", "Contradiction"][pred_class])
    print("BLASER semantic score:", qe_score.item())
```
|
|
|
|
|
|
|
|
--- |
|
|
|
|
|
## Labels |
|
|
|
|
|
- 0 = Entailment |
|
|
- 1 = Neutral |
|
|
- 2 = Contradiction |
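
If you prefer human-readable outputs, a small mapping mirroring the list above can be applied to the predicted class index (`pred_class` as computed in the examples):

```python
# Map predicted class indices to the label names listed above.
id2label = {0: "Entailment", 1: "Neutral", 2: "Contradiction"}
print(id2label[pred_class])
```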
|
|
|
|
|
--- |
|
|
|
|
|
## Citation |
|
|
|
|
|
If you use this model, please cite: |
|
|
|
|
|
```bibtex
@inproceedings{istaiteh2025beyond,
  title={Beyond Similarity Scoring: Detecting Entailment and Contradiction in Multilingual and Multimodal Contexts},
  author={Istaiteh, Othman and Mdhaffar, Salima and Est{\`e}ve, Yannick},
  booktitle={Proc. Interspeech 2025},
  pages={286--290},
  year={2025}
}
```