Spaces:

Iscte-Sintra
/

User-Interface_v1.0

Running

ErikDaska committed on Jan 16

Commit

1cca777

verified ·

1 Parent(s): f2eb0c4

Update src/streamlit_app.py

Files changed (1) hide show

src/streamlit_app.py CHANGED Viewed

@@ -47,22 +47,27 @@ def instantiate_translation_model(model_name, text, src_lg, tgt_lg):
         return pipe(text)[0]["translation_text"]
     # ---- M2M100 ----
     elif "m2m100" in model_name:
         pipe = load_pipeline("translation", model_path)
-        # Set the source language
         pipe.tokenizer.src_lang = src_lg
-        # M2M100 requires the forced_bos_token_id to be the target lang token
         tgt_lang_id = pipe.tokenizer.convert_tokens_to_ids(tgt_lg)
         result = pipe(
             text,
             forced_bos_token_id=tgt_lang_id
         )
         return result[0]["translation_text"]
     # ---- MBART ----
     else:
         pipe = pipeline(

         return pipe(text)[0]["translation_text"]
     # ---- M2M100 ----
     elif "m2m100" in model_name:
         pipe = load_pipeline("translation", model_path)
+        # 1. Definimos a língua de origem
+        # Em modelos customizados, as vezes o src_lang precisa ser o token completo
         pipe.tokenizer.src_lang = src_lg
+        # 2. Pegamos o ID numérico do token de destino (ex: __pt__)
+        # Usamos convert_tokens_to_ids porque ele ignora a lógica interna de busca de idiomas
         tgt_lang_id = pipe.tokenizer.convert_tokens_to_ids(tgt_lg)
+        if tgt_lang_id == pipe.tokenizer.unk_token_id:
+            st.error(f"Erro: O token {tgt_lg} não foi encontrado no vocabulário do modelo!")
+            return None
+        # 3. Executamos a tradução forçando o ID de início de frase (BOS)
         result = pipe(
             text,
             forced_bos_token_id=tgt_lang_id
         )
         return result[0]["translation_text"]
     # ---- MBART ----
     else:
         pipe = pipeline(