Spaces:
Sleeping
Sleeping
Update models/translation_loader.py
Browse files- models/translation_loader.py +62 -52
models/translation_loader.py
CHANGED
|
@@ -10,51 +10,59 @@ class TranslationLoader:
|
|
| 10 |
self,
|
| 11 |
model_name: str = "facebook/nllb-200-distilled-600M",
|
| 12 |
quantize: bool = True,
|
| 13 |
-
tgt_lang: str =
|
| 14 |
):
|
| 15 |
self.model_name = model_name
|
| 16 |
self.quantize = quantize
|
| 17 |
-
self.default_tgt = tgt_lang
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
self._load_pipeline()
|
| 21 |
-
|
| 22 |
-
# 2) Separately load AutoTokenizer so we can access lang_code_to_id
|
| 23 |
-
try:
|
| 24 |
-
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 25 |
-
# This mapping is used in the HF NLLB examples:
|
| 26 |
-
# tokenizer.lang_code_to_id["fra_Latn"] β token ID :contentReference[oaicite:1]{index=1}
|
| 27 |
-
self.lang_code_to_id = self.tokenizer.lang_code_to_id
|
| 28 |
-
logging.info("Loaded tokenizer.lang_code_to_id mapping")
|
| 29 |
-
except (AttributeError, ValueError):
|
| 30 |
-
# Fallback: some pipelines don't expose it, but the model config does
|
| 31 |
-
self.lang_code_to_id = self.pipeline.model.config.lang_code_to_id
|
| 32 |
-
logging.info("Using model.config.lang_code_to_id mapping")
|
| 33 |
-
|
| 34 |
-
# Precompute list of supported codes
|
| 35 |
-
self.lang_codes = list(self.lang_code_to_id.keys())
|
| 36 |
-
logging.info(f"Supported language codes (sample): {self.lang_codes[:5]}...")
|
| 37 |
-
|
| 38 |
-
def _load_pipeline(self):
|
| 39 |
try:
|
| 40 |
-
|
| 41 |
self.pipeline = pipeline(
|
| 42 |
"translation",
|
| 43 |
model=self.model_name,
|
| 44 |
tokenizer=self.model_name,
|
| 45 |
device_map="auto",
|
| 46 |
-
quantization_config=
|
| 47 |
)
|
| 48 |
-
logging.info(f"Loaded {self.model_name}
|
| 49 |
except Exception as e:
|
| 50 |
-
logging.warning(f"8-bit
|
| 51 |
self.pipeline = pipeline(
|
| 52 |
"translation",
|
| 53 |
model=self.model_name,
|
| 54 |
tokenizer=self.model_name,
|
| 55 |
device_map="auto",
|
| 56 |
)
|
| 57 |
-
logging.info(f"Loaded {self.model_name} in full precision")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
def translate(
|
| 60 |
self,
|
|
@@ -63,42 +71,44 @@ class TranslationLoader:
|
|
| 63 |
tgt_lang: str = None,
|
| 64 |
):
|
| 65 |
"""
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
"""
|
| 70 |
tgt = tgt_lang or self.default_tgt
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
if src_lang
|
|
|
|
|
|
|
| 74 |
sample = text[0] if isinstance(text, list) else text
|
| 75 |
try:
|
| 76 |
iso = detect(sample).lower()
|
| 77 |
-
#
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
logging.
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
#
|
| 90 |
return self.pipeline(text, src_lang=src, tgt_lang=tgt)
|
| 91 |
|
| 92 |
def get_info(self):
|
| 93 |
-
"""
|
| 94 |
-
|
| 95 |
-
""
|
| 96 |
-
|
| 97 |
-
quantized = getattr(model, "is_loaded_in_8bit", False)
|
| 98 |
-
device = getattr(model, "device", "auto")
|
| 99 |
return {
|
| 100 |
"model_name": self.model_name,
|
| 101 |
-
"quantized":
|
| 102 |
"device": str(device),
|
| 103 |
-
"
|
| 104 |
}
|
|
|
|
def __init__(
    self,
    model_name: str = "facebook/nllb-200-distilled-600M",
    quantize: bool = True,
    tgt_lang: str = None,  # if None, we pick the Turkish code automatically
):
    """Load a multilingual translation pipeline and its language-code mapping.

    Args:
        model_name: HF Hub id of a multilingual translation model (NLLB/M2M100).
        quantize: try 8-bit (bitsandbytes) loading first; falls back to full
            precision if that fails.
        tgt_lang: default target language code; when None, a Turkish code is
            auto-selected from the tokenizer's mapping.

    Raises:
        ValueError: tokenizer cannot be loaded, or no Turkish code exists.
        AttributeError: the tokenizer exposes no language-code mapping at all.
    """
    self.model_name = model_name
    self.quantize = quantize
    self.default_tgt = tgt_lang  # may be None until resolved below

    # --- Load the translation pipeline ------------------------------------
    try:
        bnb_cfg = BitsAndBytesConfig(load_in_8bit=self.quantize)
        self.pipeline = pipeline(
            "translation",
            model=self.model_name,
            tokenizer=self.model_name,
            device_map="auto",
            quantization_config=bnb_cfg,
        )
        logging.info(f"Loaded `{self.model_name}` with 8-bit={self.quantize}")
    except Exception as e:
        # bitsandbytes is unavailable on CPU-only hosts; degrade gracefully.
        logging.warning(f"8-bit load failed ({e}); falling back to full-precision")
        self.pipeline = pipeline(
            "translation",
            model=self.model_name,
            tokenizer=self.model_name,
            device_map="auto",
        )
        logging.info(f"Loaded `{self.model_name}` in full precision")

    # --- Load tokenizer & grab the lang_code_to_id mapping -----------------
    try:
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        logging.info(f"Tokenizer loaded for {self.model_name}")
    except Exception as e:
        logging.error(f"Cannot load tokenizer for {self.model_name}: {e}")
        raise ValueError(f"Failed to load tokenizer: {e}") from e

    if hasattr(self.tokenizer, "lang_code_to_id"):
        self.lang_code_to_id = self.tokenizer.lang_code_to_id
        logging.info("Using tokenizer.lang_code_to_id mapping")
    else:
        # BUGFIX: tokenizers have no `.config` attribute, so the old
        # `self.tokenizer.config.to_dict()` raised the wrong AttributeError.
        # Newer transformers drop `lang_code_to_id`; rebuild the mapping from
        # the language codes kept as additional special tokens instead.
        specials = getattr(self.tokenizer, "additional_special_tokens", None) or []
        rebuilt = {t: self.tokenizer.convert_tokens_to_ids(t) for t in specials}
        if rebuilt:
            self.lang_code_to_id = rebuilt
            logging.info("Rebuilt lang_code_to_id from additional_special_tokens")
        else:
            raise AttributeError(
                f"Model `{self.model_name}`'s tokenizer has no `lang_code_to_id`. "
                "Use a model like NLLB-200 or M2M100 that supports language codes."
            )

    # --- Auto-pick the Turkish target code if none was provided ------------
    if self.default_tgt is None:
        # BUGFIX: NLLB's Turkish code is `tur_Latn`, which does NOT start with
        # "tr" — the old prefix test could never match it. Accept both the
        # ISO-639-1 ("tr") and ISO-639-3 ("tur") spellings.
        tur = [
            c for c in self.lang_code_to_id
            if c.lower().startswith(("tr", "tur"))
        ]
        if not tur:
            raise ValueError(f"No Turkish code found in mapping for {self.model_name}")
        self.default_tgt = tur[0]
        logging.info(f"Default target set to `{self.default_tgt}`")
| 67 |
def translate(
    self,
    text,
    src_lang: str = None,
    tgt_lang: str = None,
):
    """Translate `text` (a str or list of str) and return the pipeline output.

    - Auto-detects src_lang via langdetect if not given
    - Uses default_tgt if tgt_lang is not passed
    - Returns pipeline output (list of dicts with 'translation_text')
    """
    tgt = tgt_lang or self.default_tgt

    # --- Source-language auto-detection ------------------------------------
    if src_lang:
        src = src_lang
    else:
        sample = text[0] if isinstance(text, list) else text
        try:
            iso = detect(sample).lower()
            # find codes starting with that ISO (e.g. "en" -> ["en", "eng_Latn", ...])
            cand = [c for c in self.lang_code_to_id if c.lower().startswith(iso)]
            if not cand:
                # NOTE(review): langdetect's LangDetectException takes
                # (code, message) — a single arg likely raises TypeError here,
                # which the same `except Exception` below still catches, so the
                # English fallback is reached either way. Confirm intent.
                raise LangDetectException(f"No mapping for ISO '{iso}'")
            # prefer exact match, else first candidate
            exact = [c for c in cand if c.lower() == iso]
            src = exact[0] if exact else cand[0]
            logging.info(f"Detected src_lang={src} from ISO='{iso}'")
        except Exception as e:
            logging.warning(f"Language auto-detect failed ({e}); defaulting to English")
            eng = [c for c in self.lang_code_to_id if c.lower().startswith("en")]
            src = eng[0] if eng else list(self.lang_code_to_id)[0]
            logging.info(f"Fallback src_lang={src}")

    # --- Perform translation call -------------------------------------------
    return self.pipeline(text, src_lang=src, tgt_lang=tgt)
| 104 |
def get_info(self):
    """Return model metadata (name, quantization flag, device, default target)
    for display in the sidebar.

    Attribute lookups are defensive (`getattr` with defaults) because the
    pipeline may have loaded in either 8-bit or full precision.
    """
    mdl = getattr(self.pipeline, "model", None)
    quantized = getattr(mdl, "is_loaded_in_8bit", False)
    device = getattr(mdl, "device", "auto")
    return {
        "model_name": self.model_name,
        "quantized": quantized,
        "device": str(device),
        "default_target": self.default_tgt,
    }