Spaces:

kleervoyans
/

evaluator

Sleeping

App Files Files Community

kleervoyans committed on May 5, 2025

Commit

9b88b5f

verified ·

1 Parent(s): 768e15d

Update app.py

Browse files

Files changed (1) hide show

app.py +252 -46

app.py CHANGED Viewed

@@ -1,53 +1,251 @@
-# app.py
 import streamlit as st
 import logging
 import pandas as pd
 import plotly.express as px
-from models.model_manager import ModelManager
-from evaluators.evaluator import TranslationEvaluator
 # ────────── Logging ──────────
 logging.basicConfig(
     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
-    level=logging.INFO
 )
 logger = logging.getLogger(__name__)
-# ────────── Cached Resources ──────────
 @st.cache_resource
 def load_resources():
     """
-    Load and cache the model manager and evaluator on first run.
     """
-    manager   = ModelManager(quantize=True)
-    evaluator = TranslationEvaluator()
-    return manager, evaluator
-# ────────── Sidebar Model Info ──────────
 def display_model_info(info: dict):
     st.sidebar.markdown("### Model Info")
-    st.sidebar.write(f"**Model:** {info.get('model')}")
-    st.sidebar.write(f"**8-bit Quantized:** {info.get('quantized')}")
-    st.sidebar.write(f"**Device:** {info.get('device')}")
-    st.sidebar.write(f"**Default target:** {info.get('default_tgt')}")
-# ────────── Single‐text Processing ──────────
-def process_text(src: str, ref: str, manager: ModelManager, evaluator: TranslationEvaluator, metrics: list):
-    # 1) Translate (auto-detect source, default target Turkish)
-    out = manager.translate(src)  # returns list of dicts
-    hyp = out[0]["translation_text"] if isinstance(out, list) else out["translation_text"]
-    # 2) Evaluate
-    scores = evaluator.evaluate([src], [ref or ""], [hyp])
     return {
         "source":     src,
         "reference":  ref,
         "hypothesis": hyp,
-        **{m: scores[m] for m in metrics}
     }
 def _show_single_results(res: dict):
     left, right = st.columns(2)
     with left:
@@ -60,16 +258,16 @@ def _show_single_results(res: dict):
             st.write(res["reference"])
     with right:
         st.markdown("### Scores")
-        df = pd.DataFrame([{k: v for k, v in res.items() if k in ["BLEU","BERTScore","BERTurk","COMET"]}])
         st.table(df)
-# ────────── Batch‐CSV Processing ──────────
 def process_file(
     uploaded,
-    manager: ModelManager,
-    evaluator: TranslationEvaluator,
-    metrics: list,
-    batch_size: int
 ):
     df = pd.read_csv(uploaded)
     if not {"src", "ref_tr"}.issubset(df.columns):
@@ -81,29 +279,32 @@ def process_file(
         batch = df.iloc[i : i + batch_size]
         srcs = batch["src"].tolist()
         refs = batch["ref_tr"].tolist()
-        # translate batch
-        outs = manager.translate(srcs)  # list of dicts
         hyps = [o["translation_text"] for o in outs]
-        # evaluate each row
         for s, r, h in zip(srcs, refs, hyps):
-            sc = evaluator.evaluate([s], [r], [h])
             entry = {"src": s, "ref_tr": r, "hyp_tr": h}
             entry.update({m: sc[m] for m in metrics})
             results.append(entry)
         prog.progress(min(i + batch_size, total) / total)
     return pd.DataFrame(results)
-def _show_batch_viz(df: pd.DataFrame, metrics: list):
     for m in metrics:
         st.markdown(f"#### {m} Distribution")
         fig = px.histogram(df, x=m)
         st.plotly_chart(fig, use_container_width=True)
-# ────────── Main ──────────
 def main():
-    st.set_page_config(page_title="🔤 Translation→Turkish Quality", layout="wide")
     st.title("🔤 Translation → TR Quality & COMET")
-    st.markdown("Translate any language into Turkish and evaluate with BLEU, BERTScore, BERTurk & COMET.")
     # Sidebar
     with st.sidebar:
@@ -111,11 +312,11 @@ def main():
         metrics = st.multiselect(
             "Select metrics",
             ["BLEU", "BERTScore", "BERTurk", "COMET"],
-            default=["BLEU", "BERTScore", "COMET"]
         )
         batch_size = st.slider("Batch size", 1, 32, 8)
-        manager, evaluator = load_resources()
-        display_model_info(manager.get_info())
     # Tabs
     tab1, tab2 = st.tabs(["Single Sentence", "Batch CSV"])
@@ -125,22 +326,27 @@ def main():
         ref = st.text_area("Turkish reference (optional):", height=100)
         if st.button("Evaluate"):
             with st.spinner("Translating & evaluating…"):
-                res = process_text(src, ref, manager, evaluator, metrics)
             _show_single_results(res)
     with tab2:
-        uploaded = st.file_uploader("Upload CSV with `src` & `ref_tr` columns", type=["csv"])
         if uploaded:
             with st.spinner("Processing file…"):
-                df_res = process_file(uploaded, manager, evaluator, metrics, batch_size)
             st.markdown("### Batch Results")
             st.dataframe(df_res, use_container_width=True)
             _show_batch_viz(df_res, metrics)
-            st.download_button("Download results as CSV", df_res.to_csv(index=False), "results.csv")
 if __name__ == "__main__":
     try:
         main()
     except Exception as e:
         st.error(f"Unexpected error: {e}")
-        logger.exception("Unhandled exception in main()")

 import streamlit as st
 import logging
 import pandas as pd
 import plotly.express as px
+from typing import Union, List
+from langdetect import detect, LangDetectException
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    pipeline,
+    BitsAndBytesConfig,
+)
+import evaluate
 # ────────── Logging ──────────
 # Configure the root logger: INFO level, records formatted as
 # "<timestamp> <level> <logger name>: <message>" with second-resolution timestamps.
 logging.basicConfig(
     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO,
 )
 # Module-level logger used by the classes and entry point in this file.
 logger = logging.getLogger(__name__)
+# ────────── Model Management ──────────
+class ModelManager:
+    """
+    Automatically selects, loads, and wraps a seq2seq translation model
+    in 8-bit (with FP32 fallback), plus language‐code auto-detection.
+    """
+    def __init__(
+        self,
+        candidates: List[str] = None,
+        quantize: bool = True,
+        default_tgt: str = None,
+    ):
+        self.candidates = candidates or [
+            "facebook/nllb-200-distilled-600M",
+            "facebook/m2m100_418M",
+        ]
+        self.quantize = quantize
+        self.default_tgt = default_tgt  # if None → auto-pick Turkish
+        self.tokenizer = None
+        self.model = None
+        self.pipeline = None
+        self.lang_codes: List[str] = []
+        self._select_and_load()
+    def _select_and_load(self):
+        last_err = None
+        for model_name in self.candidates:
+            try:
+                # 1) Load tokenizer
+                logger.info(f"Loading tokenizer for {model_name}")
+                tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+                if not hasattr(tok, "lang_code_to_id"):
+                    raise AttributeError(
+                        f"Tokenizer for {model_name} missing lang_code_to_id"
+                    )
+                # 2) Load model with bitsandbytes 8-bit quantization
+                logger.info(
+                    f"Loading model {model_name} "
+                    f"(8-bit={'on' if self.quantize else 'off'})"
+                )
+                bnb_cfg = BitsAndBytesConfig(load_in_8bit=self.quantize)
+                model = AutoModelForSeq2SeqLM.from_pretrained(
+                    model_name,
+                    device_map="auto",
+                    quantization_config=bnb_cfg,
+                )
+                logger.info(f"Model {model_name} loaded successfully")
+                # 3) Build a translation pipeline around it
+                pipe = pipeline(
+                    "translation",
+                    model=model,
+                    tokenizer=tok,
+                )
+                # 4) On success, store and break
+                self.tokenizer = tok
+                self.model = model
+                self.pipeline = pipe
+                self.lang_codes = list(tok.lang_code_to_id.keys())
+                logger.info(f"Available language codes: {self.lang_codes[:5]}…")
+                # 5) Auto-pick Turkish target if needed
+                if not self.default_tgt:
+                    tur = [
+                        code
+                        for code in self.lang_codes
+                        if code.lower().startswith("tr")
+                    ]
+                    if not tur:
+                        raise ValueError(f"No Turkish code in {model_name}")
+                    self.default_tgt = tur[0]
+                logger.info(f"Default target language: {self.default_tgt}")
+                return
+            except Exception as e:
+                logger.warning(f"Failed to load {model_name}: {e}")
+                last_err = e
+        raise RuntimeError(
+            f"Could not load any model from candidates {self.candidates}: {last_err}"
+        )
+    def translate(
+        self,
+        text: Union[str, List[str]],
+        src_lang: str = None,
+        tgt_lang: str = None,
+    ):
+        """
+        Translate `text` from src_lang → tgt_lang.
+        If src_lang is None: auto-detect via langdetect.
+        If tgt_lang is None: use default_tgt (Turkish).
+        Returns the pipeline output (list of dicts with 'translation_text').
+        """
+        tgt = tgt_lang or self.default_tgt
+        # Auto-detect source
+        if not src_lang:
+            sample = text[0] if isinstance(text, list) else text
+            try:
+                iso = detect(sample).lower()
+                candidates = [
+                    c for c in self.lang_codes if c.lower().startswith(iso)
+                ]
+                if not candidates:
+                    raise LangDetectException(f"No code for ISO '{iso}'")
+                # prefer exact match
+                exact = [c for c in candidates if c.lower() == iso]
+                src = exact[0] if exact else candidates[0]
+                logger.info(f"Auto-detected src_lang={src}")
+            except Exception as e:
+                logger.warning(f"langdetect failed ({e}); defaulting to English")
+                eng = [c for c in self.lang_codes if c.lower().startswith("en")]
+                src = eng[0] if eng else self.lang_codes[0]
+        else:
+            src = src_lang
+        # Call the pipeline with both src_lang and tgt_lang
+        return self.pipeline(text, src_lang=src, tgt_lang=tgt)
+    def get_info(self):
+        """Return metadata for sidebar display."""
+        model = getattr(self.model, "config", None)
+        quantized = getattr(self.model, "is_loaded_in_8bit", False)
+        device = getattr(self.model.device, "index", None)
+        device = f"cuda:{device}" if device is not None else "cpu"
+        return {
+            "model":     self.model.name_or_path,
+            "quantized": quantized,
+            "device":    device,
+            "default_tgt": self.default_tgt,
+        }
+# ────────── Evaluation ──────────
+class TranslationEvaluator:
+    def __init__(self):
+        self.bleu = evaluate.load("bleu")
+        self.bertscore = evaluate.load("bertscore")
+        self.comet = evaluate.load("comet", model_id="unbabel/comet-mqm-qe-da")
+        logging.info("Loaded BLEU, BERTScore, COMET")
+    def evaluate(
+        self,
+        sources: List[str],
+        references: List[str],
+        predictions: List[str],
+    ):
+        results = {}
+        # BLEU
+        results["BLEU"] = self.bleu.compute(
+            predictions=predictions,
+            references=[[r] for r in references],
+        )["bleu"]
+        # BERTScore (general)
+        bs = self.bertscore.compute(
+            predictions=predictions, references=references, lang="xx"
+        )
+        results["BERTScore"] = sum(bs["f1"]) / len(bs["f1"]) if bs["f1"] else 0.0
+        # BERTurk (Turkish)
+        bs_tr = self.bertscore.compute(
+            predictions=predictions, references=references, lang="tr"
+        )
+        results["BERTurk"] = sum(bs_tr["f1"]) / len(bs_tr["f1"]) if bs_tr["f1"] else 0.0
+        # COMET
+        co = self.comet.compute(
+            srcs=sources, hyps=predictions, refs=references
+        )
+        # `scores` may be a float or list
+        score = co.get("scores", None)
+        if isinstance(score, list):
+            results["COMET"] = score[0] if score else 0.0
+        else:
+            results["COMET"] = score or 0.0
+        return results
+# ────────── Streamlit App ──────────
 @st.cache_resource
 def load_resources():
     """
+    Load and cache ModelManager & TranslationEvaluator on first run.
     """
+    mgr = ModelManager(quantize=True)
+    ev  = TranslationEvaluator()
+    return mgr, ev
 def display_model_info(info: dict):
     st.sidebar.markdown("### Model Info")
+    st.sidebar.write(f"**Model:** {info['model']}")
+    st.sidebar.write(f"**8-bit Quantized:** {info['quantized']}")
+    st.sidebar.write(f"**Device:** {info['device']}")
+    st.sidebar.write(f"**Default target:** {info['default_tgt']}")
+def process_text(
+    src: str,
+    ref: str,
+    mgr: ModelManager,
+    ev: TranslationEvaluator,
+    metrics: List[str],
+):
+    out = mgr.translate(src)  # list of dicts
+    hyp = out[0]["translation_text"]
+    scores = ev.evaluate([src], [ref or ""], [hyp])
     return {
         "source":     src,
         "reference":  ref,
         "hypothesis": hyp,
+        **{m: scores[m] for m in metrics},
     }
 def _show_single_results(res: dict):
     left, right = st.columns(2)
     with left:
             st.write(res["reference"])
     with right:
         st.markdown("### Scores")
+        df = pd.DataFrame([{k: v for k, v in res.items() if k in res.keys() and k in ["BLEU","BERTScore","BERTurk","COMET"]}])
         st.table(df)
 def process_file(
     uploaded,
+    mgr: ModelManager,
+    ev: TranslationEvaluator,
+    metrics: List[str],
+    batch_size: int,
 ):
     df = pd.read_csv(uploaded)
     if not {"src", "ref_tr"}.issubset(df.columns):
         batch = df.iloc[i : i + batch_size]
         srcs = batch["src"].tolist()
         refs = batch["ref_tr"].tolist()
+        outs = mgr.translate(srcs)  # batch translation
         hyps = [o["translation_text"] for o in outs]
         for s, r, h in zip(srcs, refs, hyps):
+            sc = ev.evaluate([s], [r], [h])
             entry = {"src": s, "ref_tr": r, "hyp_tr": h}
             entry.update({m: sc[m] for m in metrics})
             results.append(entry)
         prog.progress(min(i + batch_size, total) / total)
     return pd.DataFrame(results)
+def _show_batch_viz(df: pd.DataFrame, metrics: List[str]):
     for m in metrics:
         st.markdown(f"#### {m} Distribution")
         fig = px.histogram(df, x=m)
         st.plotly_chart(fig, use_container_width=True)
 def main():
+    st.set_page_config(
+        page_title="🔤 Translation→Turkish Quality", layout="wide"
+    )
     st.title("🔤 Translation → TR Quality & COMET")
+    st.markdown(
+        "Translate any language into Turkish and evaluate with BLEU, BERTScore, BERTurk & COMET."
+    )
     # Sidebar
     with st.sidebar:
         metrics = st.multiselect(
             "Select metrics",
             ["BLEU", "BERTScore", "BERTurk", "COMET"],
+            default=["BLEU", "BERTScore", "COMET"],
         )
         batch_size = st.slider("Batch size", 1, 32, 8)
+        mgr, ev = load_resources()
+        display_model_info(mgr.get_info())
     # Tabs
     tab1, tab2 = st.tabs(["Single Sentence", "Batch CSV"])
         ref = st.text_area("Turkish reference (optional):", height=100)
         if st.button("Evaluate"):
             with st.spinner("Translating & evaluating…"):
+                res = process_text(src, ref, mgr, ev, metrics)
             _show_single_results(res)
     with tab2:
+        uploaded = st.file_uploader(
+            "Upload CSV with `src` & `ref_tr` columns", type=["csv"]
+        )
         if uploaded:
             with st.spinner("Processing file…"):
+                df_res = process_file(uploaded, mgr, ev, metrics, batch_size)
             st.markdown("### Batch Results")
             st.dataframe(df_res, use_container_width=True)
             _show_batch_viz(df_res, metrics)
+            st.download_button(
+                "Download CSV", df_res.to_csv(index=False), "results.csv"
+            )
 # Script entry point: run the app and treat this as the top-level error
 # boundary — any exception escaping main() is shown in the UI and logged.
 if __name__ == "__main__":
     try:
         main()
     except Exception as e:
         st.error(f"Unexpected error: {e}")
+        logger.exception("Unhandled exception")