Spaces:

kleervoyans
/

evaluator

Sleeping

App Files Files Community

kleervoyans committed on May 6, 2025

Commit

dc2f97b

verified ·

1 Parent(s): 24c7801

Update app.py

Browse files

Files changed (1) hide show

app.py +169 -196

app.py CHANGED Viewed

@@ -1,10 +1,13 @@
 # app.py
 import streamlit as st
 import logging
 import torch
 import pandas as pd
 import plotly.express as px
 from typing import Union, List
 from langdetect import detect, LangDetectException
@@ -16,11 +19,29 @@ from transformers import (
 )
 import evaluate
 # ────────── Logging ──────────
 logging.basicConfig(
     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
-    level=logging.INFO,
 )
 logger = logging.getLogger(__name__)
@@ -28,9 +49,8 @@ logger = logging.getLogger(__name__)
 # ────────── Model Manager ──────────
 class ModelManager:
     """
-    Select & load the best translation model from a candidate list,
-    using 8-bit quant if CUDA is available, else full-precision.
-    Auto-picks Turkish target code.
     """
     def __init__(
         self,
@@ -38,75 +58,62 @@ class ModelManager:
         quantize: bool = True,
         default_tgt: str = None,
     ):
-        # disable 8-bit if no GPU
         if quantize and not torch.cuda.is_available():
             logger.warning("CUDA unavailable; disabling 8-bit quantization")
             quantize = False
-        self.quantize = quantize
-        self.candidates   = candidates or [
             "facebook/nllb-200-distilled-600M",
-            "facebook/m2m100_418M",
         ]
-        self.default_tgt  = default_tgt  # will auto-pick if None
-        self.model_name   = None
-        self.tokenizer    = None
-        self.model        = None
-        self.pipeline     = None
-        self.lang_codes   = []
-        self._select_and_load()
-    def _select_and_load(self):
         last_err = None
         for name in self.candidates:
             try:
-                # 1) tokenizer
                 logger.info(f"Loading tokenizer for {name}")
                 tok = AutoTokenizer.from_pretrained(name, use_fast=True)
                 if not hasattr(tok, "lang_code_to_id"):
-                    raise AttributeError("no lang_code_to_id on tokenizer")
-                # 2) model
                 logger.info(f"Loading model {name} (8-bit={self.quantize})")
                 if self.quantize:
-                    bnb_cfg = BitsAndBytesConfig(load_in_8bit=True)
                     mdl = AutoModelForSeq2SeqLM.from_pretrained(
-                        name,
-                        device_map="auto",
-                        quantization_config=bnb_cfg,
                     )
                 else:
                     mdl = AutoModelForSeq2SeqLM.from_pretrained(
-                        name,
-                        device_map="auto",
                     )
-                logger.info(f"Loaded {name}")
-                # 3) pipeline
                 pipe = pipeline("translation", model=mdl, tokenizer=tok)
-                # store
                 self.model_name = name
                 self.tokenizer  = tok
                 self.model      = mdl
                 self.pipeline   = pipe
                 self.lang_codes = list(tok.lang_code_to_id.keys())
-                # pick Turkish code if needed
                 if not self.default_tgt:
                     tur = [c for c in self.lang_codes if c.lower().startswith("tr")]
                     if not tur:
-                        raise ValueError("No Turkish code available")
                     self.default_tgt = tur[0]
-                logger.info(f"default_tgt = {self.default_tgt}")
                 return
             except Exception as e:
-                logger.warning(f"failed to load {name}: {e}")
                 last_err = e
-        raise RuntimeError(f"no model loaded: {last_err}")
     def translate(
         self,
@@ -115,30 +122,27 @@ class ModelManager:
         tgt_lang: str = None,
     ):
         tgt = tgt_lang or self.default_tgt
-        # auto-detect source
         if not src_lang:
             sample = text[0] if isinstance(text, list) else text
             try:
                 iso = detect(sample).lower()
-                cand = [c for c in self.lang_codes if c.lower().startswith(iso)]
-                if not cand:
-                    raise LangDetectException(f"No code for ISO '{iso}'")
-                # exact or first
-                exact = [c for c in cand if c.lower() == iso]
-                src = exact[0] if exact else cand[0]
-                logger.info(f"src_lang = {src}")
             except Exception:
                 eng = [c for c in self.lang_codes if c.lower().startswith("en")]
                 src = eng[0] if eng else self.lang_codes[0]
-                logger.warning(f"defaulting src_lang = {src}")
         else:
             src = src_lang
         return self.pipeline(text, src_lang=src, tgt_lang=tgt)
     def get_info(self):
-        # figure out device for display
         dev = "cpu"
         if torch.cuda.is_available() and hasattr(self.model, "device"):
             d = self.model.device
@@ -161,39 +165,30 @@ class TranslationEvaluator:
     def evaluate(
         self,
-        sources: List[str],
-        references: List[str],
-        predictions: List[str],
     ):
-        results = {}
         # BLEU
-        bleu_r = self.bleu.compute(predictions=predictions, references=[[r] for r in references])
-        results["BLEU"] = float(bleu_r.get("bleu", 0.0))
-        # BERTScore (xx)
-        bs = self.bertscore.compute(predictions=predictions, references=references, lang="xx")
         f1 = bs.get("f1", [])
-        results["BERTScore"] = float(sum(f1) / len(f1)) if f1 else 0.0
-        # BERTurk (tr)
-        bs_tr = self.bertscore.compute(predictions=predictions, references=references, lang="tr")
-        f1t  = bs_tr.get("f1", [])
-        results["BERTurk"] = float(sum(f1t) / len(f1t)) if f1t else 0.0
         # COMET
-        cm = self.comet.compute(srcs=sources, hyps=predictions, refs=references)
-        sc = cm.get("scores", None)
-        if isinstance(sc, list):
-            results["COMET"] = float(sc[0]) if sc else 0.0
-        else:
-            results["COMET"] = float(sc or 0.0)
-        return results
 # ────────── Streamlit App ──────────
 @st.cache_resource
 def load_resources():
     mgr = ModelManager(quantize=True)
@@ -203,142 +198,120 @@ def load_resources():
 def display_model_info(info: dict):
     st.sidebar.markdown("### Model Info")
-    st.sidebar.write(f"**Model:** {info['model']}")
-    st.sidebar.write(f"**8-bit Quantized:** {info['quantized']}")
-    st.sidebar.write(f"**Device:** {info['device']}")
-    st.sidebar.write(f"**Default target:** {info['default_tgt']}")
-def process_text(
-    src: str,
-    ref: str,
-    mgr: ModelManager,
-    ev: TranslationEvaluator,
-    metrics: List[str],
-):
-    # 1) translate
-    out = mgr.translate(src)  # list of dicts
     hyp = out[0]["translation_text"]
-    # 2) if we have a non-blank reference → compute metrics; else all Nones
-    result = {
-        "source":    src,
-        "reference": ref or None,
-        "hypothesis": hyp,
-    }
     if ref and ref.strip():
         scores = ev.evaluate([src], [ref], [hyp])
-        for m in metrics:
-            result[m] = scores.get(m, 0.0)
-    else:
-        for m in metrics:
-            result[m] = None
-    return result
-def show_single_results(res: dict, metrics: List[str]):
-    left, right = st.columns(2)
-    with left:
-        st.markdown("**Source:**");            st.write(res["source"])
-        st.markdown("**Hypothesis (TR):**");   st.write(res["hypothesis"])
-        if res["reference"]:
-            st.markdown("**Reference (TR):**"); st.write(res["reference"])
-    with right:
-        st.markdown("### Scores")
-        df = pd.DataFrame([{m: res[m] for m in metrics}])
-        df = df.replace({None: "N/A"})
-        st.table(df)
-def process_file(
-    uploaded,
-    mgr: ModelManager,
-    ev: TranslationEvaluator,
-    metrics: List[str],
-    batch_size: int,
-):
-    df = pd.read_csv(uploaded)
-    if not {"src", "ref_tr"}.issubset(df.columns):
-        raise ValueError("CSV must have `src` and `ref_tr` columns")
-    prog = st.progress(0)
-    results = []
-    total = len(df)
-    for i in range(0, total, batch_size):
-        batch = df.iloc[i : i + batch_size]
-        srcs, refs = batch["src"].tolist(), batch["ref_tr"].tolist()
-        outs = mgr.translate(srcs)
-        hyps = [o["translation_text"] for o in outs]
-        for s, r, h in zip(srcs, refs, hyps):
-            entry = {"src": s, "ref_tr": r, "hyp_tr": h}
-            if r and str(r).strip():
-                sc = ev.evaluate([s], [r], [h])
-                for m in metrics:
-                    entry[m] = sc.get(m, 0.0)
-            else:
-                for m in metrics:
-                    entry[m] = None
-            results.append(entry)
-        prog.progress(min(i + batch_size, total) / total)
-    return pd.DataFrame(results)
-def show_batch_viz(df: pd.DataFrame, metrics: List[str]):
-    for m in metrics:
-        st.markdown(f"#### {m} Distribution")
-        if df[m].dropna().empty:
-            st.write("No reference provided, so this metric is N/A.")
-            continue
-        fig = px.histogram(df, x=m)
-        st.plotly_chart(fig, use_container_width=True)
 def main():
-    st.set_page_config(page_title="🔤 Translation→Turkish Quality", layout="wide")
-    st.title("🔤 Translation → TR Quality & COMET")
-    st.markdown("Translate any language into Turkish and evaluate (optional) with BLEU, BERTScore, BERTurk & COMET.")
-    # Sidebar
     with st.sidebar:
         st.header("Settings")
-        metrics    = st.multiselect(
-            "Select metrics",
-            ["BLEU", "BERTScore", "BERTurk", "COMET"],
-            default=["BLEU", "BERTScore", "COMET"]
         )
         batch_size = st.slider("Batch size", 1, 32, 8)
-        mgr, ev    = load_resources()
-        display_model_info(mgr.get_info())
-    # Tabs
-    tab1, tab2 = st.tabs(["Single Sentence", "Batch CSV"])
     with tab1:
-        src = st.text_area("Source sentence (any language):", height=150)
-        ref = st.text_area("Turkish reference (optional):", height=100)
-        if st.button("Evaluate"):
-            with st.spinner("Translating & evaluating…"):
-                res = process_text(src, ref, mgr, ev, metrics)
-            show_single_results(res, metrics)
     with tab2:
-        uploaded = st.file_uploader("Upload CSV with `src` & `ref_tr` columns", type=["csv"])
         if uploaded:
-            with st.spinner("Processing file…"):
-                df_res = process_file(uploaded, mgr, ev, metrics, batch_size)
-            st.markdown("### Batch Results")
-            st.dataframe(df_res, use_container_width=True)
-            show_batch_viz(df_res, metrics)
-            st.download_button("Download results as CSV", df_res.to_csv(index=False), "results.csv")
-if __name__ == "__main__":
-    try:
-        main()
-    except Exception as e:
-        st.error(f"Unexpected error: {e}")
-        logger.exception("Unhandled exception")

 # app.py
 import streamlit as st
+import streamlit.components.v1 as components
 import logging
 import torch
 import pandas as pd
 import plotly.express as px
+import time
+import difflib
 from typing import Union, List
 from langdetect import detect, LangDetectException
 )
 import evaluate
+# ────────── Global CSS (injected once at import time) ──────────
+st.markdown(  # NOTE(review): runs at module level, i.e. before main() calls st.set_page_config; Streamlit releases that require set_page_config to be the first command would reject this ordering — confirm against the deployed version
+    """
+    <style>
+      /* Page */
+      .main .block-container { max-width: 900px; padding: 1rem 2rem; }
+      /* Buttons */
+      .stButton>button { background-color: #4A90E2; color: white; border-radius: 4px; }
+      .stButton>button:hover { background-color: #357ABD; }
+      /* Text areas */
+      textarea { border-radius: 4px; }
+      /* Tables */
+      .stTable table { border-radius: 4px; overflow: hidden; }
+    </style>
+    """,
+    unsafe_allow_html=True  # required so the raw <style> tag is not escaped
+)
 # ────────── Logging ──────────
 logging.basicConfig(
     format="%(asctime)s %(levelname)s %(name)s: %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
+    level=logging.INFO
 )
 logger = logging.getLogger(__name__)
 # ────────── Model Manager ──────────
 class ModelManager:
     """
+    Selects & loads NLLB‐200 or M2M100 (8‐bit if GPU available).
+    Exposes `translate()` with auto‐lang detection + dynamic tgt_lang.
     """
     def __init__(
         self,
         quantize: bool = True,
         default_tgt: str = None,
     ):
         if quantize and not torch.cuda.is_available():
             logger.warning("CUDA unavailable; disabling 8-bit quantization")
             quantize = False
+        self.quantize    = quantize
+        self.candidates  = candidates or [
             "facebook/nllb-200-distilled-600M",
+            "facebook/m2m100_418M"
         ]
+        self.default_tgt = default_tgt
+        self.model_name = None
+        self.tokenizer  = None
+        self.model      = None
+        self.pipeline   = None
+        self.lang_codes = []
+        self._load_best()
+    def _load_best(self):
         last_err = None
         for name in self.candidates:
             try:
+                # 1) Tokenizer
                 logger.info(f"Loading tokenizer for {name}")
                 tok = AutoTokenizer.from_pretrained(name, use_fast=True)
                 if not hasattr(tok, "lang_code_to_id"):
+                    raise AttributeError("no lang_code_to_id")
+                # 2) Model (8-bit if configured)
                 logger.info(f"Loading model {name} (8-bit={self.quantize})")
                 if self.quantize:
+                    bnb = BitsAndBytesConfig(load_in_8bit=True)
                     mdl = AutoModelForSeq2SeqLM.from_pretrained(
+                        name, device_map="auto", quantization_config=bnb
                     )
                 else:
                     mdl = AutoModelForSeq2SeqLM.from_pretrained(
+                        name, device_map="auto"
                     )
+                # 3) Pipeline
                 pipe = pipeline("translation", model=mdl, tokenizer=tok)
+                # Store
                 self.model_name = name
                 self.tokenizer  = tok
                 self.model      = mdl
                 self.pipeline   = pipe
                 self.lang_codes = list(tok.lang_code_to_id.keys())
+                # Auto‐pick Turkish if needed
                 if not self.default_tgt:
                     tur = [c for c in self.lang_codes if c.lower().startswith("tr")]
                     if not tur:
+                        raise ValueError("No Turkish code found")
                     self.default_tgt = tur[0]
+                logger.info(f"Default target = {self.default_tgt}")
                 return
             except Exception as e:
+                logger.warning(f"Failed to load {name}: {e}")
                 last_err = e
+        raise RuntimeError(f"No model loaded: {last_err}")
     def translate(
         self,
         tgt_lang: str = None,
     ):
         tgt = tgt_lang or self.default_tgt
+        # auto‐detect source if missing
         if not src_lang:
             sample = text[0] if isinstance(text, list) else text
             try:
                 iso = detect(sample).lower()
+                cands = [c for c in self.lang_codes if c.lower().startswith(iso)]
+                if not cands: raise LangDetectException()
+                exact = [c for c in cands if c.lower() == iso]
+                src = exact[0] if exact else cands[0]
+                logger.info(f"Detected src_lang={src}")
             except Exception:
+                # fallback to English
                 eng = [c for c in self.lang_codes if c.lower().startswith("en")]
                 src = eng[0] if eng else self.lang_codes[0]
+                logger.warning(f"Falling back src_lang={src}")
         else:
             src = src_lang
         return self.pipeline(text, src_lang=src, tgt_lang=tgt)
     def get_info(self):
         dev = "cpu"
         if torch.cuda.is_available() and hasattr(self.model, "device"):
             d = self.model.device
     def evaluate(
         self,
+        srcs: List[str],
+        refs: List[str],
+        hyps: List[str],
     ):
+        out = {}
         # BLEU
+        b = self.bleu.compute(predictions=hyps, references=[[r] for r in refs])
+        out["BLEU"] = float(b.get("bleu", 0.0))
+        # BERTScore xx
+        bs = self.bertscore.compute(predictions=hyps, references=refs, lang="xx")
         f1 = bs.get("f1", [])
+        out["BERTScore"] = float(sum(f1)/len(f1)) if f1 else 0.0
+        # BERTurk tr
+        bt = self.bertscore.compute(predictions=hyps, references=refs, lang="tr")
+        f2 = bt.get("f1", [])
+        out["BERTurk"] = float(sum(f2)/len(f2)) if f2 else 0.0
         # COMET
+        cm = self.comet.compute(srcs=srcs, hyps=hyps, refs=refs)
+        sc = cm.get("scores")
+        out["COMET"] = float(sc[0] if isinstance(sc, list) else sc or 0.0)
+        return out
 # ────────── Streamlit App ──────────
 @st.cache_resource
 def load_resources():
     mgr = ModelManager(quantize=True)
 def display_model_info(info: dict):
     st.sidebar.markdown("### Model Info")
+    st.sidebar.write(f"• Model: **{info['model']}**")
+    st.sidebar.write(f"• Quantized: **{info['quantized']}**")
+    st.sidebar.write(f"• Device: **{info['device']}**")
+def process_and_stream(src, ref, tgt, mgr, ev, metrics):
+    # 1) call pipeline
+    out = mgr.translate(src, tgt_lang=tgt)
     hyp = out[0]["translation_text"]
+    # 2) pseudo‐stream: reveal word by word
+    placeholder = st.empty()
+    text_acc = ""
+    for w in hyp.split():
+        text_acc += w + " "
+        placeholder.markdown(f"**Hypothesis ({tgt}):**  {text_acc}")
+        time.sleep(0.05)
+    # 3) metrics (only if ref given)
+    scores = {}
     if ref and ref.strip():
         scores = ev.evaluate([src], [ref], [hyp])
+    return hyp, scores
+def show_diff(ref, hyp):
+    # side‐by‐side HTML diff
+    differ = difflib.HtmlDiff(tabsize=4, wrapcolumn=60)
+    html = differ.make_table(
+        ref.split(), hyp.split(),
+        fromdesc="Reference", todesc="Hypothesis",
+        context=True, numlines=1
+    )
+    components.html(html, height=200, scrolling=True)
 def main():
+    st.set_page_config(page_title="🔤 Multi‐Lang ↑TR + Eval", layout="wide")
+    st.title("🌐 Translate → 🔠 Turkish & Evaluate")
+    st.write("Choose target, translate from any language, and (optionally) eval against a reference.")
+    # Sidebar: load models & then dynamic tgt dropdown
     with st.sidebar:
         st.header("Settings")
+        mgr, ev = load_resources()
+        info = mgr.get_info()
+        display_model_info(info)
+        tgt = st.selectbox(
+            "Target language code",
+            options=mgr.lang_codes,
+            index=mgr.lang_codes.index(info["default_tgt"])
+        )
+        metrics = st.multiselect(
+            "Metrics",
+            ["BLEU","BERTScore","BERTurk","COMET"],
+            default=["BLEU","BERTScore","COMET"]
         )
         batch_size = st.slider("Batch size", 1, 32, 8)
+    tab1, tab2 = st.tabs(["Single sentence","Batch CSV"])
     with tab1:
+        src = st.text_area("Source sentence:", height=120)
+        ref = st.text_area("Turkish reference (optional):", height=80)
+        if st.button("Translate & Eval"):
+            with st.spinner("Working…"):
+                hyp, scores = process_and_stream(src, ref, tgt, mgr, ev, metrics)
+            # show scores
+            df = {m: (scores.get(m) if ref.strip() else None) for m in metrics}
+            st.markdown("### Scores")
+            st.table(pd.DataFrame([df]).replace({None:"N/A"}))
+            # diff
+            if ref.strip():
+                st.markdown("### Diff view")
+                show_diff(ref, hyp)
     with tab2:
+        uploaded = st.file_uploader("Upload CSV with `src`,`ref_tr`", type=["csv"])
         if uploaded:
+            df = pd.read_csv(uploaded)
+            if not {"src","ref_tr"}.issubset(df):
+                st.error("CSV needs `src` and `ref_tr` columns.")
+            else:
+                with st.spinner("Batch translating…"):
+                    out_rows = []
+                    prog = st.progress(0)
+                    for i in range(0, len(df), batch_size):
+                        batch = df.iloc[i : i+batch_size]
+                        srcs, refs = batch["src"].tolist(), batch["ref_tr"].tolist()
+                        outs = mgr.translate(srcs, tgt_lang=tgt)
+                        hyps = [o["translation_text"] for o in outs]
+                        for s, r, h in zip(srcs, refs, hyps):
+                            row = {"src":s, "ref_tr":r, "hyp_tr":h}
+                            if r.strip():
+                                sc = ev.evaluate([s],[r],[h])
+                                for m in metrics: row[m] = sc[m]
+                            else:
+                                for m in metrics: row[m] = None
+                            out_rows.append(row)
+                        prog.progress(min(i+batch_size,len(df))/len(df))
+                    res_df = pd.DataFrame(out_rows)
+                st.markdown("### Batch Results")
+                st.dataframe(res_df, use_container_width=True)
+                # viz
+                for m in metrics:
+                    st.markdown(f"#### {m} Histogram")
+                    col = res_df[m].dropna()
+                    if col.empty:
+                        st.write("No valid refs → metric N/A.")
+                    else:
+                        fig = px.histogram(res_df, x=m)
+                        st.plotly_chart(fig, use_container_width=True)
+                st.download_button("Download CSV", res_df.to_csv(index=False), "results.csv")
+if __name__=="__main__":
+    main()