Spaces:

Dusit-P
/

Thai-Sentiment-GUI

Sleeping

App Files Files Community

Dusit-P committed on Oct 3, 2025

Commit

c1fbd91

verified ·

1 Parent(s): c6bf28b

Update app.py

Browse files

Files changed (1) hide show

app.py +186 -186

app.py CHANGED Viewed

@@ -1,186 +1,186 @@
-import os, json, importlib.util, tempfile, traceback, torch, re, math
-import torch.nn as nn
-import torch.nn.functional as F
-import gradio as gr
-import pandas as pd
-import plotly.graph_objects as go
-from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file
-from transformers import AutoTokenizer, AutoModel
-# ===== Settings =====
-REPO_ID       = os.getenv("REPO_ID", "Dusit-P/thai-sentiment-wcb")
-DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "WCB")   # default model
-HF_TOKEN      = os.getenv("HF_TOKEN", None)
-# ---- theme colors ----
-NEG_COLOR = "#F87171"   # red-400
-POS_COLOR = "#34D399"   # emerald-400
-TEMPLATE  = "plotly_white"
-CACHE = {}
-# ---------- load models from common/models.py ----------
-def _import_models():
-    if "models_module" in CACHE:
-        return CACHE["models_module"]
-    models_py = hf_hub_download(REPO_ID, filename="common/models.py", token=HF_TOKEN)
-    spec = importlib.util.spec_from_file_location("models", models_py)
-    mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
-    CACHE["models_module"] = mod
-    return mod
-def load_model(model_name: str):
-    key = f"model:{model_name}"
-    if key in CACHE:
-        return CACHE[key]
-    cfg_path = hf_hub_download(REPO_ID, filename=f"{model_name}/config.json", token=HF_TOKEN)
-    w_path   = hf_hub_download(REPO_ID, filename=f"{model_name}/model.safetensors", token=HF_TOKEN)
-    with open(cfg_path, "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-    base_model = cfg.get("base_model", "airesearch/wangchanberta-base-att-spm-uncased")
-    arch_name  = cfg.get("architecture", model_name)
-    tok = AutoTokenizer.from_pretrained(base_model)
-    models = _import_models()
-    model = models._build(arch_name, base_model, int(cfg.get("num_labels",2)), cfg.get("pooling_after_lstm","masked_mean"))
-    state = load_file(w_path)
-    model.load_state_dict(state, strict=False)
-    model.eval()
-    CACHE[key] = (model, tok, cfg)
-    return CACHE[key]
-# ---------- helpers ----------
-def _format_pct(x: float) -> str:
-    return f"{x*100:.2f}%"
-_INVALID_STRINGS = {"-", "--", "—", "n/a", "na", "null", "none", "nan", ".", "…", ""}
-_RE_HAS_LETTER = re.compile(r"[ก-๙A-Za-z]")
-def _norm_text(v) -> str:
-    if v is None: return ""
-    if isinstance(v, float) and math.isnan(v): return ""
-    return str(v).strip()
-def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
-    if not s: return False
-    if s.lower() in _INVALID_STRINGS: return False
-    if not _RE_HAS_LETTER.search(s): return False
-    if len(s.replace(" ", "")) < min_chars: return False
-    return True
-def _clean_texts(texts):
-    all_norm = [_norm_text(t) for t in texts]
-    cleaned = [t for t in all_norm if _is_substantive_text(t)]
-    skipped = len(all_norm) - len(cleaned)
-    return cleaned, skipped
-def _make_figures(df: pd.DataFrame):
-    total = len(df)
-    neg = int((df["label"] == "negative").sum())
-    pos = int((df["label"] == "positive").sum())
-    neg_avg = pd.to_numeric(df["negative(%)"].str.rstrip("%"), errors="coerce").mean()
-    pos_avg = pd.to_numeric(df["positive(%)"].str.rstrip("%"), errors="coerce").mean()
-    info = (
-        f"**Summary**  \n"
-        f"- Total: {total}  \n"
-        f"- Negative: {neg}  \n"
-        f"- Positive: {pos}  \n"
-        f"- Avg negative: {neg_avg:.2f}%  \n"
-        f"- Avg positive: {pos_avg:.2f}%"
-    )
-    fig_bar = go.Figure()
-    fig_bar.add_bar(name="negative", x=["negative"], y=[neg], marker_color=NEG_COLOR)
-    fig_bar.add_bar(name="positive", x=["positive"], y=[pos], marker_color=POS_COLOR)
-    fig_bar.update_layout(barmode="group", title="Label counts", template=TEMPLATE)
-    fig_pie = go.Figure(go.Pie(
-        labels=["negative", "positive"],
-        values=[neg, pos],
-        hole=0.35,
-        sort=False,
-        marker=dict(colors=[NEG_COLOR, POS_COLOR])
-    ))
-    fig_pie.update_layout(title="Label share", template=TEMPLATE)
-    return fig_bar, fig_pie, info
-# ---------- core prediction ----------
-def _predict_batch(texts, model_name, batch_size=32):
-    model, tok, cfg = load_model(model_name)
-    results = []
-    for i in range(0, len(texts), batch_size):
-        chunk = texts[i:i+batch_size]
-        enc = tok(chunk, padding=True, truncation=True,
-                  max_length=cfg.get("max_length",128), return_tensors="pt")
-        with torch.no_grad():
-            logits = model(enc["input_ids"], enc["attention_mask"])
-            probs = F.softmax(logits, dim=1).cpu().numpy()
-        for txt, p in zip(chunk, probs):
-            neg, pos = float(p[0]), float(p[1])
-            label = "positive" if pos >= neg else "negative"
-            results.append({
-                "review": txt,
-                "negative(%)": _format_pct(neg),
-                "positive(%)": _format_pct(pos),
-                "label": label,
-            })
-    return results
-def predict_one(text: str, model_choice: str):
-    s = _norm_text(text)
-    if not _is_substantive_text(s):
-        return {"negative": 0.0, "positive": 0.0}, "invalid"
-    out = _predict_batch([s], model_choice)[0]
-    probs = {
-        "negative": float(out["negative(%)"].rstrip("%"))/100.0,
-        "positive": float(out["positive(%)"].rstrip("%"))/100.0,
-    }
-    return probs, out["label"]
-def predict_many(text_block: str, model_choice: str):
-    raw_lines = (text_block or "").splitlines()
-    cleaned, skipped = _clean_texts(raw_lines)
-    if len(cleaned) == 0:
-        empty = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
-        return empty, go.Figure(), go.Figure(), "No valid text"
-    results = _predict_batch(cleaned, model_choice)
-    df = pd.DataFrame(results)
-    fig_bar, fig_pie, info_md = _make_figures(df)
-    info_md = f"{info_md}  \n- Skipped: {skipped}"
-    return df, fig_bar, fig_pie, info_md
-# ---------- Gradio UI ----------
-AVAILABLE_CHOICES = ["WCB", "WCB_BiLSTM", "WCB_CNN_BiLSTM", "WCB_4Layer_BiLSTM"]
-if DEFAULT_MODEL not in AVAILABLE_CHOICES:
-    DEFAULT_MODEL = "WCB"
-with gr.Blocks(title="Thai Sentiment GUI") as demo:
-    gr.Markdown("### Thai Sentiment (WangchanBERTa Variants)")
-    model_radio = gr.Radio(choices=AVAILABLE_CHOICES, value=DEFAULT_MODEL, label="เลือกโมเดล")
-    with gr.Tab("Single"):
-        t1 = gr.Textbox(lines=3, label="ข้อความรีวิว (1 ข้อความ)")
-        probs = gr.Label(label="Probabilities")
-        pred  = gr.Textbox(label="Prediction", interactive=False)
-        gr.Button("Predict").click(predict_one, [t1, model_radio], [probs, pred])
-    with gr.Tab("Batch (หลายข้อความ)"):
-        t2 = gr.Textbox(lines=8, label="พิมพ์หลายรีวิว (บรรทัดละ 1 รีวิว)")
-        df2  = gr.Dataframe(label="ผลลัพธ์", interactive=False)
-        bar2 = gr.Plot(label="Label counts (bar)")
-        pie2 = gr.Plot(label="Label share (pie)")
-        sum2 = gr.Markdown()
-        gr.Button("Run Batch").click(predict_many, [t2, model_radio], [df2, bar2, pie2, sum2])
-if __name__ == "__main__":
-    demo.launch()

+import os, json, importlib.util, tempfile, traceback, torch, re, math
+import torch.nn as nn
+import torch.nn.functional as F
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
+from transformers import AutoTokenizer, AutoModel
+# ===== Settings =====
# Hub repository that hosts the model folders and common/models.py;
# can be overridden through the REPO_ID environment variable.
REPO_ID = os.environ.get("REPO_ID", "Dusit-P/thai-sentiment")
# Model variant selected by default (DEFAULT_MODEL environment variable).
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "WCB")
# Optional Hub access token; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# ---- theme colors ----
NEG_COLOR = "#F87171"  # red-400
POS_COLOR = "#34D399"  # emerald-400
TEMPLATE = "plotly_white"

# Process-wide memo for the imported models module and loaded model bundles.
CACHE = {}
+# ---------- load models from common/models.py ----------
def _import_models():
    """Download ``common/models.py`` from ``REPO_ID`` and import it as a module.

    The imported module is stored in ``CACHE["models_module"]`` so later calls
    return it without downloading or executing the file again.

    NOTE(review): this executes Python source fetched from the Hub repository;
    ``REPO_ID`` should only ever point at a repository you trust.
    """
    try:
        return CACHE["models_module"]
    except KeyError:
        pass  # first call: fall through and import the module

    source_path = hf_hub_download(REPO_ID, filename="common/models.py", token=HF_TOKEN)
    module_spec = importlib.util.spec_from_file_location("models", source_path)
    module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)

    CACHE["models_module"] = module
    return module
def load_model(model_name: str):
    """Download, build and cache one model variant from ``REPO_ID``.

    Args:
        model_name: Folder name inside the Hub repository that contains
            ``config.json`` and ``model.safetensors`` for the variant.

    Returns:
        A ``(model, tokenizer, config_dict)`` tuple. The model has been put
        into eval mode. The tuple is memoised in ``CACHE`` under
        ``"model:<model_name>"``, so repeated calls are cheap.

    Warns:
        UserWarning: when the checkpoint and the built architecture disagree
        on parameter names. Loading stays best-effort (``strict=False``), but
        the mismatch is reported instead of being silently ignored.
    """
    # Local import so this function does not depend on the file's import block.
    import warnings

    key = f"model:{model_name}"
    if key in CACHE:
        return CACHE[key]

    cfg_path = hf_hub_download(REPO_ID, filename=f"{model_name}/config.json", token=HF_TOKEN)
    w_path = hf_hub_download(REPO_ID, filename=f"{model_name}/model.safetensors", token=HF_TOKEN)
    with open(cfg_path, "r", encoding="utf-8") as f:
        cfg = json.load(f)

    base_model = cfg.get("base_model", "airesearch/wangchanberta-base-att-spm-uncased")
    arch_name = cfg.get("architecture", model_name)
    tok = AutoTokenizer.from_pretrained(base_model)

    models = _import_models()
    model = models._build(
        arch_name,
        base_model,
        int(cfg.get("num_labels", 2)),
        cfg.get("pooling_after_lstm", "masked_mean"),
    )

    state = load_file(w_path)
    # strict=False lets loading continue on key mismatches; inspect the report
    # so partially initialised weights do not go unnoticed.
    load_report = model.load_state_dict(state, strict=False)
    missing = list(getattr(load_report, "missing_keys", ()))
    unexpected = list(getattr(load_report, "unexpected_keys", ()))
    if missing or unexpected:
        warnings.warn(
            f"Checkpoint for {model_name!r} does not fully match the architecture: "
            f"missing keys={missing}, unexpected keys={unexpected}",
            stacklevel=2,
        )
    model.eval()

    CACHE[key] = (model, tok, cfg)
    return CACHE[key]
+# ---------- helpers ----------
+def _format_pct(x: float) -> str:
+    return f"{x*100:.2f}%"
+_INVALID_STRINGS = {"-", "--", "—", "n/a", "na", "null", "none", "nan", ".", "…", ""}
+_RE_HAS_LETTER = re.compile(r"[ก-๙A-Za-z]")
+def _norm_text(v) -> str:
+    if v is None: return ""
+    if isinstance(v, float) and math.isnan(v): return ""
+    return str(v).strip()
def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
    """Decide whether ``s`` is worth sending to the model.

    A string is rejected when it is empty, is one of the placeholder values in
    ``_INVALID_STRINGS`` (case-insensitive), contains no character matched by
    ``_RE_HAS_LETTER``, or has fewer than ``min_chars`` characters once space
    characters are ignored.
    """
    if not s or s.lower() in _INVALID_STRINGS:
        return False
    if _RE_HAS_LETTER.search(s) is None:
        return False
    # Only the space character is discounted, matching the length rule above.
    non_space_len = len(s) - s.count(" ")
    return non_space_len >= min_chars
def _clean_texts(texts):
    """Normalise every entry and keep only the substantive ones.

    Returns:
        ``(kept, skipped)`` where ``kept`` is the list of normalised strings
        that passed ``_is_substantive_text`` and ``skipped`` is how many
        entries were dropped.
    """
    normalised = list(map(_norm_text, texts))
    kept = list(filter(_is_substantive_text, normalised))
    return kept, len(normalised) - len(kept)
def _make_figures(df: pd.DataFrame):
    """Build the summary charts and Markdown text for a prediction table.

    Args:
        df: Table with a ``label`` column and the percentage-string columns
            ``negative(%)`` and ``positive(%)``.

    Returns:
        ``(fig_bar, fig_pie, info)``: a bar chart of label counts, a donut
        chart of label share, and a Markdown summary string.
    """
    labels = df["label"]
    counts = {
        "negative": int((labels == "negative").sum()),
        "positive": int((labels == "positive").sum()),
    }

    def _mean_pct(column: str) -> float:
        # Percent strings such as "12.34%" -> numeric mean; bad cells become NaN.
        return pd.to_numeric(df[column].str.rstrip("%"), errors="coerce").mean()

    neg_avg = _mean_pct("negative(%)")
    pos_avg = _mean_pct("positive(%)")

    # Two trailing spaces before the newline force a Markdown line break.
    info = "  \n".join([
        "**Summary**",
        f"- Total: {len(df)}",
        f"- Negative: {counts['negative']}",
        f"- Positive: {counts['positive']}",
        f"- Avg negative: {neg_avg:.2f}%",
        f"- Avg positive: {pos_avg:.2f}%",
    ])

    palette = {"negative": NEG_COLOR, "positive": POS_COLOR}

    fig_bar = go.Figure()
    for name, count in counts.items():
        fig_bar.add_bar(name=name, x=[name], y=[count], marker_color=palette[name])
    fig_bar.update_layout(barmode="group", title="Label counts", template=TEMPLATE)

    donut = go.Pie(
        labels=list(counts),
        values=list(counts.values()),
        hole=0.35,
        sort=False,
        marker=dict(colors=[NEG_COLOR, POS_COLOR]),
    )
    fig_pie = go.Figure(donut)
    fig_pie.update_layout(title="Label share", template=TEMPLATE)

    return fig_bar, fig_pie, info
+# ---------- core prediction ----------
def _predict_batch(texts, model_name, batch_size=32):
    """Run the chosen model over ``texts`` in mini-batches.

    Args:
        texts: List of already-cleaned strings.
        model_name: Variant name passed to ``load_model``.
        batch_size: Number of texts tokenised and scored per forward pass.

    Returns:
        One dict per input text with the keys ``review``, ``negative(%)``,
        ``positive(%)`` and ``label``, in input order.
    """
    model, tok, cfg = load_model(model_name)
    max_len = cfg.get("max_length", 128)

    rows = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tok(
            batch,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        with torch.no_grad():
            logits = model(encoded["input_ids"], encoded["attention_mask"])
            scores = F.softmax(logits, dim=1).cpu().numpy()

        for review, score in zip(batch, scores):
            # Index 0 is read as the negative class, index 1 as the positive class.
            neg = float(score[0])
            pos = float(score[1])
            rows.append({
                "review": review,
                "negative(%)": _format_pct(neg),
                "positive(%)": _format_pct(pos),
                # Ties go to "positive".
                "label": "positive" if pos >= neg else "negative",
            })
    return rows
def predict_one(text: str, model_choice: str):
    """Classify a single review for the "Single" tab.

    Returns:
        ``(probabilities, label)``. ``probabilities`` maps ``"negative"`` and
        ``"positive"`` to fractions recovered from the formatted percentage
        strings. For input that is not substantive text the probabilities are
        both ``0.0`` and the label is ``"invalid"``.
    """
    cleaned = _norm_text(text)
    if _is_substantive_text(cleaned):
        row = _predict_batch([cleaned], model_choice)[0]
        scores = {
            name: float(row[f"{name}(%)"].rstrip("%")) / 100.0
            for name in ("negative", "positive")
        }
        return scores, row["label"]
    return {"negative": 0.0, "positive": 0.0}, "invalid"
def predict_many(text_block: str, model_choice: str):
    """Classify a block of text, one review per line, for the "Batch" tab.

    Returns:
        ``(dataframe, bar_figure, pie_figure, markdown_summary)``. When no
        line survives cleaning, an empty table with the result columns, two
        empty figures and the message ``"No valid text"`` are returned.
    """
    lines = (text_block or "").splitlines()
    reviews, n_skipped = _clean_texts(lines)

    if not reviews:
        columns = ["review", "negative(%)", "positive(%)", "label"]
        return pd.DataFrame(columns=columns), go.Figure(), go.Figure(), "No valid text"

    table = pd.DataFrame(_predict_batch(reviews, model_choice))
    fig_bar, fig_pie, summary = _make_figures(table)
    summary = f"{summary}  \n- Skipped: {n_skipped}"
    return table, fig_bar, fig_pie, summary
+# ---------- Gradio UI ----------
AVAILABLE_CHOICES = ["WCB", "WCB_BiLSTM", "WCB_CNN_BiLSTM", "WCB_4Layer_BiLSTM"]
# Fall back to the plain "WCB" variant when the configured default is unknown.
DEFAULT_MODEL = DEFAULT_MODEL if DEFAULT_MODEL in AVAILABLE_CHOICES else "WCB"

with gr.Blocks(title="Thai Sentiment GUI") as demo:
    gr.Markdown("### Thai Sentiment (WangchanBERTa Variants)")
    # Model selector shared by both tabs (label: "Choose model").
    model_radio = gr.Radio(
        choices=AVAILABLE_CHOICES,
        value=DEFAULT_MODEL,
        label="เลือกโมเดล",
    )

    # Single-review tab: one text in, class probabilities and label out.
    with gr.Tab("Single"):
        t1 = gr.Textbox(lines=3, label="ข้อความรีวิว (1 ข้อความ)")
        probs = gr.Label(label="Probabilities")
        pred = gr.Textbox(label="Prediction", interactive=False)
        single_btn = gr.Button("Predict")
        single_btn.click(
            fn=predict_one,
            inputs=[t1, model_radio],
            outputs=[probs, pred],
        )

    # Batch tab: one review per line in, table + charts + summary out.
    with gr.Tab("Batch (หลายข้อความ)"):
        t2 = gr.Textbox(lines=8, label="พิมพ์หลายรีวิว (บรรทัดละ 1 รีวิว)")
        df2 = gr.Dataframe(label="ผลลัพธ์", interactive=False)
        bar2 = gr.Plot(label="Label counts (bar)")
        pie2 = gr.Plot(label="Label share (pie)")
        sum2 = gr.Markdown()
        batch_btn = gr.Button("Run Batch")
        batch_btn.click(
            fn=predict_many,
            inputs=[t2, model_radio],
            outputs=[df2, bar2, pie2, sum2],
        )

if __name__ == "__main__":
    demo.launch()