Spaces:

Dusit-P
/

Thai-Sentiment-GUI

Sleeping

App Files Files Community

Dusit-P committed on Oct 3, 2025

Commit

5dbe10a

verified ·

1 Parent(s): c1fbd91

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -52

app.py CHANGED Viewed

@@ -1,26 +1,29 @@
-import os, json, importlib.util, tempfile, traceback, torch, re, math
-import torch.nn as nn
-import torch.nn.functional as F
 import gradio as gr
-import pandas as pd
 import plotly.graph_objects as go
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from transformers import AutoTokenizer, AutoModel
-# ===== Settings =====
-REPO_ID       = os.getenv("REPO_ID", "Dusit-P/thai-sentiment")
-DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "WCB")   # default model
 HF_TOKEN      = os.getenv("HF_TOKEN", None)
-# ---- theme colors ----
-NEG_COLOR = "#F87171"   # red-400
-POS_COLOR = "#34D399"   # emerald-400
 TEMPLATE  = "plotly_white"
 CACHE = {}
-# ---------- load models from common/models.py ----------
 def _import_models():
     if "models_module" in CACHE:
         return CACHE["models_module"]
@@ -44,10 +47,11 @@ def load_model(model_name: str):
     base_model = cfg.get("base_model", "airesearch/wangchanberta-base-att-spm-uncased")
     arch_name  = cfg.get("architecture", model_name)
-    tok = AutoTokenizer.from_pretrained(base_model)
     models = _import_models()
-    model = models._build(arch_name, base_model, int(cfg.get("num_labels",2)), cfg.get("pooling_after_lstm","masked_mean"))
     state = load_file(w_path)
     model.load_state_dict(state, strict=False)
@@ -56,17 +60,14 @@ def load_model(model_name: str):
     CACHE[key] = (model, tok, cfg)
     return CACHE[key]
-# ---------- helpers ----------
-def _format_pct(x: float) -> str:
-    return f"{x*100:.2f}%"
 _INVALID_STRINGS = {"-", "--", "—", "n/a", "na", "null", "none", "nan", ".", "…", ""}
 _RE_HAS_LETTER = re.compile(r"[ก-๙A-Za-z]")
 def _norm_text(v) -> str:
     if v is None: return ""
     if isinstance(v, float) and math.isnan(v): return ""
-    return str(v).strip()
 def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
     if not s: return False
@@ -75,11 +76,8 @@ def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
     if len(s.replace(" ", "")) < min_chars: return False
     return True
-def _clean_texts(texts):
-    all_norm = [_norm_text(t) for t in texts]
-    cleaned = [t for t in all_norm if _is_substantive_text(t)]
-    skipped = len(all_norm) - len(cleaned)
-    return cleaned, skipped
 def _make_figures(df: pd.DataFrame):
     total = len(df)
@@ -110,10 +108,9 @@ def _make_figures(df: pd.DataFrame):
         marker=dict(colors=[NEG_COLOR, POS_COLOR])
     ))
     fig_pie.update_layout(title="Label share", template=TEMPLATE)
     return fig_bar, fig_pie, info
-# ---------- core prediction ----------
 def _predict_batch(texts, model_name, batch_size=32):
     model, tok, cfg = load_model(model_name)
     results = []
@@ -135,37 +132,92 @@ def _predict_batch(texts, model_name, batch_size=32):
             })
     return results
 def predict_one(text: str, model_choice: str):
-    s = _norm_text(text)
-    if not _is_substantive_text(s):
-        return {"negative": 0.0, "positive": 0.0}, "invalid"
-    out = _predict_batch([s], model_choice)[0]
-    probs = {
-        "negative": float(out["negative(%)"].rstrip("%"))/100.0,
-        "positive": float(out["positive(%)"].rstrip("%"))/100.0,
-    }
-    return probs, out["label"]
 def predict_many(text_block: str, model_choice: str):
-    raw_lines = (text_block or "").splitlines()
-    cleaned, skipped = _clean_texts(raw_lines)
-    if len(cleaned) == 0:
         empty = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
-        return empty, go.Figure(), go.Figure(), "No valid text"
-    results = _predict_batch(cleaned, model_choice)
-    df = pd.DataFrame(results)
-    fig_bar, fig_pie, info_md = _make_figures(df)
-    info_md = f"{info_md}  \n- Skipped: {skipped}"
-    return df, fig_bar, fig_pie, info_md
-# ---------- Gradio UI ----------
-AVAILABLE_CHOICES = ["WCB", "WCB_BiLSTM", "WCB_CNN_BiLSTM", "WCB_4Layer_BiLSTM"]
-if DEFAULT_MODEL not in AVAILABLE_CHOICES:
-    DEFAULT_MODEL = "WCB"
-with gr.Blocks(title="Thai Sentiment GUI") as demo:
     gr.Markdown("### Thai Sentiment (WangchanBERTa Variants)")
     model_radio = gr.Radio(choices=AVAILABLE_CHOICES, value=DEFAULT_MODEL, label="เลือกโมเดล")
     with gr.Tab("Single"):
@@ -182,5 +234,15 @@ with gr.Blocks(title="Thai Sentiment GUI") as demo:
         sum2 = gr.Markdown()
         gr.Button("Run Batch").click(predict_many, [t2, model_radio], [df2, bar2, pie2, sum2])
 if __name__ == "__main__":
     demo.launch()

+# app.py — Thai Sentiment (WangchanBERTa Variants) GUI
+import os, json, importlib.util, traceback, sys, re, math, tempfile
 import gradio as gr
+import torch, pandas as pd
+import torch.nn.functional as F
 import plotly.graph_objects as go
 from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
 from transformers import AutoTokenizer, AutoModel
# ================= Settings =================
# Hub repository id, overridable via the REPO_ID environment variable
# (presumably the repo the loader downloads from -- the loader body is not
# fully visible here).  Original note: use the new repo.
REPO_ID = os.environ.get("REPO_ID", "Dusit-P/thai-sentiment")
# Optional token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Model variants offered by the model selector in the UI.
AVAILABLE_CHOICES = ["WCB", "WCB_BiLSTM", "WCB_CNN_BiLSTM", "WCB_4Layer_BiLSTM"]
# Default selection; any value outside AVAILABLE_CHOICES falls back to "WCB".
DEFAULT_MODEL = os.environ.get("DEFAULT_MODEL", "WCB")
DEFAULT_MODEL = DEFAULT_MODEL if DEFAULT_MODEL in AVAILABLE_CHOICES else "WCB"

# Chart colours for the negative / positive classes and the plotly template.
NEG_COLOR = "#F87171"
POS_COLOR = "#34D399"
TEMPLATE = "plotly_white"

# Process-wide memo used by the loader functions (imported module, loaded models).
CACHE = {}
+# ================= Loader =================
 def _import_models():
     if "models_module" in CACHE:
         return CACHE["models_module"]
     base_model = cfg.get("base_model", "airesearch/wangchanberta-base-att-spm-uncased")
     arch_name  = cfg.get("architecture", model_name)
+    tok = AutoTokenizer.from_pretrained(base_model)
     models = _import_models()
+    model = models._build(arch_name, base_model, int(cfg.get("num_labels",2)),
+                          cfg.get("pooling_after_lstm", "masked_mean"))
     state = load_file(w_path)
     model.load_state_dict(state, strict=False)
     CACHE[key] = (model, tok, cfg)
     return CACHE[key]
+# ================= Utils =================
 _INVALID_STRINGS = {"-", "--", "—", "n/a", "na", "null", "none", "nan", ".", "…", ""}
 _RE_HAS_LETTER = re.compile(r"[ก-๙A-Za-z]")
 def _norm_text(v) -> str:
     if v is None: return ""
     if isinstance(v, float) and math.isnan(v): return ""
+    return str(v).strip().strip('"').strip("'").strip(",")
 def _is_substantive_text(s: str, min_chars: int = 2) -> bool:
     if not s: return False
     if len(s.replace(" ", "")) < min_chars: return False
     return True
+def _format_pct(x: float) -> str:
+    return f"{x*100:.2f}%"
 def _make_figures(df: pd.DataFrame):
     total = len(df)
         marker=dict(colors=[NEG_COLOR, POS_COLOR])
     ))
     fig_pie.update_layout(title="Label share", template=TEMPLATE)
     return fig_bar, fig_pie, info
+# ================= Core Predict =================
 def _predict_batch(texts, model_name, batch_size=32):
     model, tok, cfg = load_model(model_name)
     results = []
             })
     return results
# ----- single -----
def predict_one(text: str, model_choice: str):
    """Classify a single piece of text.

    Args:
        text: Raw user input; normalised with ``_norm_text`` before use.
        model_choice: Model variant name forwarded to ``_predict_batch``.

    Returns:
        A ``(probs, label)`` tuple.  ``probs`` always maps ``"negative"`` and
        ``"positive"`` to floats, parsed back from the percentage strings
        produced by ``_predict_batch``.  ``label`` is the predicted label,
        ``"invalid"`` when the input is not substantive text, or
        ``"error: <message>"`` when prediction raised.

    Exceptions are reported through the return value instead of propagating.
    """
    # Keeping the probability dict numeric on every path matters if the first
    # output is a label/confidence widget, which expects float values.
    # NOTE(review): the output wiring is not visible here -- confirm.
    zero_probs = {"negative": 0.0, "positive": 0.0}
    try:
        s = _norm_text(text)
        if not _is_substantive_text(s):
            return zero_probs, "invalid"
        out = _predict_batch([s], model_choice)[0]
        probs = {
            name: float(out[f"{name}(%)"].rstrip("%")) / 100.0
            for name in ("negative", "positive")
        }
        return probs, out["label"]
    except Exception as e:
        # UI boundary: surface the failure as text rather than a string-valued
        # probability dict.
        return zero_probs, f"error: {e}"
# ----- textarea batch -----
def predict_many(text_block: str, model_choice: str):
    """Classify every non-trivial line of a multi-line text block.

    Each line is normalised with ``_norm_text``; lines rejected by
    ``_is_substantive_text`` are dropped and counted as skipped.

    Args:
        text_block: Text with one entry per line (``None`` is treated as empty).
        model_choice: Model variant name forwarded to ``_predict_batch``.

    Returns:
        ``(dataframe, bar_figure, pie_figure, markdown)``.  When no line is
        usable, or when anything raises, the frame is empty with the standard
        result columns, both figures are blank, and the markdown carries the
        message (a formatted traceback in the error case).
    """
    def _blank(message):
        # Empty result set with the standard result columns.
        frame = pd.DataFrame(columns=["review","negative(%)","positive(%)","label"])
        return frame, go.Figure(), go.Figure(), message

    try:
        normalised = [_norm_text(line) for line in (text_block or "").splitlines()]
        kept = list(filter(_is_substantive_text, normalised))
        skipped = len(normalised) - len(kept)
        if not kept:
            return _blank("No valid text")

        df = pd.DataFrame(_predict_batch(kept, model_choice))
        fig_bar, fig_pie, summary = _make_figures(df)
        summary = f"{summary}  \n- Skipped: {skipped}"
        return df, fig_bar, fig_pie, summary
    except Exception:
        return _blank(f"**Error**\n```\n{traceback.format_exc()}\n```")
# ----- CSV upload -----
# Header names (lower-case) tried, in order, when auto-detecting the text column.
LIKELY_TEXT_COLS = ["text","review","message","comment","content","sentence","body"]


def _pick_text_column(df, requested):
    """Choose which column of ``df`` holds the texts.

    Preference order: the exact ``requested`` name if present; the first
    ``LIKELY_TEXT_COLS`` entry matching a header case-insensitively; the first
    object-dtype column; the first column.
    """
    if requested and requested in df.columns:
        return requested

    cols = list(df.columns)
    # str() guards against non-string labels; setdefault keeps the first
    # column when two headers differ only by case or padding.
    by_lower = {}
    for c in cols:
        by_lower.setdefault(str(c).strip().lower(), c)
    for candidate in LIKELY_TEXT_COLS:
        if candidate in by_lower:
            return by_lower[candidate]

    for c in cols:
        if df[c].dtype == object:
            return c
    return cols[0]


def predict_csv(file_obj, model_choice: str, text_col_name: str):
    """Run sentiment prediction over one column of an uploaded CSV.

    Args:
        file_obj: The uploaded file: either an object exposing the path as
            ``.name`` or a plain path string.  ``None`` means nothing uploaded.
        model_choice: Model variant name forwarded to ``_predict_batch``.
        text_col_name: Optional column name; blank (or unknown) triggers
            auto-detection via ``_pick_text_column``.

    Returns:
        ``(dataframe, bar_figure, pie_figure, markdown, csv_path)``.
        ``csv_path`` points at a temp file with the predictions (UTF-8 with
        BOM), or is ``None`` when there is nothing to download.  Failures are
        reported as a traceback in the markdown instead of being raised.
    """
    try:
        if file_obj is None:
            return pd.DataFrame(), go.Figure(), go.Figure(), "Please upload a CSV.", None

        # Accept both a temp-file wrapper (path in .name) and a bare path.
        csv_path = getattr(file_obj, "name", file_obj)
        df = pd.read_csv(csv_path)

        requested = (text_col_name or "").strip()
        col = _pick_text_column(df, requested)

        # clean & predict
        normalised = [_norm_text(v) for v in df[col].tolist()]
        texts = [t for t in normalised if _is_substantive_text(t)]
        skipped = len(normalised) - len(texts)
        if not texts:
            return pd.DataFrame(), go.Figure(), go.Figure(), "No valid texts in selected column.", None

        out_df = pd.DataFrame(_predict_batch(texts, model_choice))
        fig_bar, fig_pie, info_md = _make_figures(out_df)

        # write downloadable csv; the file must outlive this call so the UI
        # can serve it, hence mkstemp rather than a self-deleting temp file.
        fd, out_path = tempfile.mkstemp(prefix="pred_", suffix=".csv")
        os.close(fd)
        try:
            out_df.to_csv(out_path, index=False, encoding="utf-8-sig")
        except Exception:
            # Do not leave an orphaned temp file behind when the export fails.
            os.remove(out_path)
            raise

        info_md = f"{info_md}  \n- Column used: **{col}**  \n- Skipped: {skipped}"
        if requested and requested != col:
            # Tell the user their override was not honoured.
            info_md += f"  \n- Column **{requested}** not found; auto-detected instead."
        return out_df, fig_bar, fig_pie, info_md, out_path
    except Exception:
        tb = traceback.format_exc()
        return pd.DataFrame(), go.Figure(), go.Figure(), f"**Error**\n```\n{tb}\n```", None
+# ================= Gradio UI =================
+with gr.Blocks(title="Thai Sentiment (WangchanBERTa Variants)") as demo:
     gr.Markdown("### Thai Sentiment (WangchanBERTa Variants)")
     model_radio = gr.Radio(choices=AVAILABLE_CHOICES, value=DEFAULT_MODEL, label="เลือกโมเดล")
     with gr.Tab("Single"):
         sum2 = gr.Markdown()
         gr.Button("Run Batch").click(predict_many, [t2, model_radio], [df2, bar2, pie2, sum2])
+    with gr.Tab("CSV Upload"):
+        file_in = gr.File(label="อัปโหลดไฟล์ .csv", file_types=[".csv"])
+        col_in  = gr.Textbox(label="ชื่อคอลัมน์ข้อความ (เว้นว่างให้เลือกอัตโนมัติได้)", value="")
+        df3  = gr.Dataframe(label="ผลลัพธ์", interactive=False)
+        bar3 = gr.Plot(label="Label counts (bar)")
+        pie3 = gr.Plot(label="Label share (pie)")
+        sum3 = gr.Markdown()
+        dl3  = gr.File(label="ดาวน์โหลดผลเป็น CSV", interactive=False)
+        gr.Button("Predict CSV").click(predict_csv, [file_in, model_radio, col_in], [df3, bar3, pie3, sum3, dl3])
 if __name__ == "__main__":
     demo.launch()