Dusit-P committed on
Commit
3d347ae
·
verified ·
1 Parent(s): f8698d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +239 -64
app.py CHANGED
@@ -1,64 +1,239 @@
1
- import os, json, importlib.util, torch
2
- import torch.nn.functional as F
3
- import gradio as gr
4
- from huggingface_hub import hf_hub_download
5
- from safetensors.torch import load_file
6
- from transformers import AutoTokenizer
7
-
8
- # ===== ปรับได้ผ่าน Settings > Variables (Environment) =====
9
- REPO_ID = os.getenv("REPO_ID", "Dusit-P/thai-sentiment-wcb")
10
- DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "cnn_bilstm") # หรื"baseline"
11
- HF_TOKEN = os.getenv("HF_TOKEN", None) # ถ้าโมเดลเป็น private ให้เพิ่ม secret ชื่อนี้
12
-
13
- CACHE = {}
14
-
15
- def _import_models():
16
- if "models_module" in CACHE:
17
- return CACHE["models_module"]
18
- models_py = hf_hub_download(REPO_ID, filename="common/models.py", token=HF_TOKEN)
19
- spec = importlib.util.spec_from_file_location("models", models_py)
20
- mod = importlib.util.module_from_spec(spec)
21
- spec.loader.exec_module(mod)
22
- CACHE["models_module"] = mod
23
- return mod
24
-
25
- def load_model(model_name: str):
26
- key = f"model:{model_name}"
27
- if key in CACHE:
28
- return CACHE[key]
29
- cfg_path = hf_hub_download(REPO_ID, filename=f"{model_name}/config.json", token=HF_TOKEN)
30
- w_path = hf_hub_download(REPO_ID, filename=f"{model_name}/model.safetensors", token=HF_TOKEN)
31
- with open(cfg_path, "r", encoding="utf-8") as f:
32
- cfg = json.load(f)
33
- models = _import_models()
34
- tok = AutoTokenizer.from_pretrained(cfg["base_model"])
35
- model = models.create_model_by_name(cfg["arch"])
36
- state = load_file(w_path)
37
- model.load_state_dict(state, strict=True)
38
- model.eval()
39
- CACHE[key] = (model, tok, cfg)
40
- return CACHE[key]
41
-
42
- def predict_api(text: str, model_choice: str):
43
- if not text.strip():
44
- return {"negative": 0.0, "positive": 0.0}, ""
45
- model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
46
- model, tok, cfg = load_model(model_name)
47
- enc = tok([text], padding=True, truncation=True, max_length=cfg["max_len"], return_tensors="pt")
48
- with torch.no_grad():
49
- logits = model(enc["input_ids"], enc["attention_mask"])
50
- probs = F.softmax(logits, dim=1)[0].tolist()
51
- out = {"negative": float(probs[0]), "positive": float(probs[1])}
52
- label = "positive" if out["positive"] >= out["negative"] else "negative"
53
- return out, label
54
-
55
- with gr.Blocks(title="Thai Sentiment API (Dusit-P)") as demo:
56
- gr.Markdown("### Thai Sentiment (WangchanBERTa + LSTM Heads)")
57
- inp_text = gr.Textbox(lines=3, label="ข้อความรีวิวภาษาไทย", placeholder="พิมพ์รีวิวที่นี่")
58
- inp_model = gr.Radio(choices=["cnn_bilstm","baseline"], value=DEFAULT_MODEL, label="เลือกโมเดล")
59
- out_probs = gr.Label(label="Probabilities")
60
- out_label = gr.Textbox(label="Prediction", interactive=False)
61
- gr.Button("Predict").click(predict_api, [inp_text, inp_model], [out_probs, out_label])
62
-
63
- if __name__ == "__main__":
64
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, importlib.util, tempfile, torch
2
+ import torch.nn.functional as F
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import plotly.graph_objects as go
6
+ from huggingface_hub import hf_hub_download
7
+ from safetensors.torch import load_file
8
+ from transformers import AutoTokenizer
9
+
10
+ # ===== ปรับได้จาก Settings > Variables & secrets ข Space =====
11
+ REPO_ID = os.getenv("REPO_ID", "Dusit-P/thai-sentiment-wcb")
12
+ DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "cnn_bilstm") # หรือ "baseline"
13
+ HF_TOKEN = os.getenv("HF_TOKEN", None) # ถ้าโมเดลเป็น private ให้เพิ่ม secret ชื่อนี้
14
+
15
+ CACHE = {}
16
+
17
+ # ---------- load architecture & weights from model repo ----------
18
def _import_models():
    """Fetch ``common/models.py`` from the model repo and import it as a module.

    The resulting module is memoized in CACHE so the download and exec
    happen at most once per process.
    """
    cached = CACHE.get("models_module")
    if cached is not None:
        return cached
    path = hf_hub_download(REPO_ID, filename="common/models.py", token=HF_TOKEN)
    module_spec = importlib.util.spec_from_file_location("models", path)
    module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(module)
    CACHE["models_module"] = module
    return module
27
+
28
def load_model(model_name: str):
    """Load and cache (model, tokenizer, config) for *model_name*.

    Downloads ``{model_name}/config.json`` and ``{model_name}/model.safetensors``
    from REPO_ID, rebuilds the architecture through the repo's models module,
    loads the weights strictly, and returns the model in eval mode.
    """
    cache_key = f"model:{model_name}"
    cached = CACHE.get(cache_key)
    if cached is not None:
        return cached

    config_file = hf_hub_download(REPO_ID, filename=f"{model_name}/config.json", token=HF_TOKEN)
    weights_file = hf_hub_download(REPO_ID, filename=f"{model_name}/model.safetensors", token=HF_TOKEN)

    with open(config_file, "r", encoding="utf-8") as fh:
        config = json.load(fh)

    arch_module = _import_models()
    tokenizer = AutoTokenizer.from_pretrained(config["base_model"])
    net = arch_module.create_model_by_name(config["arch"])
    net.load_state_dict(load_file(weights_file), strict=True)
    net.eval()

    CACHE[cache_key] = (net, tokenizer, config)
    return CACHE[cache_key]
47
+
48
+ # ---------- helpers ----------
49
+ def _format_pct(x: float) -> str:
50
+ return f"{x*100:.2f}%"
51
+
52
def _predict_batch(texts, model_name, batch_size=64):
    """Score an iterable of texts; return one result dict per non-blank text.

    Each dict has keys ``review``, ``negative(%)``, ``positive(%)``, ``label``.
    NOTE(review): blank/whitespace-only entries are dropped, so the output
    list may be shorter than the input — callers that zip results back to
    their source rows must pre-filter.
    """
    net, tokenizer, cfg = load_model(model_name)
    cleaned = [str(t) for t in texts if str(t).strip()]
    out = []
    for start in range(0, len(cleaned), batch_size):
        batch = cleaned[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True,
                            max_length=cfg["max_len"], return_tensors="pt")
        with torch.no_grad():
            logits = net(encoded["input_ids"], encoded["attention_mask"])
            scores = F.softmax(logits, dim=1).cpu().numpy()
        for text, pair in zip(batch, scores):
            negative, positive = float(pair[0]), float(pair[1])
            out.append({
                "review": text,
                "negative(%)": _format_pct(negative),
                "positive(%)": _format_pct(positive),
                "label": "positive" if positive >= negative else "negative",
            })
    return out
73
+
74
+ def _detect_cols(df: pd.DataFrame):
75
+ """เดาชื่อคอลัมน์รีวิว/ร้านอัตโนมัติ ถ้าไม่พบรีวิว เลือกคอลัมน์ object ตัวแรก"""
76
+ rev_cands = ["review", "text", "comment", "content", "message", "ข้อความ", "รีวิว"]
77
+ shop_cands = ["shop", "shop_name", "store", "restaurant", "brand", "merchant", "ชื่อร้าน"]
78
+
79
+ review_col = next((c for c in rev_cands if c in df.columns), None)
80
+ shop_col = next((c for c in shop_cands if c in df.columns), None)
81
+
82
+ if review_col is None:
83
+ obj_cols = [c for c in df.columns if df[c].dtype == object]
84
+ if obj_cols:
85
+ review_col = obj_cols[0]
86
+
87
+ return review_col, shop_col
88
+
89
+ def _summarize_df(df: pd.DataFrame):
90
+ """สรุปภาพรวม + ตัวเลขเฉลี่ยความมั่นใจ"""
91
+ total = len(df)
92
+ neg = int((df["label"] == "negative").sum())
93
+ pos = int((df["label"] == "positive").sum())
94
+ neg_avg = pd.to_numeric(df["negative(%)"].str.rstrip("%"), errors="coerce").mean()
95
+ pos_avg = pd.to_numeric(df["positive(%)"].str.rstrip("%"), errors="coerce").mean()
96
+ info = (
97
+ f"**Summary** \n"
98
+ f"- Total: {total} \n"
99
+ f"- Negative: {neg} \n"
100
+ f"- Positive: {pos} \n"
101
+ f"- Avg negative: {neg_avg:.2f}% \n"
102
+ f"- Avg positive: {pos_avg:.2f}%"
103
+ )
104
+ return {"total": total, "neg": neg, "pos": pos, "neg_avg": neg_avg, "pos_avg": pos_avg, "md": info}
105
+
106
def _make_figures(df: pd.DataFrame):
    """Build the overall bar and pie figures plus the summary markdown for *df*."""
    summary = _summarize_df(df)
    labels = ["negative", "positive"]
    values = [summary["neg"], summary["pos"]]

    # Overall count bar chart.
    bar_fig = go.Figure([go.Bar(x=labels, y=values)])
    bar_fig.update_layout(title="Label counts", xaxis_title="label", yaxis_title="count")

    # Overall share donut chart.
    pie_fig = go.Figure(go.Pie(labels=labels, values=values, hole=0.35))
    pie_fig.update_layout(title="Label share")

    return bar_fig, pie_fig, summary["md"]
115
+
116
def _shop_summary(out_df: pd.DataFrame, max_shops=15):
    """Per-shop breakdown: stacked bar figure + summary table.

    Returns an empty figure and empty table when *out_df* has no ``shop``
    column. The figure shows only the top *max_shops* shops by review count.
    """
    if "shop" not in out_df.columns:
        empty = pd.DataFrame(columns=["shop","total","positive","negative","positive_rate(%)","negative_rate(%)"])
        return go.Figure(), empty

    counts = out_df.groupby("shop")["label"].value_counts().unstack(fill_value=0)
    # Guarantee both label columns exist even when one class is absent.
    for needed in ("positive", "negative"):
        if needed not in counts.columns:
            counts[needed] = 0
    counts["total"] = counts["positive"] + counts["negative"]
    counts = counts.sort_values("total", ascending=False)

    table = counts[["total", "positive", "negative"]].copy()
    table["positive_rate(%)"] = (table["positive"] / table["total"] * 100).round(2)
    table["negative_rate(%)"] = (table["negative"] / table["total"] * 100).round(2)
    table = table.reset_index().rename(columns={"index": "shop"})

    # Chart only the top-N shops by total review count.
    top = table.head(max_shops)
    fig = go.Figure()
    for lbl in ("positive", "negative"):
        fig.add_bar(name=lbl, x=top["shop"], y=top[lbl])
    fig.update_layout(barmode="stack", title=f"Per-shop counts (top {len(top)})",
                      xaxis_title="shop", yaxis_title="count", legend_title="label")
    return fig, table
141
+
142
+ # ---------- API wrappers ----------
143
def predict_one(text: str, model_choice: str):
    """Classify a single review.

    Args:
        text: the review text; None/blank short-circuits to zero scores.
        model_choice: "baseline" selects the baseline model, anything else
            selects "cnn_bilstm".

    Returns:
        (probs, label): probs maps "negative"/"positive" to floats in [0, 1];
        label is the winning class name ("" for blank input).
    """
    # Guard against None as well as empty/whitespace input — Gradio can hand
    # over None when the textbox was never touched; the original crashed here.
    if not (text or "").strip():
        return {"negative": 0.0, "positive": 0.0}, ""
    model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
    row = _predict_batch([text], model_name)[0]
    # _predict_batch returns formatted percent strings; convert back to
    # [0, 1] fractions for the gr.Label component.
    probs = {
        "negative": float(row["negative(%)"].rstrip("%")) / 100.0,
        "positive": float(row["positive(%)"].rstrip("%")) / 100.0,
    }
    return probs, row["label"]
153
+
154
def predict_many(text_block: str, model_choice: str):
    """Classify one review per line of *text_block*.

    Returns (results dataframe, bar figure, pie figure, summary markdown);
    empty input yields empty figures and the string "No data".
    """
    model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
    reviews = [line.strip() for line in (text_block or "").splitlines() if line.strip()]
    table = pd.DataFrame(_predict_batch(reviews, model_name),
                         columns=["review","negative(%)","positive(%)","label"])
    if table.empty:
        return table, go.Figure(), go.Figure(), "No data"

    bar_fig, pie_fig, summary_md = _make_figures(table)
    return table, bar_fig, pie_fig, summary_md
164
+
165
def predict_csv(file_obj, model_choice: str, review_col_override: str = "", shop_col_override: str = ""):
    """Classify every review in an uploaded CSV.

    Args:
        file_obj: Gradio file object (``.name`` is the uploaded path); None
            returns empty outputs with a Thai "please upload" message.
        model_choice: "baseline" or anything else for "cnn_bilstm".
        review_col_override / shop_col_override: explicit column names;
            blank falls back to auto-detection via _detect_cols.

    Returns:
        (results df, download path, bar fig, pie fig, per-shop fig,
         per-shop table, info markdown).

    Raises:
        ValueError: when the review column cannot be found in the CSV.
    """
    if file_obj is None:
        return pd.DataFrame(), None, go.Figure(), go.Figure(), go.Figure(), pd.DataFrame(), "กรุณาอัปโหลดไฟล์ CSV"

    model_name = "baseline" if model_choice == "baseline" else "cnn_bilstm"
    df = pd.read_csv(file_obj.name)

    auto_rev, auto_shop = _detect_cols(df)
    rev_col = (review_col_override or "").strip() or auto_rev
    shop_col = (shop_col_override or "").strip() or auto_shop

    if rev_col not in df.columns:
        raise ValueError(f"ไม่พบคอลัมน์รีวิว '{rev_col}' ใน CSV (columns = {list(df.columns)})")

    # BUGFIX: drop blank reviews BEFORE predicting. _predict_batch silently
    # skips blank rows, which previously misaligned the results with
    # df[shop_col] and made out.insert raise on a length mismatch.
    keep = df[rev_col].astype(str).str.strip() != ""
    df = df[keep].reset_index(drop=True)

    results = _predict_batch(df[rev_col].astype(str).tolist(), model_name)
    out = pd.DataFrame(results, columns=["review","negative(%)","positive(%)","label"])

    if shop_col and shop_col in df.columns:
        out.insert(0, "shop", df[shop_col].astype(str).fillna(""))

    # Result file for download. Close the handle before pandas re-opens the
    # path: required on Windows and avoids leaking a file descriptor.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()
    out.to_csv(tmp.name, index=False, encoding="utf-8-sig")

    # Overall figures / summary.
    fig_bar, fig_pie, info_md = _make_figures(out)
    # Per-shop figure/table (populated only when a shop column exists).
    fig_shop, tbl_shop = _shop_summary(out)

    # Append which columns were used (Thai user-facing text preserved).
    info_md = f"{info_md} \nใช้คอลัมน์รีวิว: {rev_col}" + (f" | คอลัมน์ร้าน: {shop_col}" if ("shop" in out.columns) else " | ไม่มีคอลัมน์ร้าน")

    return out, tmp.name, fig_bar, fig_pie, fig_shop, tbl_shop, info_md
198
+
199
+ # ---------- Gradio UI ----------
200
# Gradio UI: three tabs (single text, multi-line batch, CSV upload) sharing
# one model selector. Thai strings are user-facing labels and must stay as-is.
with gr.Blocks(title="Thai Sentiment API (Dusit-P)") as demo:
    gr.Markdown("### Thai Sentiment (WangchanBERTa + LSTM Heads)")

    # Shared model selector wired into all three tabs' click handlers.
    model_radio = gr.Radio(choices=["cnn_bilstm","baseline"], value=DEFAULT_MODEL, label="เลือกโมเดล")

    # Tab 1: classify a single review.
    with gr.Tab("Single"):
        t1 = gr.Textbox(lines=3, label="ข้อความรีวิว (1 ข้อความ)")
        probs = gr.Label(label="Probabilities")
        pred = gr.Textbox(label="Prediction", interactive=False)
        gr.Button("Predict").click(predict_one, [t1, model_radio], [probs, pred])

    # Tab 2: one review per line, with aggregate bar/pie charts and summary.
    with gr.Tab("Batch (หลายข้อความ)"):
        t2 = gr.Textbox(lines=8, label="พิมพ์หลายรีวิว (บรรทัดละ 1 รีวิว)")
        df2 = gr.Dataframe(label="ผลลัพธ์", interactive=False)
        bar2 = gr.Plot(label="Label counts (bar)")
        pie2 = gr.Plot(label="Label share (pie)")
        sum2 = gr.Markdown()
        gr.Button("Run Batch").click(predict_many, [t2, model_radio], [df2, bar2, pie2, sum2])

    # Tab 3: CSV upload with auto-detected (or overridden) review/shop
    # columns, a downloadable result file, and a per-shop breakdown.
    with gr.Tab("CSV (auto-detect columns)"):
        f = gr.File(label="อัปโหลด CSV", file_types=[".csv"])
        review_col_inp = gr.Textbox(label="ชื่อคอลัมน์รีวิว (เว้นว่างให้เดาได้)")
        shop_col_inp = gr.Textbox(label="ชื่อคอลัมน์ร้าน (เว้นว่างได้)")

        df3 = gr.Dataframe(label="ผลลัพธ์ CSV", interactive=False)
        download = gr.File(label="ดาวน์โหลดผลลัพธ์")
        bar3 = gr.Plot(label="Label counts (bar)")
        pie3 = gr.Plot(label="Label share (pie)")
        shop_bar = gr.Plot(label="Per-shop stacked bar")
        shop_tbl = gr.Dataframe(label="Per-shop summary", interactive=False)
        info = gr.Markdown()

        gr.Button("Run CSV").click(
            predict_csv,
            inputs=[f, model_radio, review_col_inp, shop_col_inp],
            outputs=[df3, download, bar3, pie3, shop_bar, shop_tbl, info]
        )

if __name__ == "__main__":
    demo.launch()