# Hugging Face Space file by seedflora.
# Commit message: "Initial space deploy"
# Commit 7f0ea09 (verified)
import json
import os
import re
from collections import Counter
from pathlib import Path
import gradio as gr
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from wordcloud import WordCloud
# Model identifier handed to the tokenizer and model `from_pretrained` calls.
MODEL_ID = "seedflora/ev-sentiment"
# Excel workbook read by load_dataset() for the analytics views.
DATA_PATH = "data.xlsx"
# Dataset column whose text feeds the word-frequency chart and word cloud.
TEXT_COL = "clean_text_formal"
# Dataset column holding each row's class label.
LABEL_COL = "label"
# CSV file read by load_results() for the model comparison chart.
RESULTS_PATH = "results.csv"
def load_label_map(model_dir: Path):
    """Read ``label_map.json`` from a model directory.

    Args:
        model_dir: Directory expected to contain ``label_map.json``. A plain
            string path is accepted as well as a ``Path``.

    Returns:
        The decoded JSON content, or ``None`` when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    label_map_path = Path(model_dir) / "label_map.json"
    # Open directly instead of checking exists() first: the file could vanish
    # between the check and the open.
    try:
        with label_map_path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
# Load the tokenizer and the classification model once at import time;
# predict() reuses these module-level objects on every call.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
# Switch the model to evaluation mode; this file only runs it for prediction.
MODEL.eval()
# Label lookup taken from the model config; predict() indexes it by class position.
ID2LABEL = MODEL.config.id2label
# Words excluded from the token statistics by _tokenize(): Indonesian function
# words and pronouns, followed by a few English ones.
STOPWORDS = {
    # Indonesian
    "yang", "dan", "di", "ke", "dari", "untuk", "pada", "ini", "itu", "atau",
    "juga", "dengan", "karena", "bahwa", "sudah", "belum", "tidak", "bukan",
    "jadi", "agar", "sebagai", "lebih", "paling", "seperti", "saja", "masih",
    "bisa", "dapat", "akan", "kami", "kita", "saya", "anda", "mereka", "aku",
    "dia", "kamu", "nya",
    # English
    "the", "a", "an", "is", "are", "of", "to", "in", "for", "on", "it",
}
def load_dataset():
    """Load the labelled dataset used by the analytics views.

    Reads ``DATA_PATH`` with ``pandas.read_excel``, keeps only ``TEXT_COL`` and
    ``LABEL_COL``, drops rows with missing values and casts the text column to
    ``str``.

    Returns:
        A ``(df, label_name)`` tuple. ``label_name`` maps each label value to a
        display name: ``Negatif``/``Positif`` when the labels are exactly
        ``{0, 2}`` or ``{0, 1}``, otherwise ``"Label <value>"``. Returns
        ``(None, {})`` when the file is missing or a required column is absent.
    """
    path = Path(DATA_PATH)
    if not path.exists():
        return None, {}

    df = pd.read_excel(path)
    if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
        return None, {}

    # Take an explicit copy so the column assignment below writes to an
    # independent frame instead of a possible view of the sheet (avoids
    # pandas' chained-assignment warning).
    df = df[[TEXT_COL, LABEL_COL]].dropna().copy()
    df[TEXT_COL] = df[TEXT_COL].astype(str)

    labels = sorted(df[LABEL_COL].unique().tolist())
    label_set = set(labels)
    if label_set == {0, 2}:
        label_name = {0: "Negatif", 2: "Positif"}
    elif label_set == {0, 1}:
        label_name = {0: "Negatif", 1: "Positif"}
    else:
        label_name = {val: f"Label {val}" for val in labels}
    return df, label_name
def load_results():
    """Read the model results table from ``RESULTS_PATH``.

    Returns:
        The parsed ``DataFrame``, or ``None`` when the file does not exist or
        cannot be read as CSV.
    """
    csv_path = Path(RESULTS_PATH)
    results = None
    if csv_path.exists():
        try:
            results = pd.read_csv(csv_path)
        except Exception:
            # Best effort: an unreadable file is treated like a missing one.
            results = None
    return results
# Read the dataset and the model results once at import time; the analytics
# functions below read these module-level values.
DATA_DF, LABEL_NAME = load_dataset()
RESULTS_DF = load_results()
def predict(text):
    """Classify a piece of text with the loaded model.

    Args:
        text: Input string. ``None``, empty and whitespace-only values are
            treated as "no input".

    Returns:
        A dict mapping each label name from ``ID2LABEL`` to its softmax
        probability, or an empty dict when there is no input.
    """
    if not text or not text.strip():
        return {}

    inputs = TOKENIZER(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = MODEL(**inputs).logits

    # Drop only the batch dimension. A bare squeeze() would also collapse the
    # class dimension when the model has a single output, leaving a scalar
    # that cannot be iterated.
    probs = torch.softmax(logits, dim=-1).squeeze(0).tolist()
    return {ID2LABEL[i]: float(p) for i, p in enumerate(probs)}
def _tokenize(text: str):
    """Split text into lower-case word tokens for the frequency views.

    Every character other than ``a-z``, ``0-9`` and whitespace is replaced by
    a space before splitting. Tokens listed in ``STOPWORDS`` and tokens of two
    characters or fewer are dropped.

    Returns:
        The remaining tokens, in their original order.
    """
    normalized = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    kept = []
    for word in normalized.split():
        if len(word) <= 2 or word in STOPWORDS:
            continue
        kept.append(word)
    return kept
def _filter_df(label_choice: str):
    """Return the dataset rows that match a label display name.

    Args:
        label_choice: ``"Semua"`` for all rows, or one of the display names in
            ``LABEL_NAME``.

    Returns:
        ``None`` when no dataset is loaded; the whole dataset for ``"Semua"``
        or an unknown name; otherwise the rows whose ``LABEL_COL`` equals the
        matching label value.
    """
    if DATA_DF is None:
        return None
    if label_choice == "Semua":
        return DATA_DF

    # First label value whose display name equals the requested choice.
    matched = next(
        (value for value, shown in LABEL_NAME.items() if shown == label_choice),
        None,
    )
    if matched is None:
        return DATA_DF
    return DATA_DF[DATA_DF[LABEL_COL] == matched]
def build_distribution_plot():
    """Draw a bar chart of how many dataset rows carry each label.

    Returns:
        A matplotlib ``Figure``. When no dataset is loaded the figure only
        contains a "Dataset tidak ditemukan" message.
    """
    if DATA_DF is None:
        # Draw through the Axes object rather than pyplot's implicit
        # "current axes", which is shared global state.
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "Dataset tidak ditemukan", ha="center", va="center")
        # Unregister the figure from pyplot so repeated calls do not pile up
        # open figures; the Figure object itself stays usable for rendering.
        plt.close(fig)
        return fig

    counts = DATA_DF[LABEL_COL].value_counts().sort_index()
    labels = [LABEL_NAME.get(val, str(val)) for val in counts.index.tolist()]

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.bar(labels, counts.values, color=["#ef4444", "#22c55e"])
        ax.set_title("Distribusi Label")
        ax.set_ylabel("Jumlah")
        ax.grid(axis="y", linestyle="--", alpha=0.4)
        return fig
    finally:
        # Runs after the return value is fixed; releases pyplot's reference.
        plt.close(fig)
def build_top_words_plot(label_choice: str, top_n: int = 20):
    """Draw a horizontal bar chart of the most frequent tokens.

    Args:
        label_choice: Label display name used to filter the dataset
            (``"Semua"`` for all rows).
        top_n: Maximum number of tokens to show.

    Returns:
        A matplotlib ``Figure``. It shows "Data kosong" when there are no rows
        and "Token kosong" when the rows yield no tokens.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        if df is None or df.empty:
            ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
            return fig

        # Count row by row instead of first building one list of every token.
        counts = Counter()
        for text in df[TEXT_COL]:
            counts.update(_tokenize(text))
        if not counts:
            ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
            return fig

        common = counts.most_common(top_n)
        # Reverse so the most frequent word ends up at the top of the chart.
        words = [w for w, _ in common][::-1]
        freqs = [c for _, c in common][::-1]
        ax.barh(words, freqs, color="#3b82f6")
        ax.set_title(f"Top {top_n} Kata - {label_choice}")
        return fig
    finally:
        # Unregister the figure from pyplot so repeated calls do not pile up
        # open figures; the returned Figure stays usable for rendering.
        plt.close(fig)
def build_wordcloud(label_choice: str):
    """Render a word cloud of the tokens in the filtered dataset.

    Args:
        label_choice: Label display name used to filter the dataset
            (``"Semua"`` for all rows).

    Returns:
        A matplotlib ``Figure`` with the axes hidden. It shows "Data kosong"
        when there are no rows and "Token kosong" when the rows yield no
        tokens.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(7, 4.5))
    try:
        if df is None or df.empty:
            ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
            ax.axis("off")
            return fig

        tokens = []
        for text in df[TEXT_COL]:
            tokens.extend(_tokenize(text))
        if not tokens:
            ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
            ax.axis("off")
            return fig

        wc = WordCloud(
            width=900, height=500, background_color="white", collocations=False
        )
        wc.generate(" ".join(tokens))
        ax.imshow(wc, interpolation="bilinear")
        ax.axis("off")
        ax.set_title(f"Word Cloud - {label_choice}")
        return fig
    finally:
        # Unregister the figure from pyplot so repeated calls do not pile up
        # open figures; the returned Figure stays usable for rendering.
        plt.close(fig)
def build_model_comparison_plot():
    """Draw grouped bars comparing validation and test F1 per model.

    Reads ``RESULTS_DF`` and expects the columns ``model``, ``val_f1`` and
    ``test_f1``. Models are ordered by descending ``val_f1``.

    Returns:
        A matplotlib ``Figure``. It shows a message instead of bars when the
        results table is missing, empty, or lacks a required column.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        if RESULTS_DF is None or RESULTS_DF.empty:
            ax.text(0.5, 0.5, "results.csv tidak ditemukan", ha="center", va="center")
            return fig

        # A results file with an unexpected layout should yield a message,
        # not a KeyError that fails the whole analytics callback.
        required = ("model", "val_f1", "test_f1")
        if any(col not in RESULTS_DF.columns for col in required):
            ax.text(
                0.5, 0.5, "Kolom results.csv tidak lengkap", ha="center", va="center"
            )
            return fig

        data = RESULTS_DF.sort_values("val_f1", ascending=False)
        models = data["model"].tolist()
        val = data["val_f1"].tolist()
        test = data["test_f1"].tolist()

        x = range(len(models))
        bar_width = 0.4
        ax.bar(x, val, width=bar_width, label="Val F1", color="#22c55e")
        ax.bar(
            [i + bar_width for i in x],
            test,
            width=bar_width,
            label="Test F1",
            color="#3b82f6",
        )
        # Centre each tick between the two bars of its model.
        ax.set_xticks([i + bar_width / 2 for i in x])
        ax.set_xticklabels(models, rotation=45, ha="right")
        ax.set_ylim(0, 1.0)
        ax.set_title("Perbandingan Model (F1)")
        ax.legend()
        fig.tight_layout()
        return fig
    finally:
        # Unregister the figure from pyplot so repeated calls do not pile up
        # open figures; the returned Figure stays usable for rendering.
        plt.close(fig)
def analytics(label_choice):
    """Build every output of the analytics tab for the chosen label filter.

    Args:
        label_choice: Label display name selected in the dropdown.

    Returns:
        A 5-tuple: the distribution figure, the top-words figure, the word
        cloud figure, the model comparison figure, and a summary
        ``DataFrame`` with the total row count plus one row per label.
    """
    figures = (
        build_distribution_plot(),
        build_top_words_plot(label_choice),
        build_wordcloud(label_choice),
        build_model_comparison_plot(),
    )

    if DATA_DF is None:
        rows = [{"metric": "rows", "value": 0}]
    else:
        rows = [{"metric": "rows", "value": len(DATA_DF)}]
        per_label = DATA_DF[LABEL_COL].value_counts().to_dict()
        for label_val, count in per_label.items():
            shown = LABEL_NAME.get(label_val, label_val)
            rows.append({"metric": f"label_{shown}", "value": count})
    summary = pd.DataFrame(rows)

    return (*figures, summary)
# Gradio interface with two tabs: a prediction form wired to predict(), and
# an analytics dashboard wired to analytics().
with gr.Blocks(title="Klasifikasi Sentimen EV") as app:
    gr.Markdown("# Klasifikasi Sentimen EV")
    gr.Markdown("Prediksi sentimen + dashboard analitik (word cloud & distribusi label).")

    with gr.Tab("Prediksi"):
        inp = gr.Textbox(lines=4, label="Teks")
        out = gr.Label(num_top_classes=2, label="Prediksi")
        btn = gr.Button("Prediksi")
        btn.click(predict, inputs=inp, outputs=out)

    with gr.Tab("Analitik"):
        # "Semua" (all rows) is always offered; label names follow when a
        # dataset was loaded.
        label_options = ["Semua"]
        if LABEL_NAME:
            label_options = label_options + list(LABEL_NAME.values())
        label_choice = gr.Dropdown(label_options, value="Semua", label="Filter Label")
        dist_plot = gr.Plot(label="Distribusi Label")
        top_plot = gr.Plot(label="Top Kata")
        wc_plot = gr.Plot(label="Word Cloud")
        model_plot = gr.Plot(label="Perbandingan Model")
        summary_tbl = gr.Dataframe(label="Ringkasan Dataset", interactive=False)
        run_btn = gr.Button("Generate")

    # The dashboard is refreshed by the button, by a dropdown change and on
    # page load; all three use the same callback and the same outputs.
    analytics_outputs = [dist_plot, top_plot, wc_plot, model_plot, summary_tbl]
    for register in (run_btn.click, label_choice.change, app.load):
        register(analytics, inputs=label_choice, outputs=analytics_outputs)
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()