"""Gradio app for EV sentiment classification with an analytics dashboard.

The module loads a sequence-classification model and (optionally) a labelled
dataset and a model-comparison CSV at import time, then exposes two tabs:
single-text prediction and dataset analytics (label distribution, top words,
word cloud, model F1 comparison).
"""

import json
import os
import re
from collections import Counter
from pathlib import Path

import gradio as gr
import matplotlib

# Select the non-interactive backend before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from wordcloud import WordCloud

MODEL_ID = "seedflora/ev-sentiment"
DATA_PATH = "data.xlsx"
TEXT_COL = "clean_text_formal"
LABEL_COL = "label"
RESULTS_PATH = "results.csv"

# Columns build_model_comparison_plot() reads from the results CSV.
_RESULT_COLUMNS = ("model", "val_f1", "test_f1")

# Everything except lowercase ASCII letters, digits and whitespace.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9\s]")


def load_label_map(model_dir: Path):
    """Return the parsed ``label_map.json`` inside *model_dir*, or ``None``.

    ``None`` is returned when the file does not exist.
    """
    label_map_path = model_dir / "label_map.json"
    if label_map_path.exists():
        with label_map_path.open("r", encoding="utf-8") as f:
            return json.load(f)
    return None


TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
MODEL.eval()
ID2LABEL = MODEL.config.id2label

STOPWORDS = {
    "yang",
    "dan",
    "di",
    "ke",
    "dari",
    "untuk",
    "pada",
    "ini",
    "itu",
    "atau",
    "juga",
    "dengan",
    "karena",
    "bahwa",
    "sudah",
    "belum",
    "tidak",
    "bukan",
    "jadi",
    "agar",
    "sebagai",
    "lebih",
    "paling",
    "seperti",
    "saja",
    "masih",
    "bisa",
    "dapat",
    "akan",
    "kami",
    "kita",
    "saya",
    "anda",
    "mereka",
    "aku",
    "dia",
    "kamu",
    "nya",
    "the",
    "a",
    "an",
    "is",
    "are",
    "of",
    "to",
    "in",
    "for",
    "on",
    "it",
}


def load_dataset():
    """Load the labelled dataset from ``DATA_PATH``.

    Returns:
        A ``(df, label_name)`` tuple. ``df`` holds only the text and label
        columns with missing rows dropped; ``label_name`` maps each label
        value to its display name. ``(None, {})`` is returned when the file
        or either required column is missing.
    """
    path = Path(DATA_PATH)
    if not path.exists():
        return None, {}
    df = pd.read_excel(path)
    if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
        return None, {}
    df = df[[TEXT_COL, LABEL_COL]].dropna()
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    labels = sorted(df[LABEL_COL].unique().tolist())
    # Two known binary encodings get readable names; anything else gets a
    # generic "Label <value>" name.
    if set(labels) == {0, 2}:
        label_name = {0: "Negatif", 2: "Positif"}
    elif set(labels) == {0, 1}:
        label_name = {0: "Negatif", 1: "Positif"}
    else:
        label_name = {val: f"Label {val}" for val in labels}
    return df, label_name


def load_results():
    """Load the model-comparison table from ``RESULTS_PATH``.

    Returns ``None`` when the file is missing or cannot be parsed.
    """
    path = Path(RESULTS_PATH)
    if not path.exists():
        return None
    try:
        return pd.read_csv(path)
    except Exception:
        # Best effort: the comparison chart is optional, so an unreadable
        # file is treated the same as a missing one.
        return None


DATA_DF, LABEL_NAME = load_dataset()
RESULTS_DF = load_results()


def predict(text):
    """Classify *text* and return a ``{label: probability}`` mapping.

    An empty or whitespace-only input yields an empty dict.
    """
    if not text or not text.strip():
        return {}
    inputs = TOKENIZER(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = MODEL(**inputs).logits
    # Take the single batch row explicitly. A bare squeeze() would also drop
    # the class dimension when the model has one output, leaving a scalar.
    probs = torch.softmax(logits, dim=-1)[0].tolist()
    return {ID2LABEL[i]: float(p) for i, p in enumerate(probs)}


def _tokenize(text: str):
    """Lowercase *text*, strip non-alphanumerics and return content tokens.

    Tokens that are stopwords or shorter than three characters are dropped.
    """
    text = _NON_ALNUM_RE.sub(" ", text.lower())
    return [t for t in text.split() if t not in STOPWORDS and len(t) > 2]


def _collect_tokens(df):
    """Return the tokens of every text in *df*, in row order."""
    tokens = []
    for text in df[TEXT_COL].tolist():
        tokens.extend(_tokenize(text))
    return tokens


def _filter_df(label_choice: str):
    """Return the dataset rows whose label display name is *label_choice*.

    ``"Semua"`` or an unknown name returns the full dataset; ``None`` is
    returned when no dataset was loaded.
    """
    if DATA_DF is None:
        return None
    if label_choice == "Semua":
        return DATA_DF
    label_val = None
    for val, name in LABEL_NAME.items():
        if name == label_choice:
            label_val = val
            break
    if label_val is None:
        return DATA_DF
    return DATA_DF[DATA_DF[LABEL_COL] == label_val]


def _detach(fig):
    """Unregister *fig* from pyplot and return it.

    pyplot keeps a reference to every figure it creates until that figure is
    closed. The handlers below run on every UI event, so without this each
    refresh would leave its figures alive for the life of the process.
    """
    plt.close(fig)
    return fig


def build_distribution_plot():
    """Return a bar chart of row counts per label (or a placeholder)."""
    if DATA_DF is None:
        # Draw on an explicit axes rather than pyplot's "current figure",
        # which is shared global state.
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "Dataset tidak ditemukan", ha="center", va="center")
        return _detach(fig)
    counts = DATA_DF[LABEL_COL].value_counts().sort_index()
    labels = [LABEL_NAME.get(val, str(val)) for val in counts.index.tolist()]
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(labels, counts.values, color=["#ef4444", "#22c55e"])
    ax.set_title("Distribusi Label")
    ax.set_ylabel("Jumlah")
    ax.grid(axis="y", linestyle="--", alpha=0.4)
    return _detach(fig)


def build_top_words_plot(label_choice: str, top_n: int = 20):
    """Return a horizontal bar chart of the *top_n* most frequent tokens.

    Only rows matching *label_choice* are considered. A placeholder message
    is drawn when there are no rows or no tokens.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(6, 5))
    if df is None or df.empty:
        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
        return _detach(fig)
    tokens = _collect_tokens(df)
    if not tokens:
        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
        return _detach(fig)
    common = Counter(tokens).most_common(top_n)
    # Reverse so the most frequent word ends up at the top of the chart.
    words = [w for w, _ in common][::-1]
    freqs = [c for _, c in common][::-1]
    ax.barh(words, freqs, color="#3b82f6")
    ax.set_title(f"Top {top_n} Kata - {label_choice}")
    return _detach(fig)


def build_wordcloud(label_choice: str):
    """Return a word-cloud figure for the rows matching *label_choice*.

    A placeholder message is drawn when there are no rows or no tokens.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(7, 4.5))
    ax.axis("off")
    if df is None or df.empty:
        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
        return _detach(fig)
    tokens = _collect_tokens(df)
    if not tokens:
        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
        return _detach(fig)
    wc = WordCloud(
        width=900,
        height=500,
        background_color="white",
        collocations=False,
    )
    wc.generate(" ".join(tokens))
    ax.imshow(wc, interpolation="bilinear")
    # imshow() turns the axes frame back on, so hide it again.
    ax.axis("off")
    ax.set_title(f"Word Cloud - {label_choice}")
    return _detach(fig)


def build_model_comparison_plot():
    """Return a grouped bar chart of validation vs. test F1 per model.

    A placeholder message is drawn when the results table is missing, empty,
    or lacks any of the ``model`` / ``val_f1`` / ``test_f1`` columns.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    if RESULTS_DF is None or RESULTS_DF.empty:
        ax.text(0.5, 0.5, "results.csv tidak ditemukan", ha="center", va="center")
        return _detach(fig)
    missing = [col for col in _RESULT_COLUMNS if col not in RESULTS_DF.columns]
    if missing:
        # A malformed CSV should degrade to a message, not raise KeyError on
        # every dashboard refresh.
        ax.text(
            0.5,
            0.5,
            "Kolom results.csv tidak lengkap: " + ", ".join(missing),
            ha="center",
            va="center",
        )
        return _detach(fig)
    data = RESULTS_DF.sort_values("val_f1", ascending=False)
    models = data["model"].tolist()
    val = data["val_f1"].tolist()
    test = data["test_f1"].tolist()
    x = range(len(models))
    ax.bar(x, val, width=0.4, label="Val F1", color="#22c55e")
    ax.bar([i + 0.4 for i in x], test, width=0.4, label="Test F1", color="#3b82f6")
    # Centre each tick between the paired bars.
    ax.set_xticks([i + 0.2 for i in x])
    ax.set_xticklabels(models, rotation=45, ha="right")
    ax.set_ylim(0, 1.0)
    ax.set_title("Perbandingan Model (F1)")
    ax.legend()
    fig.tight_layout()
    return _detach(fig)


def analytics(label_choice):
    """Build every dashboard output for the selected label filter.

    Returns:
        ``(dist_fig, top_fig, wc_fig, model_fig, summary)`` where the first
        four are matplotlib figures and ``summary`` is a DataFrame with the
        total row count followed by one row per label count.
    """
    dist_fig = build_distribution_plot()
    top_fig = build_top_words_plot(label_choice)
    wc_fig = build_wordcloud(label_choice)
    model_fig = build_model_comparison_plot()
    if DATA_DF is None:
        summary = pd.DataFrame([{"metric": "rows", "value": 0}])
    else:
        label_counts = DATA_DF[LABEL_COL].value_counts().to_dict()
        summary = pd.DataFrame(
            [{"metric": "rows", "value": len(DATA_DF)}]
            + [
                {"metric": f"label_{LABEL_NAME.get(k, k)}", "value": v}
                for k, v in label_counts.items()
            ]
        )
    return dist_fig, top_fig, wc_fig, model_fig, summary


with gr.Blocks(title="Klasifikasi Sentimen EV") as app:
    gr.Markdown("# Klasifikasi Sentimen EV")
    gr.Markdown("Prediksi sentimen + dashboard analitik (word cloud & distribusi label).")

    with gr.Tab("Prediksi"):
        inp = gr.Textbox(lines=4, label="Teks")
        out = gr.Label(num_top_classes=2, label="Prediksi")
        btn = gr.Button("Prediksi")
        btn.click(predict, inputs=inp, outputs=out)

    with gr.Tab("Analitik"):
        label_options = ["Semua"] + list(LABEL_NAME.values()) if LABEL_NAME else ["Semua"]
        label_choice = gr.Dropdown(label_options, value="Semua", label="Filter Label")
        dist_plot = gr.Plot(label="Distribusi Label")
        top_plot = gr.Plot(label="Top Kata")
        wc_plot = gr.Plot(label="Word Cloud")
        model_plot = gr.Plot(label="Perbandingan Model")
        summary_tbl = gr.Dataframe(label="Ringkasan Dataset", interactive=False)
        run_btn = gr.Button("Generate")

        # The same handler and outputs serve the button, the dropdown and
        # the initial page load.
        analytics_outputs = [dist_plot, top_plot, wc_plot, model_plot, summary_tbl]
        run_btn.click(analytics, inputs=label_choice, outputs=analytics_outputs)
        label_choice.change(analytics, inputs=label_choice, outputs=analytics_outputs)
        app.load(analytics, inputs=label_choice, outputs=analytics_outputs)

if __name__ == "__main__":
    app.launch()