| import json | |
| import os | |
| import re | |
| from collections import Counter | |
| from pathlib import Path | |
| import gradio as gr | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| import torch | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from wordcloud import WordCloud | |
# Hugging Face Hub id of the fine-tuned EV-sentiment classifier.
MODEL_ID = "seedflora/ev-sentiment"
# Local Excel dataset and the two columns read from it.
DATA_PATH = "data.xlsx"
TEXT_COL = "clean_text_formal"  # preprocessed review text column
LABEL_COL = "label"             # integer sentiment label column
# CSV of per-model evaluation metrics (expects model/val_f1/test_f1 columns).
RESULTS_PATH = "results.csv"
| def load_label_map(model_dir: Path): | |
| label_map_path = model_dir / "label_map.json" | |
| if label_map_path.exists(): | |
| with label_map_path.open("r", encoding="utf-8") as f: | |
| return json.load(f) | |
| return None | |
# Load tokenizer and model once at import time; inference-only (eval mode).
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
MODEL.eval()
# Class-index -> display-label mapping taken from the model config.
ID2LABEL = MODEL.config.id2label
# Mixed Indonesian + English stopwords removed before word-frequency and
# word-cloud analysis (see _tokenize).
STOPWORDS = {
    "yang",
    "dan",
    "di",
    "ke",
    "dari",
    "untuk",
    "pada",
    "ini",
    "itu",
    "atau",
    "juga",
    "dengan",
    "karena",
    "bahwa",
    "sudah",
    "belum",
    "tidak",
    "bukan",
    "jadi",
    "agar",
    "sebagai",
    "lebih",
    "paling",
    "seperti",
    "saja",
    "masih",
    "bisa",
    "dapat",
    "akan",
    "kami",
    "kita",
    "saya",
    "anda",
    "mereka",
    "aku",
    "dia",
    "kamu",
    "nya",
    "the",
    "a",
    "an",
    "is",
    "are",
    "of",
    "to",
    "in",
    "for",
    "on",
    "it",
}
def load_dataset():
    """Load the Excel dataset and derive display names for its labels.

    Returns a ``(DataFrame, {label_value: display_name})`` pair, or
    ``(None, {})`` when the file or the required columns are missing.
    """
    source = Path(DATA_PATH)
    if not source.exists():
        return None, {}
    df = pd.read_excel(source)
    if {TEXT_COL, LABEL_COL} - set(df.columns):
        return None, {}
    df = df[[TEXT_COL, LABEL_COL]].dropna()
    df[TEXT_COL] = df[TEXT_COL].astype(str)
    values = sorted(df[LABEL_COL].unique().tolist())
    # Map known binary encodings to Indonesian names; otherwise fall back
    # to a generic "Label <value>" per distinct value.
    value_set = set(values)
    if value_set == {0, 2}:
        names = {0: "Negatif", 2: "Positif"}
    elif value_set == {0, 1}:
        names = {0: "Negatif", 1: "Positif"}
    else:
        names = {v: f"Label {v}" for v in values}
    return df, names
def load_results():
    """Load the model-comparison metrics CSV; ``None`` on any failure."""
    csv_path = Path(RESULTS_PATH)
    if not csv_path.exists():
        return None
    try:
        return pd.read_csv(csv_path)
    except Exception:
        # Best-effort load: a malformed CSV just disables the comparison plot.
        return None
# Module-level singletons used by all analytics functions below:
# the dataset (plus its label display names) and the metrics table.
DATA_DF, LABEL_NAME = load_dataset()
RESULTS_DF = load_results()
def predict(text):
    """Classify *text* and return ``{label_name: probability}`` scores.

    Empty or whitespace-only input yields an empty dict (no prediction).
    """
    if not text or not text.strip():
        return {}
    encoded = TOKENIZER(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = MODEL(**encoded)
    # Single example -> squeeze the batch dim, then expose plain floats.
    probabilities = torch.softmax(output.logits, dim=-1).squeeze().tolist()
    return {ID2LABEL[idx]: float(p) for idx, p in enumerate(probabilities)}
def _tokenize(text: str):
    """Lowercase *text*, strip non-alphanumerics, drop stopwords and tokens
    of length <= 2; return the remaining tokens as a list."""
    normalized = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    return [
        token
        for token in normalized.split()
        if len(token) > 2 and token not in STOPWORDS
    ]
def _filter_df(label_choice: str):
    """Return ``DATA_DF`` restricted to *label_choice*.

    "Semua" (or an unknown display name) returns the unfiltered frame;
    ``None`` when no dataset was loaded.
    """
    if DATA_DF is None:
        return None
    if label_choice == "Semua":
        return DATA_DF
    # Reverse-lookup the raw label value from its display name.
    matches = [val for val, name in LABEL_NAME.items() if name == label_choice]
    if not matches:
        return DATA_DF
    return DATA_DF[DATA_DF[LABEL_COL] == matches[0]]
def build_distribution_plot():
    """Bar chart of label counts; placeholder figure when no dataset."""
    if DATA_DF is None:
        fig = plt.figure()
        plt.text(0.5, 0.5, "Dataset tidak ditemukan", ha="center", va="center")
        return fig
    tallies = DATA_DF[LABEL_COL].value_counts().sort_index()
    bar_names = [LABEL_NAME.get(v, str(v)) for v in tallies.index.tolist()]
    fig, ax = plt.subplots(figsize=(6, 4))
    # Red for the first (negative) label, green for the second; matplotlib
    # cycles this list if there happen to be more bars.
    ax.bar(bar_names, tallies.values, color=["#ef4444", "#22c55e"])
    ax.set_title("Distribusi Label")
    ax.set_ylabel("Jumlah")
    ax.grid(axis="y", linestyle="--", alpha=0.4)
    return fig
def build_top_words_plot(label_choice: str, top_n: int = 20):
    """Horizontal bar chart of the *top_n* most frequent tokens for the
    selected label filter; placeholder text when there is no data."""
    subset = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(6, 5))
    if subset is None or subset.empty:
        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
        return fig
    counter = Counter()
    for row_text in subset[TEXT_COL].tolist():
        counter.update(_tokenize(row_text))
    if not counter:
        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
        return fig
    ranked = counter.most_common(top_n)
    ranked.reverse()  # barh draws bottom-up; reversing puts rank 1 on top
    bar_words = [word for word, _ in ranked]
    bar_freqs = [count for _, count in ranked]
    ax.barh(bar_words, bar_freqs, color="#3b82f6")
    ax.set_title(f"Top {top_n} Kata - {label_choice}")
    return fig
def build_wordcloud(label_choice: str):
    """Word cloud of tokens for the selected label filter; placeholder text
    when the filtered data or the token stream is empty."""
    subset = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(7, 4.5))
    if subset is None or subset.empty:
        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
        ax.axis("off")
        return fig
    corpus = " ".join(
        token
        for row_text in subset[TEXT_COL].tolist()
        for token in _tokenize(row_text)
    )
    if not corpus:
        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
        ax.axis("off")
        return fig
    cloud = WordCloud(width=900, height=500, background_color="white", collocations=False)
    cloud.generate(corpus)
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f"Word Cloud - {label_choice}")
    return fig
def build_model_comparison_plot():
    """Grouped bar chart of validation/test F1 per model from results.csv.

    Shows a placeholder figure when results.csv is absent, empty, or lacks
    the expected ``model``/``val_f1``/``test_f1`` columns.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    if RESULTS_DF is None or RESULTS_DF.empty:
        ax.text(0.5, 0.5, "results.csv tidak ditemukan", ha="center", va="center")
        return fig
    # Robustness fix: a parseable CSV with an unexpected schema previously
    # raised KeyError below and crashed the dashboard; fail soft instead.
    if not {"model", "val_f1", "test_f1"}.issubset(RESULTS_DF.columns):
        ax.text(0.5, 0.5, "results.csv tidak valid", ha="center", va="center")
        return fig
    # sort_values returns a new frame, so no defensive copy is needed.
    data = RESULTS_DF.sort_values("val_f1", ascending=False)
    models = data["model"].tolist()
    val = data["val_f1"].tolist()
    test = data["test_f1"].tolist()
    x = range(len(models))
    # Two bars per model, offset by the bar width.
    ax.bar(x, val, width=0.4, label="Val F1", color="#22c55e")
    ax.bar([i + 0.4 for i in x], test, width=0.4, label="Test F1", color="#3b82f6")
    ax.set_xticks([i + 0.2 for i in x])  # center ticks between the bar pair
    ax.set_xticklabels(models, rotation=45, ha="right")
    ax.set_ylim(0, 1.0)  # F1 is bounded in [0, 1]
    ax.set_title("Perbandingan Model (F1)")
    ax.legend()
    fig.tight_layout()
    return fig
def analytics(label_choice):
    """Build all dashboard outputs for the chosen label filter.

    Returns (distribution fig, top-words fig, word-cloud fig,
    model-comparison fig, summary DataFrame) in the order the Gradio
    outputs are wired.
    """
    figures = (
        build_distribution_plot(),
        build_top_words_plot(label_choice),
        build_wordcloud(label_choice),
        build_model_comparison_plot(),
    )
    if DATA_DF is None:
        summary = pd.DataFrame([{"metric": "rows", "value": 0}])
    else:
        rows = [{"metric": "rows", "value": len(DATA_DF)}]
        for raw_label, count in DATA_DF[LABEL_COL].value_counts().to_dict().items():
            rows.append(
                {"metric": f"label_{LABEL_NAME.get(raw_label, raw_label)}", "value": count}
            )
        summary = pd.DataFrame(rows)
    return (*figures, summary)
# --- Gradio UI wiring --------------------------------------------------------
with gr.Blocks(title="Klasifikasi Sentimen EV") as app:
    gr.Markdown("# Klasifikasi Sentimen EV")
    gr.Markdown("Prediksi sentimen + dashboard analitik (word cloud & distribusi label).")
    # Tab 1: single-text sentiment prediction.
    with gr.Tab("Prediksi"):
        inp = gr.Textbox(lines=4, label="Teks")
        out = gr.Label(num_top_classes=2, label="Prediksi")
        btn = gr.Button("Prediksi")
        btn.click(predict, inputs=inp, outputs=out)
    # Tab 2: dataset analytics dashboard, filterable by label display name.
    with gr.Tab("Analitik"):
        label_options = ["Semua"] + list(LABEL_NAME.values()) if LABEL_NAME else ["Semua"]
        label_choice = gr.Dropdown(label_options, value="Semua", label="Filter Label")
        dist_plot = gr.Plot(label="Distribusi Label")
        top_plot = gr.Plot(label="Top Kata")
        wc_plot = gr.Plot(label="Word Cloud")
        model_plot = gr.Plot(label="Perbandingan Model")
        summary_tbl = gr.Dataframe(label="Ringkasan Dataset", interactive=False)
        run_btn = gr.Button("Generate")
        # All three triggers (button, dropdown change, initial page load)
        # refresh the same set of outputs via analytics().
        run_btn.click(
            analytics,
            inputs=label_choice,
            outputs=[dist_plot, top_plot, wc_plot, model_plot, summary_tbl],
        )
        label_choice.change(
            analytics,
            inputs=label_choice,
            outputs=[dist_plot, top_plot, wc_plot, model_plot, summary_tbl],
        )
        app.load(
            analytics,
            inputs=label_choice,
            outputs=[dist_plot, top_plot, wc_plot, model_plot, summary_tbl],
        )
if __name__ == "__main__":
    app.launch()