Spaces:

seedflora
/

ev-sentiment-dashboard

Running

App Files Files Community

ev-sentiment-dashboard / app.py

seedflora

Initial space deploy

7f0ea09 verified about 9 hours ago

raw

history blame contribute delete

8.21 kB

	import json
	import logging
	import os
	import re
	from collections import Counter
	from pathlib import Path

	import gradio as gr
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	from matplotlib.figure import Figure
	import pandas as pd
	import torch
	from transformers import AutoModelForSequenceClassification, AutoTokenizer
	from wordcloud import WordCloud

	# Model identifier handed to ``from_pretrained`` for both the tokenizer and the classifier.
	MODEL_ID = "seedflora/ev-sentiment"
	# Excel workbook read by load_dataset(); the analytics tab shows placeholders when it is absent.
	DATA_PATH = "data.xlsx"
	# Column whose text is tokenized for the top-words chart and the word cloud.
	TEXT_COL = "clean_text_formal"
	# Column holding the label value of each row.
	LABEL_COL = "label"
	# CSV read by load_results(); feeds the model-comparison chart.
	RESULTS_PATH = "results.csv"


	def load_label_map(model_dir: Path):
	    """Read ``label_map.json`` from *model_dir* when it is present.

	    Args:
	        model_dir: Directory expected to contain ``label_map.json``.

	    Returns:
	        The decoded JSON content, or ``None`` if the file does not exist.
	    """
	    candidate = model_dir / "label_map.json"
	    if not candidate.exists():
	        return None
	    with candidate.open("r", encoding="utf-8") as handle:
	        return json.load(handle)


	# Load the tokenizer and classifier once at import time; predict() reuses these
	# module-level objects on every call.
	TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
	MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
	# Put the model in evaluation mode; it is only used for inference here.
	MODEL.eval()

	# Class-index -> label-name mapping from the model config; predict() uses it to
	# name each probability.
	ID2LABEL = MODEL.config.id2label

	# Words excluded by _tokenize() before counting: common Indonesian function
	# words and pronouns, followed by a few English ones.
	STOPWORDS = {
	    # Indonesian
	    "yang", "dan", "di", "ke", "dari", "untuk", "pada", "ini", "itu", "atau",
	    "juga", "dengan", "karena", "bahwa", "sudah", "belum", "tidak", "bukan",
	    "jadi", "agar", "sebagai", "lebih", "paling", "seperti", "saja", "masih",
	    "bisa", "dapat", "akan", "kami", "kita", "saya", "anda", "mereka", "aku",
	    "dia", "kamu", "nya",
	    # English
	    "the", "a", "an", "is", "are", "of", "to", "in", "for", "on", "it",
	}


	def load_dataset():
	    """Load the labelled dataset used by the analytics tab.

	    Returns:
	        A ``(df, label_name)`` tuple. ``df`` contains only the text and label
	        columns, with incomplete rows dropped and the text coerced to ``str``.
	        ``label_name`` maps each raw label value to its display name.
	        ``(None, {})`` is returned when the workbook is missing, cannot be
	        read, or lacks one of the two required columns.
	    """
	    path = Path(DATA_PATH)
	    if not path.exists():
	        return None, {}
	    try:
	        frame = pd.read_excel(path)
	    except Exception:
	        # Best-effort load, mirroring load_results(): an unreadable workbook
	        # must not abort app start-up, because every consumer already copes
	        # with a missing dataset. The cause is logged rather than discarded.
	        logging.getLogger(__name__).exception("Failed to read %s", path)
	        return None, {}
	    if TEXT_COL not in frame.columns or LABEL_COL not in frame.columns:
	        return None, {}

	    frame = frame[[TEXT_COL, LABEL_COL]].dropna().astype({TEXT_COL: str})
	    labels = sorted(frame[LABEL_COL].unique().tolist())
	    observed = set(labels)
	    # Two known binary encodings get readable names; anything else falls back
	    # to a generic "Label <value>" caption.
	    if observed == {0, 2}:
	        label_name = {0: "Negatif", 2: "Positif"}
	    elif observed == {0, 1}:
	        label_name = {0: "Negatif", 1: "Positif"}
	    else:
	        label_name = {val: f"Label {val}" for val in labels}
	    return frame, label_name


	def load_results():
	    """Load the model-comparison table from ``RESULTS_PATH``.

	    Returns:
	        The parsed ``DataFrame``, or ``None`` when the file does not exist or
	        cannot be parsed.
	    """
	    path = Path(RESULTS_PATH)
	    if not path.exists():
	        return None
	    try:
	        return pd.read_csv(path)
	    except Exception:
	        # Still best-effort (the chart shows a placeholder), but record why
	        # the file was rejected instead of swallowing the error silently.
	        logging.getLogger(__name__).exception("Failed to read %s", path)
	        return None


	# Loaded once at import time. DATA_DF is None and LABEL_NAME is {} when the
	# dataset is unavailable; RESULTS_DF is None when results.csv is unavailable.
	DATA_DF, LABEL_NAME = load_dataset()
	RESULTS_DF = load_results()


	def predict(text):
	    """Classify *text* and return the probability of every label.

	    Args:
	        text: Raw input from the UI. ``None``, empty, and whitespace-only
	            values are treated as "no input".

	    Returns:
	        A dict mapping each label name from ``ID2LABEL`` to its softmax
	        probability, or ``{}`` when there is no input.
	    """
	    if not text or not text.strip():
	        return {}

	    inputs = TOKENIZER(text, return_tensors="pt", truncation=True)
	    with torch.no_grad():
	        logits = MODEL(**inputs).logits
	    # Select the single batch row explicitly. A bare ``squeeze()`` would also
	    # drop the class axis of a one-label model and leave a scalar, which then
	    # breaks the per-class iteration below.
	    probs = torch.softmax(logits, dim=-1)[0].tolist()

	    return {ID2LABEL[idx]: float(prob) for idx, prob in enumerate(probs)}


	def _tokenize(text: str):
	    """Split *text* into lower-case content tokens.

	    Every character other than ``a-z``, ``0-9`` and whitespace is replaced by
	    a space. Tokens of two characters or fewer and tokens listed in
	    ``STOPWORDS`` are dropped.
	    """
	    normalized = re.sub(r"[^a-z0-9\s]", " ", text.lower())
	    kept = []
	    for word in normalized.split():
	        if len(word) <= 2 or word in STOPWORDS:
	            continue
	        kept.append(word)
	    return kept


	def _filter_df(label_choice: str):
	    """Return the dataset rows matching the selected label display name.

	    Returns ``None`` when no dataset is loaded. The whole dataset is returned
	    for ``"Semua"`` and for any name that is not found in ``LABEL_NAME``.
	    """
	    if DATA_DF is None:
	        return None
	    if label_choice == "Semua":
	        return DATA_DF
	    # Reverse lookup: first raw label value whose display name matches.
	    target = next(
	        (raw for raw, shown in LABEL_NAME.items() if shown == label_choice),
	        None,
	    )
	    if target is None:
	        return DATA_DF
	    return DATA_DF[DATA_DF[LABEL_COL] == target]


	def build_distribution_plot():
	    """Build a bar chart of the number of rows per label.

	    Returns:
	        A Matplotlib ``Figure``. When no dataset is loaded the figure only
	        carries a "Dataset tidak ditemukan" message.
	    """
	    # Figures are created directly rather than through pyplot: pyplot keeps
	    # every figure it creates in a global registry, so a long-running app
	    # would retain one more figure per callback.
	    if DATA_DF is None:
	        fig = Figure()
	        ax = fig.subplots()
	        ax.text(0.5, 0.5, "Dataset tidak ditemukan", ha="center", va="center")
	        return fig
	    counts = DATA_DF[LABEL_COL].value_counts().sort_index()
	    labels = [LABEL_NAME.get(val, str(val)) for val in counts.index.tolist()]
	    fig = Figure(figsize=(6, 4))
	    ax = fig.subplots()
	    ax.bar(labels, counts.values, color=["#ef4444", "#22c55e"])
	    ax.set_title("Distribusi Label")
	    ax.set_ylabel("Jumlah")
	    ax.grid(axis="y", linestyle="--", alpha=0.4)
	    return fig


	def build_top_words_plot(label_choice: str, top_n: int = 20):
	    """Build a horizontal bar chart of the most frequent tokens.

	    Args:
	        label_choice: Label display name used to filter the dataset
	            (``"Semua"`` keeps every row).
	        top_n: Maximum number of tokens to show.

	    Returns:
	        A Matplotlib ``Figure``; it only carries a message when there is no
	        data or no token survives tokenization.
	    """
	    df = _filter_df(label_choice)
	    # Created without pyplot so the figure is not retained in pyplot's global
	    # registry after the callback returns.
	    fig = Figure(figsize=(6, 5))
	    ax = fig.subplots()
	    if df is None or df.empty:
	        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
	        return fig
	    # Count incrementally instead of materializing one list of all tokens.
	    frequencies = Counter()
	    for text in df[TEXT_COL].tolist():
	        frequencies.update(_tokenize(text))
	    if not frequencies:
	        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
	        return fig
	    # Reverse so the most frequent word ends up at the top of the barh chart.
	    ranked = frequencies.most_common(top_n)[::-1]
	    words = [word for word, _ in ranked]
	    freqs = [count for _, count in ranked]
	    ax.barh(words, freqs, color="#3b82f6")
	    ax.set_title(f"Top {top_n} Kata - {label_choice}")
	    return fig


	def build_wordcloud(label_choice: str):
	    """Render a word cloud of the tokens found under the selected label.

	    Args:
	        label_choice: Label display name used to filter the dataset
	            (``"Semua"`` keeps every row).

	    Returns:
	        A Matplotlib ``Figure`` with the axes hidden; it only carries a
	        message when there is no data or no token survives tokenization.
	    """
	    df = _filter_df(label_choice)
	    # Created without pyplot so the figure is not retained in pyplot's global
	    # registry after the callback returns.
	    fig = Figure(figsize=(7, 4.5))
	    ax = fig.subplots()
	    ax.axis("off")
	    if df is None or df.empty:
	        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
	        return fig
	    tokens = []
	    for text in df[TEXT_COL].tolist():
	        tokens.extend(_tokenize(text))
	    if not tokens:
	        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
	        return fig
	    # collocations=False keeps the cloud to single tokens (no bigrams).
	    cloud = WordCloud(width=900, height=500, background_color="white", collocations=False)
	    cloud.generate(" ".join(tokens))
	    ax.imshow(cloud, interpolation="bilinear")
	    ax.set_title(f"Word Cloud - {label_choice}")
	    return fig


	def build_model_comparison_plot():
	    """Build a grouped bar chart of validation and test F1 per model.

	    Reads the ``model``, ``val_f1`` and ``test_f1`` columns of ``RESULTS_DF``
	    and orders models by descending validation F1.

	    Returns:
	        A Matplotlib ``Figure``; it only carries a message when the results
	        table is missing, empty, or lacks one of the required columns.
	    """
	    # Created without pyplot so the figure is not retained in pyplot's global
	    # registry after the callback returns.
	    fig = Figure(figsize=(6, 4))
	    ax = fig.subplots()
	    if RESULTS_DF is None or RESULTS_DF.empty:
	        ax.text(0.5, 0.5, "results.csv tidak ditemukan", ha="center", va="center")
	        return fig
	    # Guard against a results file with an unexpected schema; indexing a
	    # missing column would otherwise raise KeyError inside the UI callback.
	    if not {"model", "val_f1", "test_f1"}.issubset(RESULTS_DF.columns):
	        ax.text(0.5, 0.5, "Kolom results.csv tidak lengkap", ha="center", va="center")
	        return fig
	    # sort_values returns a new frame, so RESULTS_DF itself is left untouched.
	    data = RESULTS_DF.sort_values("val_f1", ascending=False)
	    models = data["model"].tolist()
	    val = data["val_f1"].tolist()
	    test = data["test_f1"].tolist()
	    positions = list(range(len(models)))
	    bar_width = 0.4
	    ax.bar(positions, val, width=bar_width, label="Val F1", color="#22c55e")
	    ax.bar([p + bar_width for p in positions], test, width=bar_width, label="Test F1", color="#3b82f6")
	    # Centre each tick between the two bars of its model.
	    ax.set_xticks([p + bar_width / 2 for p in positions])
	    ax.set_xticklabels(models, rotation=45, ha="right")
	    ax.set_ylim(0, 1.0)
	    ax.set_title("Perbandingan Model (F1)")
	    ax.legend()
	    fig.tight_layout()
	    return fig


	def analytics(label_choice):
	    """Produce every output of the analytics tab for *label_choice*.

	    Returns:
	        A 5-tuple: label-distribution figure, top-words figure, word-cloud
	        figure, model-comparison figure, and a summary ``DataFrame`` with
	        ``metric``/``value`` columns (total rows plus one row per label).
	    """
	    dist_fig = build_distribution_plot()
	    top_fig = build_top_words_plot(label_choice)
	    wc_fig = build_wordcloud(label_choice)
	    model_fig = build_model_comparison_plot()

	    if DATA_DF is None:
	        records = [{"metric": "rows", "value": 0}]
	    else:
	        records = [{"metric": "rows", "value": len(DATA_DF)}]
	        per_label = DATA_DF[LABEL_COL].value_counts().to_dict()
	        for raw, count in per_label.items():
	            records.append(
	                {"metric": f"label_{LABEL_NAME.get(raw, raw)}", "value": count}
	            )
	    summary = pd.DataFrame(records)
	    return dist_fig, top_fig, wc_fig, model_fig, summary


	# UI layout: one tab for single-text prediction, one for dataset analytics.
	with gr.Blocks(title="Klasifikasi Sentimen EV") as app:
	    gr.Markdown("# Klasifikasi Sentimen EV")
	    gr.Markdown("Prediksi sentimen + dashboard analitik (word cloud & distribusi label).")

	    with gr.Tab("Prediksi"):
	        inp = gr.Textbox(lines=4, label="Teks")
	        out = gr.Label(num_top_classes=2, label="Prediksi")
	        btn = gr.Button("Prediksi")
	        btn.click(predict, inputs=inp, outputs=out)

	    with gr.Tab("Analitik"):
	        # "Semua" (all rows) first, then every known label display name.
	        label_options = ["Semua", *LABEL_NAME.values()]
	        label_choice = gr.Dropdown(label_options, value="Semua", label="Filter Label")
	        dist_plot = gr.Plot(label="Distribusi Label")
	        top_plot = gr.Plot(label="Top Kata")
	        wc_plot = gr.Plot(label="Word Cloud")
	        model_plot = gr.Plot(label="Perbandingan Model")
	        summary_tbl = gr.Dataframe(label="Ringkasan Dataset", interactive=False)
	        run_btn = gr.Button("Generate")

	    # The same refresh runs on button click, on dropdown change, and on page load.
	    analytics_outputs = [dist_plot, top_plot, wc_plot, model_plot, summary_tbl]
	    for register in (run_btn.click, label_choice.change, app.load):
	        register(analytics, inputs=label_choice, outputs=analytics_outputs)


	if __name__ == "__main__":
	    app.launch()