seedflora committed on
Commit
7f0ea09
·
verified ·
1 Parent(s): 49069ef

Initial space deploy

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +8 -6
  3. app.py +293 -0
  4. data.xlsx +3 -0
  5. requirements.txt +7 -0
  6. results.csv +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data.xlsx filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,14 @@
1
- ---
2
- title: Ev Sentiment Dashboard
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
+ ---
2
+ title: EV Sentiment Dashboard
3
+ emoji: 🚗
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # EV Sentiment Dashboard
13
+
14
+ Klasifikasi sentimen + dashboard analitik (word cloud, distribusi label, dan perbandingan model).
app.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import Counter
5
+ from pathlib import Path
6
+
7
+ import gradio as gr
8
+ import matplotlib
9
+ matplotlib.use("Agg")
10
+ import matplotlib.pyplot as plt
11
+ import pandas as pd
12
+ import torch
13
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
14
+ from wordcloud import WordCloud
15
+
16
# Hugging Face Hub repository id handed to the tokenizer/model loaders.
MODEL_ID = "seedflora/ev-sentiment"
# Excel workbook read by load_dataset() for the analytics plots.
DATA_PATH = "data.xlsx"
# Column of the workbook holding the text that gets tokenized for the plots.
TEXT_COL = "clean_text_formal"
# Column of the workbook holding each row's class label.
LABEL_COL = "label"
# CSV of per-model metrics read by load_results().
RESULTS_PATH = "results.csv"
21
+
22
+
23
def load_label_map(model_dir: Path):
    """Load the optional ``label_map.json`` stored in a model directory.

    Args:
        model_dir: Directory expected to contain ``label_map.json``. A plain
            string or any ``os.PathLike`` is accepted in addition to ``Path``.

    Returns:
        The decoded JSON content, or ``None`` when the file is absent.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    label_map_path = Path(model_dir) / "label_map.json"
    # is_file() rather than exists(): a directory with that name must not be
    # opened as if it were the JSON file.
    if not label_map_path.is_file():
        return None
    with label_map_path.open("r", encoding="utf-8") as f:
        return json.load(f)
29
+
30
+
31
# Loaded once at import time so every predict() call reuses the same objects.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
MODEL = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
# The app only runs inference, so switch the model to evaluation mode.
MODEL.eval()

# Class-index -> label-name mapping taken from the model config; predict()
# uses it to name each probability.
ID2LABEL = MODEL.config.id2label
36
+
37
# Words dropped by _tokenize() before counting: common Indonesian function
# words and pronouns, followed by a handful of English ones.
STOPWORDS = {
    "yang",
    "dan",
    "di",
    "ke",
    "dari",
    "untuk",
    "pada",
    "ini",
    "itu",
    "atau",
    "juga",
    "dengan",
    "karena",
    "bahwa",
    "sudah",
    "belum",
    "tidak",
    "bukan",
    "jadi",
    "agar",
    "sebagai",
    "lebih",
    "paling",
    "seperti",
    "saja",
    "masih",
    "bisa",
    "dapat",
    "akan",
    "kami",
    "kita",
    "saya",
    "anda",
    "mereka",
    "aku",
    "dia",
    "kamu",
    "nya",
    "the",
    "a",
    "an",
    "is",
    "are",
    "of",
    "to",
    "in",
    "for",
    "on",
    "it",
}
88
+
89
+
90
def load_dataset():
    """Load the labelled dataset used by the analytics plots.

    Returns:
        A ``(df, label_name)`` tuple. ``df`` holds only the ``TEXT_COL`` and
        ``LABEL_COL`` columns with incomplete rows dropped and the text cast
        to ``str``. ``label_name`` maps each raw label value to its display
        name. ``(None, {})`` is returned when the workbook is missing, cannot
        be read, or lacks one of the two columns.
    """
    path = Path(DATA_PATH)
    if not path.exists():
        return None, {}
    try:
        df = pd.read_excel(path)
    except Exception as exc:
        # This runs at import time. A workbook that exists but cannot be
        # parsed should degrade to the "dataset missing" state the UI already
        # handles (as load_results() does) instead of killing the app.
        print(f"Could not read {path}: {exc}")
        return None, {}
    if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
        return None, {}

    # .copy() so the column assignment below targets an independent frame
    # rather than something derived from the full sheet.
    df = df[[TEXT_COL, LABEL_COL]].dropna().copy()
    df[TEXT_COL] = df[TEXT_COL].astype(str)

    labels = df[LABEL_COL].unique().tolist()
    try:
        labels.sort()
    except TypeError:
        # Mixed label types (e.g. ints and strings) have no natural order;
        # fall back to a deterministic string ordering.
        labels.sort(key=str)

    present = set(labels)
    if present == {0, 2}:
        label_name = {0: "Negatif", 2: "Positif"}
    elif present == {0, 1}:
        label_name = {0: "Negatif", 1: "Positif"}
    else:
        label_name = {val: f"Label {val}" for val in labels}
    return df, label_name
108
+
109
+
110
def load_results(path=None):
    """Load the model-comparison metrics table.

    Args:
        path: CSV file to read. Defaults to ``RESULTS_PATH`` when omitted.

    Returns:
        The parsed ``DataFrame``, or ``None`` when the file does not exist or
        cannot be parsed.
    """
    csv_path = Path(RESULTS_PATH if path is None else path)
    if not csv_path.exists():
        return None
    try:
        return pd.read_csv(csv_path)
    except Exception as exc:
        # Best effort: the comparison chart is optional, so report the
        # problem and let the UI show its placeholder instead of failing.
        print(f"Could not read {csv_path}: {exc}")
        return None
118
+
119
+
120
# Loaded once at import time. DATA_DF is None (and LABEL_NAME empty) when the
# workbook or one of its expected columns is missing.
DATA_DF, LABEL_NAME = load_dataset()
# None when the results CSV is missing or cannot be parsed.
RESULTS_DF = load_results()
122
+
123
+
124
def predict(text):
    """Classify *text* and return the probability of every label.

    Args:
        text: Raw input string from the UI.

    Returns:
        A dict mapping each label name (from ``ID2LABEL``) to its softmax
        probability, or an empty dict when *text* is empty or whitespace.
    """
    if not text or not text.strip():
        return {}

    inputs = TOKENIZER(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = MODEL(**inputs).logits
    # A single text is tokenized, so take the one batch row explicitly.
    # squeeze() would also drop the class axis for a one-label model and
    # leave a bare float that cannot be iterated.
    probs = torch.softmax(logits, dim=-1)[0].tolist()

    return {ID2LABEL[i]: float(p) for i, p in enumerate(probs)}
135
+
136
+
137
+ def _tokenize(text: str):
138
+ text = text.lower()
139
+ text = re.sub(r"[^a-z0-9\s]", " ", text)
140
+ tokens = [t for t in text.split() if t and t not in STOPWORDS and len(t) > 2]
141
+ return tokens
142
+
143
+
144
def _filter_df(label_choice: str):
    """Return the dataset rows matching the label chosen in the dropdown.

    ``"Semua"`` or an unrecognized display name yields the whole dataset;
    ``None`` is returned when no dataset was loaded.
    """
    if DATA_DF is None:
        return None
    if label_choice != "Semua":
        # Translate the display name back to the raw label value(s).
        matching = [raw for raw, shown in LABEL_NAME.items() if shown == label_choice]
        if matching:
            return DATA_DF[DATA_DF[LABEL_COL] == matching[0]]
    return DATA_DF
157
+
158
+
159
def build_distribution_plot():
    """Draw a bar chart of how many rows carry each label.

    Returns:
        A matplotlib ``Figure``. When no dataset is loaded the figure only
        shows a "dataset not found" message.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # pyplot keeps a global reference to every figure it creates until it is
    # closed. This function runs on every UI callback, so detach the figure
    # right away; it can still be drawn on and rendered by the caller.
    plt.close(fig)

    if DATA_DF is None:
        ax.text(0.5, 0.5, "Dataset tidak ditemukan", ha="center", va="center")
        return fig

    counts = DATA_DF[LABEL_COL].value_counts().sort_index()
    labels = [LABEL_NAME.get(val, str(val)) for val in counts.index.tolist()]
    ax.bar(labels, counts.values, color=["#ef4444", "#22c55e"])
    ax.set_title("Distribusi Label")
    ax.set_ylabel("Jumlah")
    ax.grid(axis="y", linestyle="--", alpha=0.4)
    return fig
172
+
173
+
174
def build_top_words_plot(label_choice: str, top_n: int = 20):
    """Draw a horizontal bar chart of the most frequent tokens.

    Args:
        label_choice: Display name of the label to filter on, or ``"Semua"``
            for the whole dataset.
        top_n: Maximum number of words to show.

    Returns:
        A matplotlib ``Figure``; it carries only a message when there is no
        data or no token survives tokenization.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(6, 5))
    # Detach from pyplot's global registry so repeated callbacks do not
    # accumulate open figures; the figure itself stays usable.
    plt.close(fig)

    if df is None or df.empty:
        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
        return fig

    # Count incrementally instead of materializing every token first.
    counts = Counter()
    for text in df[TEXT_COL]:
        counts.update(_tokenize(text))
    if not counts:
        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
        return fig

    # Reverse so the most frequent word ends up at the top of the chart.
    common = counts.most_common(top_n)[::-1]
    words = [word for word, _ in common]
    freqs = [freq for _, freq in common]
    ax.barh(words, freqs, color="#3b82f6")
    ax.set_title(f"Top {top_n} Kata - {label_choice}")
    return fig
192
+
193
+
194
def build_wordcloud(label_choice: str):
    """Render a word cloud of the tokens for the chosen label.

    Args:
        label_choice: Display name of the label to filter on, or ``"Semua"``
            for the whole dataset.

    Returns:
        A matplotlib ``Figure`` with hidden axes; it carries only a message
        when there is no data or no token survives tokenization.
    """
    df = _filter_df(label_choice)
    fig, ax = plt.subplots(figsize=(7, 4.5))
    # Detach from pyplot's global registry so repeated callbacks do not
    # accumulate open figures; the figure itself stays usable.
    plt.close(fig)
    ax.axis("off")

    if df is None or df.empty:
        ax.text(0.5, 0.5, "Data kosong", ha="center", va="center")
        return fig

    tokens = [tok for text in df[TEXT_COL] for tok in _tokenize(text)]
    if not tokens:
        ax.text(0.5, 0.5, "Token kosong", ha="center", va="center")
        return fig

    wc = WordCloud(width=900, height=500, background_color="white", collocations=False)
    wc.generate(" ".join(tokens))
    ax.imshow(wc, interpolation="bilinear")
    ax.set_title(f"Word Cloud - {label_choice}")
    return fig
214
+
215
+
216
def build_model_comparison_plot():
    """Draw validation vs. test F1 for every model listed in the results table.

    Returns:
        A matplotlib ``Figure``. It carries only a message when the results
        table is missing, empty, or lacks a column the chart needs.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    # Detach from pyplot's global registry so repeated callbacks do not
    # accumulate open figures; the figure itself stays usable.
    plt.close(fig)

    if RESULTS_DF is None or RESULTS_DF.empty:
        ax.text(0.5, 0.5, "results.csv tidak ditemukan", ha="center", va="center")
        return fig

    # Fail soft on a malformed CSV instead of raising KeyError in a callback.
    missing = [col for col in ("model", "val_f1", "test_f1") if col not in RESULTS_DF.columns]
    if missing:
        ax.text(
            0.5,
            0.5,
            f"Kolom tidak ada di results.csv: {', '.join(missing)}",
            ha="center",
            va="center",
        )
        return fig

    # sort_values returns a new frame, so RESULTS_DF itself is left untouched.
    data = RESULTS_DF.sort_values("val_f1", ascending=False)
    models = data["model"].tolist()
    val = data["val_f1"].tolist()
    test = data["test_f1"].tolist()

    # Two bars per model, side by side, with the tick centred between them.
    x = range(len(models))
    ax.bar(x, val, width=0.4, label="Val F1", color="#22c55e")
    ax.bar([i + 0.4 for i in x], test, width=0.4, label="Test F1", color="#3b82f6")
    ax.set_xticks([i + 0.2 for i in x])
    ax.set_xticklabels(models, rotation=45, ha="right")
    ax.set_ylim(0, 1.0)
    ax.set_title("Perbandingan Model (F1)")
    ax.legend()
    fig.tight_layout()
    return fig
236
+
237
+
238
def analytics(label_choice):
    """Build everything the analytics tab shows for *label_choice*.

    Returns:
        A 5-tuple: label-distribution figure, top-words figure, word-cloud
        figure, model-comparison figure, and a summary ``DataFrame`` with the
        total row count plus one row per label.
    """
    figures = (
        build_distribution_plot(),
        build_top_words_plot(label_choice),
        build_wordcloud(label_choice),
        build_model_comparison_plot(),
    )

    if DATA_DF is None:
        rows = [{"metric": "rows", "value": 0}]
    else:
        rows = [{"metric": "rows", "value": len(DATA_DF)}]
        per_label = DATA_DF[LABEL_COL].value_counts().to_dict()
        for raw_label, count in per_label.items():
            rows.append(
                {"metric": f"label_{LABEL_NAME.get(raw_label, raw_label)}", "value": count}
            )

    return (*figures, pd.DataFrame(rows))
254
+
255
+
256
# Gradio UI: one tab for single-text prediction, one for dataset analytics.
with gr.Blocks(title="Klasifikasi Sentimen EV") as app:
    gr.Markdown("# Klasifikasi Sentimen EV")
    gr.Markdown("Prediksi sentimen + dashboard analitik (word cloud & distribusi label).")

    with gr.Tab("Prediksi"):
        inp = gr.Textbox(lines=4, label="Teks")
        out = gr.Label(num_top_classes=2, label="Prediksi")
        btn = gr.Button("Prediksi")
        btn.click(predict, inputs=inp, outputs=out)

    with gr.Tab("Analitik"):
        # "Semua" (all) is always offered; label names follow when a dataset
        # was loaded.
        label_options = ["Semua", *LABEL_NAME.values()]
        label_choice = gr.Dropdown(label_options, value="Semua", label="Filter Label")
        dist_plot = gr.Plot(label="Distribusi Label")
        top_plot = gr.Plot(label="Top Kata")
        wc_plot = gr.Plot(label="Word Cloud")
        model_plot = gr.Plot(label="Perbandingan Model")
        summary_tbl = gr.Dataframe(label="Ringkasan Dataset", interactive=False)
        run_btn = gr.Button("Generate")

        # The button, a dropdown change and the initial page load all refresh
        # the same set of outputs through analytics().
        analytics_outputs = [dist_plot, top_plot, wc_plot, model_plot, summary_tbl]
        for register in (run_btn.click, label_choice.change, app.load):
            register(analytics, inputs=label_choice, outputs=analytics_outputs)


if __name__ == "__main__":
    app.launch()
data.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:038011314726d0073886358f9e9890f5d9e7595bb7fa46a82f4b0e6c1f15af61
3
+ size 124200
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers==4.56.2
2
+ torch
3
+ pandas
4
+ openpyxl
5
+ gradio
6
+ matplotlib
7
+ wordcloud
results.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model,run_dir,val_accuracy,val_precision,val_recall,val_f1,test_accuracy,test_precision,test_recall,test_f1
2
+ indobenchmark/indobert-base-p1,outputs\indobenchmark_indobert-base-p1,0.9779411764705882,0.9850746268656716,0.9705882352941176,0.9777777777777777,0.9191176470588235,0.9014084507042254,0.9411764705882353,0.920863309352518
3
+ cahya/bert-base-indonesian-1.5G,outputs\cahya_bert-base-indonesian-1.5G,0.9705882352941176,0.9571428571428572,0.9852941176470589,0.9710144927536232,0.9264705882352942,0.9142857142857143,0.9411764705882353,0.927536231884058
4
+ cahya/roberta-base-indonesian-1.5G,outputs\cahya_roberta-base-indonesian-1.5G,0.9852941176470589,0.9852941176470589,0.9852941176470589,0.9852941176470589,0.9338235294117647,0.9154929577464789,0.9558823529411765,0.935251798561151
5
+ xlm-roberta-base,outputs\xlm-roberta-base,0.9411764705882353,0.9166666666666666,0.9705882352941176,0.9428571428571428,0.9338235294117647,0.9154929577464789,0.9558823529411765,0.935251798561151
6
+ bert-base-multilingual-cased,outputs\bert-base-multilingual-cased,0.9485294117647058,0.9178082191780822,0.9852941176470589,0.950354609929078,0.8970588235294118,0.875,0.9264705882352942,0.9