noranisa committed on
Commit
06f79f7
·
verified ·
1 Parent(s): f7370ef

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +487 -172
main.py CHANGED
@@ -1,32 +1,28 @@
1
  from flask import Flask, render_template, request, jsonify, send_file
2
  from services.aggregator import collect_data
3
- from services.sentiment import predict
4
 
5
- # =========================
6
- # IMPORT TAMBAHAN
7
- # =========================
8
  from collections import Counter
9
  import pandas as pd
10
  import os
11
  import re
12
  import numpy as np
 
13
 
14
- # VISUAL
15
- from wordcloud import WordCloud
16
  import matplotlib
17
- matplotlib.use('Agg') # ← WAJIB: non-interactive backend untuk server
18
  import matplotlib.pyplot as plt
19
 
20
- # ML
21
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
22
  from sklearn.decomposition import LatentDirichletAllocation
23
  from sklearn.cluster import KMeans
24
  from sklearn.metrics.pairwise import cosine_similarity
25
- from sklearn.linear_model import LinearRegression
 
26
 
27
- # GRAPH
28
  import networkx as nx
29
  from itertools import combinations
 
30
 
31
  # OPTIONAL ADVANCED
32
  try:
@@ -46,30 +42,83 @@ except Exception:
46
 
47
  app = Flask(__name__)
48
 
49
- # =========================
50
- # UTIL
51
- # =========================
52
def clean_text(t):
    """Return a lower-cased, URL-free, ASCII-alphanumeric version of ``t``.

    Every character that is not an ASCII letter, a digit or whitespace is
    turned into a space; runs of whitespace are then collapsed to a single
    space and the ends are trimmed.
    """
    lowered = t.lower()
    # Links are removed before punctuation so "http://a.b" does not leave
    # fragments such as "http a b" behind.
    no_links = re.sub(r'http\S+', '', lowered)
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', no_links)
    return re.sub(r'\s+', ' ', alnum_only).strip()
58
-
59
 
60
- # =========================
61
- # TOP WORDS
62
- # =========================
63
# Stopwords (informal Indonesian plus common English function words).
# Elsewhere in this file the set is passed to WordCloud, converted to a list
# for CountVectorizer, and used as a membership filter when building the
# word co-occurrence network.
# Each word is listed exactly once: the previous literal repeated 'tapi' and
# 'juga', which a set silently ignores but which hides copy/paste mistakes.
STOPWORDS_ID = {
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'dengan', 'untuk', 'adalah', 'ada',
    'pada', 'juga', 'tidak', 'bisa', 'sudah', 'saya', 'kamu', 'kami', 'mereka', 'kita',
    'nya', 'pun', 'aja', 'gak', 'ga', 'ya', 'yg', 'dgn', 'yah', 'dah', 'udah', 'mau',
    'jadi', 'buat', 'kalau', 'tp', 'tapi', 'banget', 'sangat', 'lebih', 'nih',
    'sih', 'dong', 'lah', 'lagi', 'terus', 'sama', 'atau', 'karena',
    'so', 'the', 'is', 'in', 'of', 'to', 'a', 'an', 'and', 'it', 'for', 'that', 'this',
}
71
 
72
- def get_top_words(texts):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  words = []
74
  for t in texts:
75
  for w in clean_text(t).split():
@@ -78,31 +127,386 @@ def get_top_words(texts):
78
  return [{"word": w, "count": c} for w, c in Counter(words).most_common(15)]
79
 
80
 
81
- # =========================
82
- # WORDCLOUD
83
- # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
def generate_wordcloud(texts):
    """Render a word cloud of ``texts`` to ``static/wordcloud.png``.

    Entries whose stripped length is 3 characters or fewer are ignored; when
    nothing remains no image is written. Best-effort: any failure is printed
    and swallowed so the caller keeps going.
    """
    try:
        os.makedirs("static", exist_ok=True)
        usable = [entry for entry in texts if len(entry.strip()) > 3]
        if len(usable) == 0:
            return
        cloud = WordCloud(
            width=900,
            height=400,
            background_color='white',
            max_words=80,
            stopwords=STOPWORDS_ID,
            colormap='Blues',
        )
        cloud.generate(" ".join(usable))
        cloud.to_file("static/wordcloud.png")
    except Exception as e:
        print("wordcloud error:", e)
101
 
102
 
103
- # =========================
104
- # HEATMAP
105
- # =========================
106
  def generate_heatmap(data):
107
  try:
108
  if not data:
@@ -110,93 +514,49 @@ def generate_heatmap(data):
110
  labels = ["Positive", "Neutral", "Negative"]
111
  sources = sorted(set(d["source"] for d in data))
112
  matrix = np.zeros((len(sources), len(labels)))
113
-
114
  for d in data:
115
  i = sources.index(d["source"])
116
  j = labels.index(d["sentiment"])
117
  matrix[i][j] += 1
118
-
119
  if matrix.sum() == 0:
120
  return
121
-
122
  fig, ax = plt.subplots(figsize=(6, max(2, len(sources))))
 
 
123
  im = ax.imshow(matrix, cmap='Blues', aspect='auto')
124
  ax.set_xticks(range(len(labels)))
125
- ax.set_xticklabels(labels)
126
  ax.set_yticks(range(len(sources)))
127
- ax.set_yticklabels(sources)
 
128
  plt.colorbar(im, ax=ax)
129
  plt.tight_layout()
130
  os.makedirs("static", exist_ok=True)
131
- plt.savefig("static/heatmap.png", dpi=100)
132
  plt.close(fig)
133
  except Exception as e:
134
  print("heatmap error:", e)
135
 
136
 
137
- # =========================
138
- # TIMELINE
139
- # =========================
140
def generate_timeline(data):
    """Plot the rolling share of each sentiment label to ``static/timeline.png``.

    ``data`` is a sequence of dicts with a "sentiment" key holding "Positive",
    "Negative" or "Neutral". The x axis is simply the position of each item in
    ``data``; no timestamps are read. Best-effort: errors are printed, never
    raised, and nothing is drawn for empty input.
    """
    fig = None
    try:
        if not data:
            return
        os.makedirs("static", exist_ok=True)

        def roll(flags, n=5):
            # Trailing mean over at most n items. The earlier slice
            # (i-n : i+1) covered n+1 items, one more than the window size.
            smoothed = []
            for i in range(len(flags)):
                window = flags[max(0, i - n + 1):i + 1]
                smoothed.append(sum(window) / len(window))
            return smoothed

        pos = [1 if d["sentiment"] == "Positive" else 0 for d in data]
        neg = [1 if d["sentiment"] == "Negative" else 0 for d in data]
        neu = [1 if d["sentiment"] == "Neutral" else 0 for d in data]

        fig, ax = plt.subplots(figsize=(10, 3))
        ax.plot(roll(pos), label="Positive", color="#22c55e", linewidth=1.5)
        ax.plot(roll(neg), label="Negative", color="#ef4444", linewidth=1.5)
        ax.plot(roll(neu), label="Neutral", color="#94a3b8", linewidth=1.0)
        ax.legend()
        ax.set_facecolor('#f8fafc')
        fig.patch.set_facecolor('#f8fafc')
        # Use the figure's own methods instead of pyplot's global "current
        # figure", which is shared state inside a web server process.
        fig.tight_layout()
        fig.savefig("static/timeline.png", dpi=100)
    except Exception as e:
        print("timeline error:", e)
    finally:
        # Always release the figure, otherwise a failed render leaks it for
        # the lifetime of the process.
        if fig is not None:
            plt.close(fig)
166
-
167
-
168
- # =========================
169
- # TOPIC MODELING
170
- # =========================
171
def get_topics(texts):
    """Extract up to three LDA topics, each as a list of up to five words.

    Texts of 3 characters or fewer are dropped first. Returns the placeholder
    ``[["data kurang"]]`` when fewer than 5 texts remain, ``[["kosong"]]`` when
    the vectorizer yields no vocabulary, and ``[["error"]]`` on any exception
    (which is printed). Words inside a topic are ordered by ascending weight,
    so the strongest word comes last.
    """
    try:
        docs = [doc for doc in texts if len(doc) > 3]
        if len(docs) < 5:
            return [["data kurang"]]

        vectorizer = CountVectorizer(min_df=2, stop_words=list(STOPWORDS_ID))
        counts = vectorizer.fit_transform(docs)
        vocab_size = counts.shape[1]
        if vocab_size == 0:
            return [["kosong"]]

        # Never ask for more topics than there are vocabulary terms.
        model = LatentDirichletAllocation(
            n_components=min(3, vocab_size),
            random_state=42,
        )
        model.fit(counts)

        vocab = vectorizer.get_feature_names_out()
        return [
            [vocab[idx] for idx in weights.argsort()[-5:]]
            for weights in model.components_
        ]
    except Exception as e:
        print("topic error:", e)
        return [["error"]]
195
 
196
 
197
- # =========================
198
- # INSIGHT
199
- # =========================
200
  def generate_insight(data):
201
  s = [d["sentiment"] for d in data]
202
  return (f"Positive:{s.count('Positive')} "
@@ -204,9 +564,6 @@ def generate_insight(data):
204
  f"Neutral:{s.count('Neutral')}")
205
 
206
 
207
- # =========================
208
- # CLUSTER
209
- # =========================
210
  def cluster_opinions(texts):
211
  try:
212
  if len(texts) < 6:
@@ -215,38 +572,19 @@ def cluster_opinions(texts):
215
  n = min(3, len(texts))
216
  k = KMeans(n_clusters=n, n_init=10, random_state=42).fit(X)
217
  clusters = {}
218
- for i, label in enumerate(k.labels_):
219
- clusters.setdefault(int(label), []).append(texts[i])
220
- return [{"cluster": lbl, "samples": samples[:3]} for lbl, samples in clusters.items()]
221
  except Exception as e:
222
  print("cluster error:", e)
223
  return []
224
 
225
 
226
- # =========================
227
- # HOAX (keyword-based)
228
- # =========================
229
# Substrings whose presence (case-insensitive) marks a text as "Hoax".
HOAX_KW = [
    "hoax",
    "bohong",
    "fitnah",
    "propaganda",
    "palsu",
    "fake",
    "disinformasi",
    "menyesatkan",
    "kebohongan",
    "manipulasi",
    "adu domba",
    "provokasi",
]


def detect_hoax(texts):
    """Label the first 15 texts as "Hoax" or "Normal" by keyword lookup.

    A text is "Hoax" when its lower-cased form contains any entry of
    ``HOAX_KW`` as a substring. Returns one ``{"text", "label"}`` dict per
    inspected text, in input order.
    """
    def _label(text):
        haystack = text.lower()
        for keyword in HOAX_KW:
            if keyword in haystack:
                return "Hoax"
        return "Normal"

    return [{"text": t, "label": _label(t)} for t in texts[:15]]
241
-
242
-
243
- # =========================
244
- # NETWORK
245
- # =========================
246
  def build_network(texts):
247
  edges = {}
248
  for t in texts:
249
- words = [w for w in set(clean_text(t).split()) if len(w) > 3 and w not in STOPWORDS_ID][:6]
 
250
  for a, b in combinations(words, 2):
251
  key = tuple(sorted([a, b]))
252
  edges[key] = edges.get(key, 0) + 1
@@ -254,30 +592,22 @@ def build_network(texts):
254
  for k, v in edges.items() if v > 1]
255
 
256
 
257
- # =========================
258
- # BOT NETWORK
259
- # =========================
260
  def detect_bot_network(texts):
261
  try:
262
  if len(texts) < 5:
263
  return {"nodes": [], "edges": [], "bots": []}
264
-
265
  X = TfidfVectorizer(max_features=300).fit_transform(texts)
266
  sim = cosine_similarity(X)
267
-
268
- G = nx.Graph()
269
  for i in range(len(texts)):
270
  G.add_node(i, text=texts[i])
271
-
272
  for i in range(len(texts)):
273
  for j in range(i + 1, len(texts)):
274
  if sim[i][j] > 0.75:
275
  G.add_edge(i, j)
276
-
277
  central = nx.degree_centrality(G)
278
  bots = [{"node": i, "score": round(s, 2), "text": texts[i]}
279
  for i, s in central.items() if s > 0.3]
280
-
281
  return {
282
  "nodes": [{"id": i} for i in G.nodes()],
283
  "edges": [{"source": u, "target": v} for u, v in G.edges()],
@@ -288,32 +618,9 @@ def detect_bot_network(texts):
288
  return {"nodes": [], "edges": [], "bots": []}
289
 
290
 
291
- # =========================
292
- # TREND
293
- # =========================
294
def predict_trend(data):
    """Classify the sentiment trend from the slope of a linear fit.

    Each item is scored +1 (Positive), -1 (Negative) or 0 (anything else) and
    regressed against its position in ``data``. Returns "Naik Positif" for a
    slope above 0.05, "Naik Negatif" for a slope below -0.05, "Stabil"
    otherwise, "Kurang Data" for fewer than 5 items, and "Error" (after
    printing the exception) when anything fails.
    """
    def _signed(entry):
        tag = entry["sentiment"]
        if tag == "Positive":
            return 1
        if tag == "Negative":
            return -1
        return 0

    try:
        series = [_signed(d) for d in data]
        if len(series) < 5:
            return "Kurang Data"

        # Positions 0..n-1 as a single-feature column for the regression.
        steps = np.arange(len(series)).reshape(-1, 1)
        model = LinearRegression()
        model.fit(steps, series)
        slope = model.coef_[0]

        if slope > 0.05:
            return "Naik Positif"
        if slope < -0.05:
            return "Naik Negatif"
        return "Stabil"
    except Exception as e:
        print("trend error:", e)
        return "Error"
312
-
313
-
314
- # =========================
315
  # ROUTES
316
- # =========================
317
  @app.route("/")
318
  def home():
319
  return render_template("index.html")
@@ -337,39 +644,50 @@ def analyze():
337
  texts = [t for _, t in raw][:100]
338
  sources = [s for s, _ in raw][:100]
339
 
340
- sentiments = predict(texts)
341
-
342
- result = [
343
- {"text": t, "sentiment": s, "source": src}
344
- for t, s, src in zip(texts, sentiments, sources)
 
 
 
 
 
 
 
 
 
345
  ]
346
 
347
- # VISUAL — non-blocking
348
  generate_wordcloud(texts)
349
- generate_heatmap(result)
350
- generate_timeline(result)
351
 
352
  # ANALYSIS
353
  top_words = get_top_words(texts)
354
  topics = get_topics(texts)
355
- insight = generate_insight(result)
356
  clusters = cluster_opinions(texts)
357
- hoax = detect_hoax(texts)
 
358
  network = build_network(texts)
359
  bot_network = detect_bot_network(texts)
360
- trend = predict_trend(result)
361
 
362
- # ADVANCED (optional)
 
 
 
363
  bot_bert = detect_bot_bert(texts)
364
  fake_news = detect_fake_news(texts)
365
- gnn = run_gnn(bot_network["nodes"], bot_network["edges"])
366
 
367
- # SAVE CSV
368
  os.makedirs("static", exist_ok=True)
369
- pd.DataFrame(result).to_csv("static/result.csv", index=False)
370
 
371
  return jsonify({
372
- "data": result,
373
  "top_words": top_words,
374
  "topics": topics,
375
  "insight": insight,
@@ -380,7 +698,7 @@ def analyze():
380
  "trend": trend,
381
  "bot_bert": bot_bert,
382
  "fake_news": fake_news,
383
- "gnn": gnn
384
  })
385
 
386
  except Exception as e:
@@ -401,8 +719,5 @@ def static_files(filename):
401
  return send_file(f"static/{filename}")
402
 
403
 
404
- # =========================
405
- # RUN
406
- # =========================
407
  if __name__ == "__main__":
408
  app.run(host="0.0.0.0", port=7860, debug=False)
 
1
  from flask import Flask, render_template, request, jsonify, send_file
2
  from services.aggregator import collect_data
3
+ from services.sentiment import predict, predict_with_score
4
 
 
 
 
5
  from collections import Counter
6
  import pandas as pd
7
  import os
8
  import re
9
  import numpy as np
10
+ from datetime import datetime
11
 
 
 
12
  import matplotlib
13
+ matplotlib.use('Agg')
14
  import matplotlib.pyplot as plt
15
 
 
16
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
17
  from sklearn.decomposition import LatentDirichletAllocation
18
  from sklearn.cluster import KMeans
19
  from sklearn.metrics.pairwise import cosine_similarity
20
+ from sklearn.linear_model import LogisticRegression
21
+ from sklearn.pipeline import Pipeline
22
 
 
23
  import networkx as nx
24
  from itertools import combinations
25
+ from wordcloud import WordCloud
26
 
27
  # OPTIONAL ADVANCED
28
  try:
 
42
 
43
  app = Flask(__name__)
44
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ # =============================================================
47
+ # STOPWORDS & SLANG NORMALIZATION
48
+ # =============================================================
49
# Stopwords (informal Indonesian plus common English function words), kept as
# a plain ``set`` because callers pass it to WordCloud, convert it with
# ``list(...)`` for CountVectorizer, and test membership against it.
STOPWORDS_ID = set("""
    yang dan di ke dari ini itu dengan untuk adalah ada
    pada juga tidak bisa sudah saya kamu kami mereka kita
    nya pun aja gak ga ya yg dgn yah dah udah mau
    jadi buat kalau tp tapi banget sangat lebih nih sih
    dong lah lagi terus sama atau karena so the is in
    of to a an and it for that this was are be
    has have had do does did will would could should
""".split())
58
 
59
# Token-level normalisation table: informal/abbreviated Indonesian spelling ->
# canonical word. clean_text() looks up every whitespace-separated token here.
# Each key appears exactly once (the earlier literal repeated 'tdk' and 'hrs',
# where the later entry silently wins), and every spelling of "bagaimana",
# including 'gmn' and 'gimana', now maps to the same canonical form.
SLANG_MAP = {
    'gak': 'tidak', 'ga': 'tidak', 'nggak': 'tidak', 'ngga': 'tidak', 'enggak': 'tidak',
    'yg': 'yang', 'dgn': 'dengan', 'utk': 'untuk', 'krn': 'karena', 'karna': 'karena',
    'udah': 'sudah', 'udh': 'sudah', 'dah': 'sudah', 'sdh': 'sudah',
    'gue': 'saya', 'gw': 'saya', 'aku': 'saya', 'w': 'saya',
    'lo': 'kamu', 'lu': 'kamu', 'elo': 'kamu',
    'tp': 'tapi', 'tpi': 'tapi',
    'jg': 'juga', 'jga': 'juga',
    'bs': 'bisa', 'bsa': 'bisa',
    'lg': 'lagi', 'lgi': 'lagi',
    'sm': 'sama', 'bgt': 'banget', 'bngt': 'banget',
    'emg': 'memang', 'emang': 'memang', 'mmg': 'memang',
    'kyk': 'kayak', 'kek': 'kayak',
    'dr': 'dari', 'ke': 'ke', 'pd': 'pada',
    'spy': 'supaya', 'biar': 'supaya',
    'msh': 'masih', 'masi': 'masih',
    'blm': 'belum', 'blum': 'belum',
    'jd': 'jadi', 'jdi': 'jadi',
    'sy': 'saya', 'mrk': 'mereka',
    'mk': 'maka', 'sdgkan': 'sedangkan',
    'hrs': 'harus', 'wajib': 'harus',
    'krg': 'kurang', 'krang': 'kurang',
    'skrg': 'sekarang', 'skg': 'sekarang',
    'tdk': 'tidak', 'bkn': 'bukan',
    'pdhl': 'padahal', 'pdhal': 'padahal',
    'bnr': 'benar', 'bner': 'benar',
    'slh': 'salah', 'slah': 'salah',
    'org': 'orang', 'orng': 'orang',
    'trs': 'terus', 'trus': 'terus',
    'knp': 'kenapa', 'ngp': 'kenapa',
    'gmn': 'bagaimana', 'gmana': 'bagaimana', 'bgmn': 'bagaimana', 'gimana': 'bagaimana',
    'aja': 'saja', 'aj': 'saja',
    'ok': 'oke', 'oke': 'oke', 'okay': 'oke',
    'wkwk': 'haha', 'wkwkwk': 'haha', 'hehe': 'haha', 'hihi': 'haha',
    'brp': 'berapa', 'brapa': 'berapa',
    'stlh': 'setelah', 'sblm': 'sebelum',
    'ttg': 'tentang', 'mnrt': 'menurut',
    'perlu': 'harus',
}
98
+
99
+
100
+ # =============================================================
101
+ # TEXT CLEANING WITH SLANG NORMALIZATION
102
+ # =============================================================
103
+ def clean_text(t: str) -> str:
104
+ t = t.lower().strip()
105
+ t = re.sub(r'http\S+|www\.\S+', '', t) # hapus URL
106
+ t = re.sub(r'@\w+', '', t) # hapus mention
107
+ t = re.sub(r'#(\w+)', r'\1', t) # hashtag → kata
108
+ t = re.sub(r'(.)\1{2,}', r'\1\1', t) # reduplikasi: "baguuus" → "bagus"
109
+ t = re.sub(r'[^a-zA-Z0-9\s]', ' ', t) # hapus karakter khusus
110
+ # normalisasi slang
111
+ tokens = t.split()
112
+ tokens = [SLANG_MAP.get(w, w) for w in tokens]
113
+ t = ' '.join(tokens)
114
+ t = re.sub(r'\s+', ' ', t).strip()
115
+ return t
116
+
117
+
118
+ # =============================================================
119
+ # TOP WORDS
120
+ # =============================================================
121
+ def get_top_words(texts: list) -> list:
122
  words = []
123
  for t in texts:
124
  for w in clean_text(t).split():
 
127
  return [{"word": w, "count": c} for w, c in Counter(words).most_common(15)]
128
 
129
 
130
+ # =============================================================
131
+ # 🔴 FIX 1: DETEKSI HOAKS — ML-based (TF-IDF + Logistic Regression)
132
+ # =============================================================
133
+
134
+ # Training data minimal untuk bootstrap model
135
+ # Label: 1 = berpotensi hoaks/disinformasi, 0 = normal
136
# Seed corpus for the hoax classifier, kept as two lists so the label vector
# can be derived from their lengths and cannot drift out of sync when an
# example is added or removed.
# NOTE(review): most "hoax" examples are sentences that talk *about* hoaxes
# (warnings, rebuttals), so the model presumably learns keyword presence more
# than deceptive content. Confirm this is the intended behaviour.
_HOAX_EXAMPLES = [
    "berita ini bohong dan tidak benar sama sekali",
    "ini adalah propaganda yang menyesatkan masyarakat",
    "jangan percaya hoax yang beredar di media sosial",
    "informasi palsu yang disebarkan untuk memfitnah",
    "ini adalah disinformasi yang sengaja dibuat untuk menipu",
    "berita palsu yang beredar sangat meresahkan warga",
    "mereka menyebarkan kebohongan dan fitnah kepada publik",
    "isu ini adalah manipulasi politik yang berbahaya",
    "provokasi yang dilakukan untuk memecah belah bangsa",
    "konten ini mengandung ujaran kebencian dan fitnah",
    "waspada berita bohong yang sengaja disebarkan",
    "ini hoaks yang sudah dibantah oleh pihak berwenang",
    "informasi yang menyesatkan dan tidak ada buktinya",
    "narasi sesat yang dibuat untuk mengadu domba",
    "berita manipulatif yang perlu diklarifikasi segera",
]
_NORMAL_EXAMPLES = [
    "produk ini sangat bagus dan berkualitas tinggi",
    "saya sangat senang dengan pelayanannya yang ramah",
    "hasil kerja tim ini luar biasa dan membanggakan",
    "kebijakan ini berdampak positif bagi masyarakat luas",
    "acara kemarin berjalan lancar dan sangat meriah",
    "terima kasih atas bantuan yang diberikan selama ini",
    "pemerintah berupaya meningkatkan kesejahteraan rakyat",
    "inovasi terbaru ini sangat membantu kehidupan sehari-hari",
    "prestasi luar biasa yang patut kita banggakan bersama",
    "kondisi ekonomi mulai membaik berdasarkan data terbaru",
    "program ini memberikan manfaat nyata bagi warga",
    "kolaborasi yang baik menghasilkan output yang optimal",
    "penelitian ini memberikan temuan yang sangat menarik",
    "masyarakat antusias menyambut kebijakan baru tersebut",
    "kualitas pendidikan terus meningkat dari tahun ke tahun",
]

# Label 1 = potential hoax/disinformation, 0 = normal.
_HOAX_TRAIN_TEXTS = _HOAX_EXAMPLES + _NORMAL_EXAMPLES
_HOAX_TRAIN_LABELS = [1] * len(_HOAX_EXAMPLES) + [0] * len(_NORMAL_EXAMPLES)

# Built and fitted once at import time; detect_hoax() falls back to keyword
# matching when this ends up as None.
_hoax_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=500,
        sublinear_tf=True,
    )),
    ('clf', LogisticRegression(
        C=1.0,
        max_iter=200,
        random_state=42,
        class_weight='balanced',
    )),
])

try:
    _hoax_pipeline.fit(_HOAX_TRAIN_TEXTS, _HOAX_TRAIN_LABELS)
    print("✅ Hoax classifier trained")
except Exception as e:
    print(f"⚠️  Hoax classifier training failed: {e}")
    _hoax_pipeline = None
193
+
194
+
195
+ def detect_hoax(texts: list) -> list:
196
+ """
197
+ Deteksi hoaks/disinformasi menggunakan TF-IDF + Logistic Regression.
198
+ Output: label (Hoax/Normal) + confidence score.
199
+
200
+ Fallback ke keyword-based jika model gagal.
201
+ """
202
+ results = []
203
+ sample = texts[:20]
204
+
205
+ if _hoax_pipeline is not None:
206
+ try:
207
+ preds = _hoax_pipeline.predict(sample)
208
+ probas = _hoax_pipeline.predict_proba(sample)
209
+
210
+ for t, pred, proba in zip(sample, preds, probas):
211
+ label = "Hoax" if pred == 1 else "Normal"
212
+ confidence = round(float(max(proba)), 3)
213
+ results.append({
214
+ "text": t,
215
+ "label": label,
216
+ "confidence": confidence,
217
+ "method": "ml"
218
+ })
219
+ return results
220
+ except Exception as e:
221
+ print(f"⚠️ Hoax ML predict error: {e} — fallback ke keyword")
222
+
223
+ # Fallback keyword-based (lebih kaya dari sebelumnya)
224
+ HOAX_KW = [
225
+ "hoax","bohong","fitnah","propaganda","palsu","fake","disinformasi",
226
+ "menyesatkan","kebohongan","manipulasi","adu domba","provokasi",
227
+ "berita palsu","ujaran kebencian","tidak benar","perlu diklarifikasi",
228
+ "waspada","jangan percaya","disebarkan untuk","narasi sesat",
229
+ ]
230
+ for t in sample:
231
+ lower = t.lower()
232
+ score = sum(1 for k in HOAX_KW if k in lower)
233
+ label = "Hoax" if score >= 1 else "Normal"
234
+ conf = min(0.5 + score * 0.1, 0.95) if label == "Hoax" else 0.6
235
+ results.append({
236
+ "text": t,
237
+ "label": label,
238
+ "confidence": round(conf, 3),
239
+ "method": "keyword"
240
+ })
241
+ return results
242
+
243
+
244
+ # =============================================================
245
+ # 🔴 FIX 2: TREND — distribusi per-sumber, bukan regresi naif
246
+ # =============================================================
247
+ def predict_trend(data: list) -> dict:
248
+ """
249
+ Analisis tren sentimen yang lebih bermakna:
250
+ 1. Distribusi sentimen per sumber platform
251
+ 2. Dominasi sentimen keseluruhan
252
+ 3. Indeks polarisasi (seberapa terpolarisasi opini)
253
+ 4. Label tren (naik positif/negatif/stabil) dengan confidence
254
+ """
255
+ if not data:
256
+ return {
257
+ "label": "Kurang Data",
258
+ "dominant": "Neutral",
259
+ "polarity": 0.0,
260
+ "confidence": 0.0,
261
+ "by_source": {},
262
+ "summary": "Tidak ada data yang cukup untuk analisis tren."
263
+ }
264
+
265
+ sentiments = [d["sentiment"] for d in data]
266
+ total = len(sentiments)
267
+
268
+ pos = sentiments.count("Positive")
269
+ neg = sentiments.count("Negative")
270
+ neu = sentiments.count("Neutral")
271
+
272
+ pos_r = pos / total
273
+ neg_r = neg / total
274
+ neu_r = neu / total
275
+
276
+ # Indeks polarisasi: seberapa jauh dari distribusi seimbang
277
+ # 0 = sangat seimbang, 1 = sangat terpolarisasi
278
+ polarity = round(abs(pos_r - neg_r), 3)
279
+
280
+ # Distribusi per sumber
281
+ by_source = {}
282
+ for d in data:
283
+ src = d.get("source", "unknown")
284
+ if src not in by_source:
285
+ by_source[src] = {"Positive": 0, "Negative": 0, "Neutral": 0, "total": 0}
286
+ by_source[src][d["sentiment"]] += 1
287
+ by_source[src]["total"] += 1
288
+
289
+ # Hitung persentase per sumber
290
+ for src in by_source:
291
+ t = by_source[src]["total"]
292
+ by_source[src]["pos_pct"] = round(by_source[src]["Positive"] / t * 100, 1)
293
+ by_source[src]["neg_pct"] = round(by_source[src]["Negative"] / t * 100, 1)
294
+ by_source[src]["neu_pct"] = round(by_source[src]["Neutral"] / t * 100, 1)
295
+
296
+ # Label tren & confidence
297
+ if pos_r > neg_r and pos_r > neu_r:
298
+ label = "Dominan Positif"
299
+ dominant = "Positive"
300
+ confidence = round(pos_r, 3)
301
+ elif neg_r > pos_r and neg_r > neu_r:
302
+ label = "Dominan Negatif"
303
+ dominant = "Negative"
304
+ confidence = round(neg_r, 3)
305
+ elif neu_r >= 0.5:
306
+ label = "Mayoritas Netral"
307
+ dominant = "Neutral"
308
+ confidence = round(neu_r, 3)
309
+ else:
310
+ label = "Terpolarisasi"
311
+ dominant = "Mixed"
312
+ confidence = round(polarity, 3)
313
+
314
+ # Narasi ringkas
315
+ dominant_src = max(by_source, key=lambda s: by_source[s]["total"]) if by_source else "-"
316
+ summary = (
317
+ f"{label} ({round(pos_r*100,1)}% positif, "
318
+ f"{round(neg_r*100,1)}% negatif, "
319
+ f"{round(neu_r*100,1)}% netral). "
320
+ f"Indeks polarisasi: {polarity:.2f}. "
321
+ f"Sumber terbanyak: {dominant_src}."
322
+ )
323
+
324
+ return {
325
+ "label": label,
326
+ "dominant": dominant,
327
+ "polarity": polarity,
328
+ "confidence": confidence,
329
+ "by_source": by_source,
330
+ "pos_pct": round(pos_r * 100, 1),
331
+ "neg_pct": round(neg_r * 100, 1),
332
+ "neu_pct": round(neu_r * 100, 1),
333
+ "summary": summary,
334
+ }
335
+
336
+
337
+ # =============================================================
338
+ # 🔴 FIX 3: TIMELINE — distribusi kumulatif yang akurat
339
+ # =============================================================
340
+ def generate_timeline(data: list):
341
+ """
342
+ Visualisasi distribusi sentimen yang jujur:
343
+ - X-axis: indeks urutan (dengan label yang jelas)
344
+ - Y-axis: proporsi kumulatif sentimen (bukan binary 0/1)
345
+ - Tambahkan annotation rata-rata di tiap segmen
346
+ """
347
+ try:
348
+ if not data or len(data) < 3:
349
+ return
350
+
351
+ os.makedirs("static", exist_ok=True)
352
+
353
+ window = max(5, len(data) // 10) # window adaptif
354
+
355
+ def rolling_mean(arr, w):
356
+ result = []
357
+ for i in range(len(arr)):
358
+ sl = arr[max(0, i - w + 1): i + 1]
359
+ result.append(sum(sl) / len(sl))
360
+ return result
361
+
362
+ pos_raw = [1 if d["sentiment"] == "Positive" else 0 for d in data]
363
+ neg_raw = [1 if d["sentiment"] == "Negative" else 0 for d in data]
364
+ neu_raw = [1 if d["sentiment"] == "Neutral" else 0 for d in data]
365
+
366
+ x = list(range(1, len(data) + 1))
367
+ pos = rolling_mean(pos_raw, window)
368
+ neg = rolling_mean(neg_raw, window)
369
+ neu = rolling_mean(neu_raw, window)
370
+
371
+ fig, ax = plt.subplots(figsize=(11, 3.5))
372
+ fig.patch.set_facecolor('#0e1117')
373
+ ax.set_facecolor('#141820')
374
+
375
+ ax.fill_between(x, pos, alpha=0.15, color='#22c55e')
376
+ ax.fill_between(x, neg, alpha=0.15, color='#ef4444')
377
+ ax.plot(x, pos, label='Positif', color='#22c55e', linewidth=1.8, alpha=0.9)
378
+ ax.plot(x, neg, label='Negatif', color='#ef4444', linewidth=1.8, alpha=0.9)
379
+ ax.plot(x, neu, label='Netral', color='#94a3b8', linewidth=1.2, alpha=0.7, linestyle='--')
380
+
381
+ ax.set_xlabel(
382
+ f'Urutan komentar (rolling mean, window={window})',
383
+ color='#5a6070', fontsize=8
384
+ )
385
+ ax.set_ylabel('Proporsi', color='#5a6070', fontsize=8)
386
+ ax.tick_params(colors='#5a6070', labelsize=7)
387
+ for spine in ax.spines.values():
388
+ spine.set_edgecolor('#1a2030')
389
+
390
+ ax.legend(
391
+ fontsize=8, loc='upper right',
392
+ facecolor='#141820', edgecolor='#1a2030',
393
+ labelcolor='#8892a4'
394
+ )
395
+ ax.set_ylim(0, 1.05)
396
+ ax.set_xlim(1, len(data))
397
+
398
+ # annotation rata-rata
399
+ ax.axhline(np.mean(pos_raw), color='#22c55e', linewidth=0.6, linestyle=':', alpha=0.5)
400
+ ax.axhline(np.mean(neg_raw), color='#ef4444', linewidth=0.6, linestyle=':', alpha=0.5)
401
+
402
+ plt.tight_layout(pad=1.0)
403
+ plt.savefig("static/timeline.png", dpi=110, facecolor=fig.get_facecolor())
404
+ plt.close(fig)
405
+
406
+ except Exception as e:
407
+ print("timeline error:", e)
408
+
409
+
410
+ # =============================================================
411
+ # 🔴 FIX 4: GNN — deterministic, fitur TF-IDF bukan random
412
+ # =============================================================
413
+ def run_gnn_safe(nodes: list, edges: list, texts: list) -> list:
414
+ """
415
+ GNN dengan fitur deterministik dari TF-IDF.
416
+ Tidak lagi menggunakan torch.rand() — hasil konsisten & bermakna.
417
+
418
+ Output: anomaly score per node berdasarkan graph convolution.
419
+ """
420
+ if not nodes or not edges or len(nodes) < 3:
421
+ return [{"node": n["id"], "score": 0.0} for n in nodes]
422
+
423
+ try:
424
+ import torch
425
+ from torch_geometric.data import Data
426
+ from torch_geometric.nn import GCNConv
427
+
428
+ # Fitur node dari TF-IDF (bukan random)
429
+ node_texts = [texts[n["id"]] if n["id"] < len(texts) else "" for n in nodes]
430
+ vec = TfidfVectorizer(max_features=32, min_df=1)
431
+
432
+ try:
433
+ X = vec.fit_transform(node_texts).toarray()
434
+ except Exception:
435
+ # fallback jika TF-IDF gagal (misal semua teks kosong)
436
+ X = np.eye(len(nodes), 32)
437
+
438
+ x = torch.tensor(X, dtype=torch.float)
439
+
440
+ # Edge index
441
+ edge_list = [[e["source"], e["target"]] for e in edges
442
+ if e["source"] < len(nodes) and e["target"] < len(nodes)]
443
+
444
+ if not edge_list:
445
+ return [{"node": n["id"], "score": 0.0} for n in nodes]
446
+
447
+ edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
448
+
449
+ # Model GCN sederhana (tidak ditraining — hanya forward pass untuk anomaly scoring)
450
+ class GCN(torch.nn.Module):
451
+ def __init__(self, in_ch):
452
+ super().__init__()
453
+ self.conv1 = GCNConv(in_ch, 16)
454
+ self.conv2 = GCNConv(16, 4)
455
+
456
+ def forward(self, x, ei):
457
+ x = torch.relu(self.conv1(x, ei))
458
+ return self.conv2(x, ei)
459
+
460
+ # Seed untuk reproducibility
461
+ torch.manual_seed(42)
462
+ model = GCN(x.shape[1])
463
+ model.eval()
464
+
465
+ with torch.no_grad():
466
+ out = model(x, edge_index)
467
+
468
+ # Anomaly score = L2 norm dari output embedding
469
+ scores = torch.norm(out, dim=1).numpy()
470
+ # Normalize ke [0, 1]
471
+ if scores.max() > scores.min():
472
+ scores = (scores - scores.min()) / (scores.max() - scores.min())
473
+ else:
474
+ scores = np.zeros(len(scores))
475
+
476
+ return [
477
+ {"node": nodes[i]["id"], "score": round(float(scores[i]), 3)}
478
+ for i in range(len(nodes))
479
+ ]
480
+
481
+ except ImportError:
482
+ print("⚠️ torch-geometric tidak tersedia — skip GNN")
483
+ return [{"node": n["id"], "score": 0.0} for n in nodes]
484
+ except Exception as e:
485
+ print(f"⚠️ GNN error: {e}")
486
+ return [{"node": n["id"], "score": 0.0} for n in nodes]
487
+
488
+
489
+ # =============================================================
490
+ # FUNGSI LAIN (tidak berubah, tapi disempurnakan)
491
+ # =============================================================
492
  def generate_wordcloud(texts):
493
  try:
494
  os.makedirs("static", exist_ok=True)
495
  texts = [t for t in texts if len(t.strip()) > 3]
496
  if not texts:
497
  return
 
498
  wc = WordCloud(
499
+ width=900, height=380,
500
+ background_color='#0e1117',
501
+ color_func=lambda *a, **k: '#4f9cf9',
502
  max_words=80,
503
  stopwords=STOPWORDS_ID,
504
+ ).generate(" ".join(texts))
 
505
  wc.to_file("static/wordcloud.png")
506
  except Exception as e:
507
  print("wordcloud error:", e)
508
 
509
 
 
 
 
510
  def generate_heatmap(data):
511
  try:
512
  if not data:
 
514
  labels = ["Positive", "Neutral", "Negative"]
515
  sources = sorted(set(d["source"] for d in data))
516
  matrix = np.zeros((len(sources), len(labels)))
 
517
  for d in data:
518
  i = sources.index(d["source"])
519
  j = labels.index(d["sentiment"])
520
  matrix[i][j] += 1
 
521
  if matrix.sum() == 0:
522
  return
 
523
  fig, ax = plt.subplots(figsize=(6, max(2, len(sources))))
524
+ fig.patch.set_facecolor('#0e1117')
525
+ ax.set_facecolor('#141820')
526
  im = ax.imshow(matrix, cmap='Blues', aspect='auto')
527
  ax.set_xticks(range(len(labels)))
528
+ ax.set_xticklabels(labels, color='#8892a4', fontsize=9)
529
  ax.set_yticks(range(len(sources)))
530
+ ax.set_yticklabels(sources, color='#8892a4', fontsize=9)
531
+ ax.tick_params(colors='#5a6070')
532
  plt.colorbar(im, ax=ax)
533
  plt.tight_layout()
534
  os.makedirs("static", exist_ok=True)
535
+ plt.savefig("static/heatmap.png", dpi=100, facecolor=fig.get_facecolor())
536
  plt.close(fig)
537
  except Exception as e:
538
  print("heatmap error:", e)
539
 
540
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  def get_topics(texts):
542
  try:
543
  texts = [t for t in texts if len(t) > 3]
544
  if len(texts) < 5:
545
  return [["data kurang"]]
 
546
  vec = CountVectorizer(min_df=2, stop_words=list(STOPWORDS_ID))
547
  X = vec.fit_transform(texts)
 
548
  if X.shape[1] == 0:
549
  return [["kosong"]]
550
+ n = min(3, X.shape[1])
551
+ lda = LatentDirichletAllocation(n_components=n, random_state=42)
 
552
  lda.fit(X)
 
553
  words = vec.get_feature_names_out()
554
+ return [[words[i] for i in t.argsort()[-5:]] for t in lda.components_]
 
 
 
555
  except Exception as e:
556
  print("topic error:", e)
557
  return [["error"]]
558
 
559
 
 
 
 
560
  def generate_insight(data):
561
  s = [d["sentiment"] for d in data]
562
  return (f"Positive:{s.count('Positive')} "
 
564
  f"Neutral:{s.count('Neutral')}")
565
 
566
 
 
 
 
567
  def cluster_opinions(texts):
568
  try:
569
  if len(texts) < 6:
 
572
  n = min(3, len(texts))
573
  k = KMeans(n_clusters=n, n_init=10, random_state=42).fit(X)
574
  clusters = {}
575
+ for i, lbl in enumerate(k.labels_):
576
+ clusters.setdefault(int(lbl), []).append(texts[i])
577
+ return [{"cluster": lbl, "samples": s[:3]} for lbl, s in clusters.items()]
578
  except Exception as e:
579
  print("cluster error:", e)
580
  return []
581
 
582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
  def build_network(texts):
584
  edges = {}
585
  for t in texts:
586
+ words = [w for w in set(clean_text(t).split())
587
+ if len(w) > 3 and w not in STOPWORDS_ID][:6]
588
  for a, b in combinations(words, 2):
589
  key = tuple(sorted([a, b]))
590
  edges[key] = edges.get(key, 0) + 1
 
592
  for k, v in edges.items() if v > 1]
593
 
594
 
 
 
 
595
  def detect_bot_network(texts):
596
  try:
597
  if len(texts) < 5:
598
  return {"nodes": [], "edges": [], "bots": []}
 
599
  X = TfidfVectorizer(max_features=300).fit_transform(texts)
600
  sim = cosine_similarity(X)
601
+ G = nx.Graph()
 
602
  for i in range(len(texts)):
603
  G.add_node(i, text=texts[i])
 
604
  for i in range(len(texts)):
605
  for j in range(i + 1, len(texts)):
606
  if sim[i][j] > 0.75:
607
  G.add_edge(i, j)
 
608
  central = nx.degree_centrality(G)
609
  bots = [{"node": i, "score": round(s, 2), "text": texts[i]}
610
  for i, s in central.items() if s > 0.3]
 
611
  return {
612
  "nodes": [{"id": i} for i in G.nodes()],
613
  "edges": [{"source": u, "target": v} for u, v in G.edges()],
 
618
  return {"nodes": [], "edges": [], "bots": []}
619
 
620
 
621
+ # =============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
622
  # ROUTES
623
+ # =============================================================
624
  @app.route("/")
625
  def home():
626
  return render_template("index.html")
 
644
  texts = [t for _, t in raw][:100]
645
  sources = [s for s, _ in raw][:100]
646
 
647
+ # Sentimen dengan confidence score
648
+ scored = predict_with_score(texts)
649
+ sentiments = [s["label"] for s in scored]
650
+ scores = [s["score"] for s in scored]
651
+
652
+ result_data = [
653
+ {
654
+ "text": t,
655
+ "sentiment": s,
656
+ "confidence": c,
657
+ "source": src,
658
+ "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M")
659
+ }
660
+ for t, s, c, src in zip(texts, sentiments, scores, sources)
661
  ]
662
 
663
+ # VISUAL
664
  generate_wordcloud(texts)
665
+ generate_heatmap(result_data)
666
+ generate_timeline(result_data)
667
 
668
  # ANALYSIS
669
  top_words = get_top_words(texts)
670
  topics = get_topics(texts)
671
+ insight = generate_insight(result_data)
672
  clusters = cluster_opinions(texts)
673
+ trend = predict_trend(result_data) # dict sekarang
674
+ hoax = detect_hoax(texts) # ML-based
675
  network = build_network(texts)
676
  bot_network = detect_bot_network(texts)
 
677
 
678
+ # GNN deterministik
679
+ gnn = run_gnn_safe(bot_network["nodes"], bot_network["edges"], texts)
680
+
681
+ # ADVANCED optional
682
  bot_bert = detect_bot_bert(texts)
683
  fake_news = detect_fake_news(texts)
 
684
 
685
+ # SAVE CSV dengan kolom lebih lengkap
686
  os.makedirs("static", exist_ok=True)
687
+ pd.DataFrame(result_data).to_csv("static/result.csv", index=False)
688
 
689
  return jsonify({
690
+ "data": result_data,
691
  "top_words": top_words,
692
  "topics": topics,
693
  "insight": insight,
 
698
  "trend": trend,
699
  "bot_bert": bot_bert,
700
  "fake_news": fake_news,
701
+ "gnn": gnn,
702
  })
703
 
704
  except Exception as e:
 
719
  return send_file(f"static/{filename}")
720
 
721
 
 
 
 
722
  if __name__ == "__main__":
723
  app.run(host="0.0.0.0", port=7860, debug=False)