noranisa committed on
Commit
7051fa3
·
verified ·
1 Parent(s): 45e75e6

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +191 -136
main.py CHANGED
@@ -3,7 +3,7 @@ from services.aggregator import collect_data
3
  from services.sentiment import predict
4
 
5
  # =========================
6
- # IMPORT
7
  # =========================
8
  from collections import Counter
9
  import pandas as pd
@@ -11,55 +11,71 @@ import os
11
  import re
12
  import numpy as np
13
 
 
14
  from wordcloud import WordCloud
 
 
15
  import matplotlib.pyplot as plt
16
 
 
17
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
18
  from sklearn.decomposition import LatentDirichletAllocation
19
  from sklearn.cluster import KMeans
20
  from sklearn.metrics.pairwise import cosine_similarity
21
  from sklearn.linear_model import LinearRegression
22
 
 
23
  import networkx as nx
24
  from itertools import combinations
25
 
26
- # OPTIONAL (SAFE IMPORT)
27
  try:
28
  from services.bot_bert import detect_bot_bert
29
- except:
30
  def detect_bot_bert(x): return []
31
 
32
  try:
33
  from services.fake_news import detect_fake_news
34
- except:
35
  def detect_fake_news(x): return []
36
 
37
  try:
38
  from services.gnn import run_gnn
39
- except:
40
- def run_gnn(n,e): return []
41
 
42
- # =========================
43
- # INIT
44
- # =========================
45
  app = Flask(__name__)
46
 
47
-
48
  # =========================
49
- # CLEAN TEXT
50
  # =========================
51
  def clean_text(t):
52
- return re.sub(r'[^a-zA-Z\s]', '', str(t).lower())
 
 
 
 
53
 
54
 
55
  # =========================
56
  # TOP WORDS
57
  # =========================
 
 
 
 
 
 
 
 
 
58
  def get_top_words(texts):
59
  words = []
60
  for t in texts:
61
- words.extend(clean_text(t).split())
62
- return [{"word": w, "count": c} for w, c in Counter(words).most_common(10)]
 
 
63
 
64
 
65
  # =========================
@@ -69,15 +85,19 @@ def generate_wordcloud(texts):
69
  try:
70
  os.makedirs("static", exist_ok=True)
71
  texts = [t for t in texts if len(t.strip()) > 3]
72
-
73
  if not texts:
74
  return
75
-
76
- wc = WordCloud(width=800, height=400).generate(" ".join(texts))
 
 
 
 
 
 
77
  wc.to_file("static/wordcloud.png")
78
-
79
  except Exception as e:
80
- print("wordcloud error:", e)
81
 
82
 
83
  # =========================
@@ -87,11 +107,9 @@ def generate_heatmap(data):
87
  try:
88
  if not data:
89
  return
90
-
91
- labels = ["Positive", "Neutral", "Negative"]
92
- sources = list(set([d["source"] for d in data]))
93
-
94
- matrix = np.zeros((len(sources), len(labels)))
95
 
96
  for d in data:
97
  i = sources.index(d["source"])
@@ -101,17 +119,19 @@ def generate_heatmap(data):
101
  if matrix.sum() == 0:
102
  return
103
 
104
- plt.figure()
105
- plt.imshow(matrix)
106
- plt.xticks(range(len(labels)), labels)
107
- plt.yticks(range(len(sources)), sources)
108
- plt.colorbar()
109
-
110
- plt.savefig("static/heatmap.png")
111
- plt.close()
112
-
 
 
113
  except Exception as e:
114
- print("heatmap error:", e)
115
 
116
 
117
  # =========================
@@ -121,25 +141,28 @@ def generate_timeline(data):
121
  try:
122
  if not data:
123
  return
 
124
 
125
- pos, neg, neu = [], [], []
126
-
127
- for d in data:
128
- pos.append(1 if d["sentiment"] == "Positive" else 0)
129
- neg.append(1 if d["sentiment"] == "Negative" else 0)
130
- neu.append(1 if d["sentiment"] == "Neutral" else 0)
131
-
132
- plt.figure()
133
- plt.plot(pos, label="Positive")
134
- plt.plot(neg, label="Negative")
135
- plt.plot(neu, label="Neutral")
136
- plt.legend()
137
-
138
- plt.savefig("static/timeline.png")
139
- plt.close()
140
-
 
 
141
  except Exception as e:
142
- print("timeline error:", e)
143
 
144
 
145
  # =========================
@@ -147,65 +170,74 @@ def generate_timeline(data):
147
  # =========================
148
  def get_topics(texts):
149
  try:
150
- texts = [t for t in texts if len(t.strip()) > 3]
151
-
152
  if len(texts) < 5:
153
  return [["data kurang"]]
154
 
155
- vec = CountVectorizer(min_df=2)
156
- X = vec.fit_transform(texts)
157
 
158
  if X.shape[1] == 0:
159
- return [["tidak ada kata"]]
160
 
161
- lda = LatentDirichletAllocation(n_components=3)
 
162
  lda.fit(X)
163
 
164
- words = vec.get_feature_names_out()
165
-
166
  topics = []
167
  for t in lda.components_:
168
  topics.append([words[i] for i in t.argsort()[-5:]])
169
-
170
  return topics
171
-
172
  except Exception as e:
173
- print("topic error:", e)
174
  return [["error"]]
175
 
176
 
 
 
 
 
 
 
 
 
 
 
177
  # =========================
178
  # CLUSTER
179
  # =========================
180
  def cluster_opinions(texts):
181
  try:
182
- if len(texts) < 5:
183
  return []
184
-
185
- X = TfidfVectorizer(max_features=300).fit_transform(texts)
186
- model = KMeans(n_clusters=3, n_init=10)
187
- labels = model.fit_predict(X)
188
-
189
  clusters = {}
190
- for i, l in enumerate(labels):
191
- clusters.setdefault(l, []).append(texts[i])
192
-
193
- return [{"cluster": k, "samples": v[:3]} for k, v in clusters.items()]
194
-
195
  except Exception as e:
196
- print("cluster error:", e)
197
  return []
198
 
199
 
200
  # =========================
201
- # HOAX
202
  # =========================
 
 
 
 
 
203
  def detect_hoax(texts):
204
- kw = ["hoax", "bohong", "fitnah", "propaganda"]
205
- return [
206
- {"text": t, "label": "Hoax" if any(k in t.lower() for k in kw) else "Normal"}
207
- for t in texts[:10]
208
- ]
 
209
 
210
 
211
  # =========================
@@ -213,14 +245,13 @@ def detect_hoax(texts):
213
  # =========================
214
  def build_network(texts):
215
  edges = {}
216
-
217
  for t in texts:
218
- words = list(set(t.split()))[:5]
219
  for a, b in combinations(words, 2):
220
  key = tuple(sorted([a, b]))
221
  edges[key] = edges.get(key, 0) + 1
222
-
223
- return [{"source": k[0], "target": k[1], "weight": v} for k, v in edges.items() if v > 1]
224
 
225
 
226
  # =========================
@@ -231,11 +262,10 @@ def detect_bot_network(texts):
231
  if len(texts) < 5:
232
  return {"nodes": [], "edges": [], "bots": []}
233
 
234
- X = TfidfVectorizer(max_features=300).fit_transform(texts)
235
  sim = cosine_similarity(X)
236
 
237
  G = nx.Graph()
238
-
239
  for i in range(len(texts)):
240
  G.add_node(i, text=texts[i])
241
 
@@ -245,19 +275,16 @@ def detect_bot_network(texts):
245
  G.add_edge(i, j)
246
 
247
  central = nx.degree_centrality(G)
 
 
248
 
249
- bots = [
250
- {"node": i, "score": round(s, 2), "text": texts[i]}
251
- for i, s in central.items() if s > 0.3
252
- ]
253
-
254
- nodes = [{"id": i} for i in G.nodes()]
255
- edges = [{"source": u, "target": v} for u, v in G.edges()]
256
-
257
- return {"nodes": nodes, "edges": edges, "bots": bots[:10]}
258
-
259
  except Exception as e:
260
- print(" bot network error:", e)
261
  return {"nodes": [], "edges": [], "bots": []}
262
 
263
 
@@ -266,22 +293,21 @@ def detect_bot_network(texts):
266
  # =========================
267
  def predict_trend(data):
268
  try:
269
- y = [
270
- 1 if d["sentiment"] == "Positive"
271
- else -1 if d["sentiment"] == "Negative"
272
- else 0 for d in data
273
- ]
274
-
275
  if len(y) < 5:
276
- return "Data kurang"
277
-
278
- X = np.arange(len(y)).reshape(-1, 1)
279
- model = LinearRegression().fit(X, y)
280
-
281
- return "📈 Positif" if model.coef_[0] > 0 else "📉 Negatif"
282
-
 
 
283
  except Exception as e:
284
- print("trend error:", e)
285
  return "Error"
286
 
287
 
@@ -293,16 +319,23 @@ def home():
293
  return render_template("index.html")
294
 
295
 
 
 
 
 
 
296
  @app.route("/analyze", methods=["POST"])
297
  def analyze():
298
  try:
299
- keyword = request.json.get("keyword")
300
- source = request.json.get("source", "all")
301
 
302
- raw = collect_data(keyword, source)
 
303
 
304
- texts = [t for s, t in raw][:100]
305
- sources = [s for s, t in raw][:100]
 
306
 
307
  sentiments = predict(texts)
308
 
@@ -311,43 +344,65 @@ def analyze():
311
  for t, s, src in zip(texts, sentiments, sources)
312
  ]
313
 
314
- # VISUAL
315
  generate_wordcloud(texts)
316
  generate_heatmap(result)
317
  generate_timeline(result)
318
 
319
  # ANALYSIS
320
- response = {
321
- "data": result,
322
- "top_words": get_top_words(texts),
323
- "topics": get_topics(texts),
324
- "clusters": cluster_opinions(texts),
325
- "hoax": detect_hoax(texts),
326
- "network": build_network(texts),
327
- "bot_network": detect_bot_network(texts),
328
- "trend": predict_trend(result),
329
- "bot_bert": detect_bot_bert(texts),
330
- "fake_news": detect_fake_news(texts),
331
- "gnn": [] # 🔥 DISABLE TORCH SAFE
332
- }
333
-
 
334
  os.makedirs("static", exist_ok=True)
335
  pd.DataFrame(result).to_csv("static/result.csv", index=False)
336
 
337
- return jsonify(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
  except Exception as e:
340
- print("ERROR:", e)
341
- return jsonify({"data": []})
342
 
343
 
344
  @app.route("/download")
345
  def download():
346
- return send_file("static/result.csv", as_attachment=True)
 
 
 
 
 
 
 
 
347
 
348
 
349
  # =========================
350
  # RUN
351
  # =========================
352
  if __name__ == "__main__":
353
- app.run(host="0.0.0.0", port=7860)
 
3
  from services.sentiment import predict
4
 
5
  # =========================
6
+ # IMPORT TAMBAHAN
7
  # =========================
8
  from collections import Counter
9
  import pandas as pd
 
11
  import re
12
  import numpy as np
13
 
14
+ # VISUAL
15
  from wordcloud import WordCloud
16
+ import matplotlib
17
+ matplotlib.use('Agg') # ← WAJIB: non-interactive backend untuk server
18
  import matplotlib.pyplot as plt
19
 
20
+ # ML
21
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
22
  from sklearn.decomposition import LatentDirichletAllocation
23
  from sklearn.cluster import KMeans
24
  from sklearn.metrics.pairwise import cosine_similarity
25
  from sklearn.linear_model import LinearRegression
26
 
27
+ # GRAPH
28
  import networkx as nx
29
  from itertools import combinations
30
 
31
+ # OPTIONAL ADVANCED
32
  try:
33
  from services.bot_bert import detect_bot_bert
34
+ except Exception:
35
  def detect_bot_bert(x): return []
36
 
37
  try:
38
  from services.fake_news import detect_fake_news
39
+ except Exception:
40
  def detect_fake_news(x): return []
41
 
42
  try:
43
  from services.gnn import run_gnn
44
+ except Exception:
45
+ def run_gnn(n, e): return []
46
 
 
 
 
47
  app = Flask(__name__)
48
 
 
49
  # =========================
50
+ # UTIL
51
  # =========================
def clean_text(t):
    """Normalise raw text for word-level analysis.

    The text is lower-cased, URLs are removed, every character that is not
    an ASCII letter, digit or whitespace is replaced by a space, and runs of
    whitespace are collapsed to a single space.

    Args:
        t: Text to clean. Non-string values are converted with ``str()``;
            ``None`` is treated as empty text.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if t is None:
        return ""
    # Coerce first: collected items are not guaranteed to be str, and calling
    # .lower() directly on a non-string raises AttributeError.
    t = str(t).lower()
    t = re.sub(r'http\S+', '', t)
    t = re.sub(r'[^a-zA-Z0-9\s]', ' ', t)
    return re.sub(r'\s+', ' ', t).strip()
58
 
59
 
60
  # =========================
61
  # TOP WORDS
62
  # =========================
# Words ignored by the word-frequency, word-cloud, topic, cluster and
# co-occurrence steps. Kept as a plain set so membership tests are O(1).
STOPWORDS_ID = {
    # Indonesian function words
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'dengan', 'untuk',
    'adalah', 'ada', 'pada', 'juga', 'tidak', 'bisa', 'sudah', 'saya',
    'kamu', 'kami', 'mereka', 'kita', 'jadi', 'buat', 'kalau', 'tapi',
    'sangat', 'lebih', 'lagi', 'terus', 'sama', 'atau', 'karena', 'mau',
    # Informal spellings and particles
    'nya', 'pun', 'aja', 'gak', 'ga', 'ya', 'yg', 'dgn', 'yah', 'dah',
    'udah', 'tp', 'banget', 'nih', 'sih', 'dong', 'lah',
    # Common English function words
    'so', 'the', 'is', 'in', 'of', 'to', 'a', 'an', 'and', 'it', 'for',
    'that', 'this',
}
71
+
def get_top_words(texts):
    """Return the most frequent words across ``texts``.

    Every text is passed through ``clean_text`` and split on whitespace.
    Tokens of one or two characters and tokens listed in ``STOPWORDS_ID``
    are ignored.

    Args:
        texts: Iterable of raw text items.

    Returns:
        A list of at most 15 ``{"word": ..., "count": ...}`` dicts, ordered
        from most to least frequent.
    """
    tally = Counter(
        token
        for text in texts
        for token in clean_text(text).split()
        if len(token) > 2 and token not in STOPWORDS_ID
    )
    return [
        {"word": word, "count": count}
        for word, count in tally.most_common(15)
    ]
79
 
80
 
81
  # =========================
 
85
  try:
86
  os.makedirs("static", exist_ok=True)
87
  texts = [t for t in texts if len(t.strip()) > 3]
 
88
  if not texts:
89
  return
90
+ combined = " ".join(texts)
91
+ wc = WordCloud(
92
+ width=900, height=400,
93
+ background_color='white',
94
+ max_words=80,
95
+ stopwords=STOPWORDS_ID,
96
+ colormap='Blues'
97
+ ).generate(combined)
98
  wc.to_file("static/wordcloud.png")
 
99
  except Exception as e:
100
+ print("wordcloud error:", e)
101
 
102
 
103
  # =========================
 
107
  try:
108
  if not data:
109
  return
110
+ labels = ["Positive", "Neutral", "Negative"]
111
+ sources = sorted(set(d["source"] for d in data))
112
+ matrix = np.zeros((len(sources), len(labels)))
 
 
113
 
114
  for d in data:
115
  i = sources.index(d["source"])
 
119
  if matrix.sum() == 0:
120
  return
121
 
122
+ fig, ax = plt.subplots(figsize=(6, max(2, len(sources))))
123
+ im = ax.imshow(matrix, cmap='Blues', aspect='auto')
124
+ ax.set_xticks(range(len(labels)))
125
+ ax.set_xticklabels(labels)
126
+ ax.set_yticks(range(len(sources)))
127
+ ax.set_yticklabels(sources)
128
+ plt.colorbar(im, ax=ax)
129
+ plt.tight_layout()
130
+ os.makedirs("static", exist_ok=True)
131
+ plt.savefig("static/heatmap.png", dpi=100)
132
+ plt.close(fig)
133
  except Exception as e:
134
+ print("heatmap error:", e)
135
 
136
 
137
  # =========================
 
141
  try:
142
  if not data:
143
  return
144
+ os.makedirs("static", exist_ok=True)
145
 
146
+ pos = [1 if d["sentiment"] == "Positive" else 0 for d in data]
147
+ neg = [1 if d["sentiment"] == "Negative" else 0 for d in data]
148
+ neu = [1 if d["sentiment"] == "Neutral" else 0 for d in data]
149
+
150
+ # rolling average
151
+ def roll(arr, n=5):
152
+ return [sum(arr[max(0,i-n):i+1]) / len(arr[max(0,i-n):i+1]) for i in range(len(arr))]
153
+
154
+ fig, ax = plt.subplots(figsize=(10, 3))
155
+ ax.plot(roll(pos), label="Positive", color="#22c55e", linewidth=1.5)
156
+ ax.plot(roll(neg), label="Negative", color="#ef4444", linewidth=1.5)
157
+ ax.plot(roll(neu), label="Neutral", color="#94a3b8", linewidth=1.0)
158
+ ax.legend()
159
+ ax.set_facecolor('#f8fafc')
160
+ fig.patch.set_facecolor('#f8fafc')
161
+ plt.tight_layout()
162
+ plt.savefig("static/timeline.png", dpi=100)
163
+ plt.close(fig)
164
  except Exception as e:
165
+ print("timeline error:", e)
166
 
167
 
168
  # =========================
 
170
  # =========================
def get_topics(texts):
    """Extract up to three LDA topics from ``texts``.

    Args:
        texts: List of text strings.

    Returns:
        A list with one entry per topic; each entry is a list of up to five
        words taken from the highest-weighted terms of that topic, in
        ascending weight order (strongest word last). Sentinel results:
        ``[["data kurang"]]`` when fewer than five usable texts remain,
        ``[["kosong"]]`` when the vocabulary is empty, and ``[["error"]]``
        when any step raises.
    """
    try:
        # Measure the stripped length so whitespace-only entries are
        # discarded instead of being counted as usable documents.
        texts = [t for t in texts if len(t.strip()) > 3]
        if len(texts) < 5:
            return [["data kurang"]]

        vec = CountVectorizer(min_df=2, stop_words=list(STOPWORDS_ID))
        X = vec.fit_transform(texts)
        if X.shape[1] == 0:
            return [["kosong"]]

        # Never ask for more topics than there are vocabulary terms.
        n_topics = min(3, X.shape[1])
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
        lda.fit(X)

        words = vec.get_feature_names_out()
        return [
            [words[i] for i in weights.argsort()[-5:]]
            for weights in lda.components_
        ]
    except Exception as e:
        # Best-effort analysis step: report and fall back to a sentinel so
        # the caller's response can still be built.
        print("topic error:", e)
        return [["error"]]
195
 
196
 
197
+ # =========================
198
+ # INSIGHT
199
+ # =========================
def generate_insight(data):
    """Summarise how many items carry each sentiment label.

    Args:
        data: Sequence of dicts that each have a ``"sentiment"`` key.

    Returns:
        A string of the form ``"Positive:<n> Negative:<n> Neutral:<n>"``.
    """
    labels = [row["sentiment"] for row in data]
    parts = [
        f"{name}:{labels.count(name)}"
        for name in ("Positive", "Negative", "Neutral")
    ]
    return " ".join(parts)
205
+
206
+
207
  # =========================
208
  # CLUSTER
209
  # =========================
def cluster_opinions(texts):
    """Group ``texts`` into K-Means clusters over their TF-IDF vectors.

    Args:
        texts: List of text strings.

    Returns:
        A list of ``{"cluster": <int label>, "samples": [...]}`` dicts, each
        holding at most three texts from that cluster. An empty list is
        returned when fewer than six texts are given or when any step raises.
    """
    try:
        if len(texts) < 6:
            return []

        vectorizer = TfidfVectorizer(max_features=300, stop_words=list(STOPWORDS_ID))
        features = vectorizer.fit_transform(texts)
        model = KMeans(n_clusters=min(3, len(texts)), n_init=10, random_state=42)
        model.fit(features)

        grouped = {}
        for text, label in zip(texts, model.labels_):
            # int() turns the numpy label into a plain Python int.
            grouped.setdefault(int(label), []).append(text)
        return [
            {"cluster": lbl, "samples": members[:3]}
            for lbl, members in grouped.items()
        ]
    except Exception as e:
        print("cluster error:", e)
        return []
224
 
225
 
226
  # =========================
227
+ # HOAX (keyword-based)
228
  # =========================
# Substrings whose presence (case-insensitive) marks a text as "Hoax".
HOAX_KW = [
    "hoax", "bohong", "fitnah", "propaganda", "palsu", "fake",
    "disinformasi", "menyesatkan", "kebohongan", "manipulasi",
    "adu domba", "provokasi",
]


def detect_hoax(texts):
    """Label the first 15 texts as ``"Hoax"`` or ``"Normal"`` by keyword.

    A text is labelled ``"Hoax"`` when its lower-cased form contains any
    entry of ``HOAX_KW`` as a substring.

    Args:
        texts: List of text strings; only the first 15 are examined.

    Returns:
        A list of ``{"text": <original text>, "label": <label>}`` dicts.
    """
    def classify(text):
        lowered = text.lower()
        if any(keyword in lowered for keyword in HOAX_KW):
            return "Hoax"
        return "Normal"

    return [{"text": text, "label": classify(text)} for text in texts[:15]]
241
 
242
 
243
  # =========================
 
245
  # =========================
def build_network(texts):
    """Build a word co-occurrence edge list from ``texts``.

    For each text, the first six distinct cleaned words that are longer than
    three characters and not in ``STOPWORDS_ID`` are paired with each other;
    every pair counts once per text.

    Args:
        texts: Iterable of raw text items.

    Returns:
        A list of ``{"source": ..., "target": ..., "weight": ...}`` dicts for
        every word pair that co-occurs in more than one text. ``source`` and
        ``target`` are in alphabetical order.
    """
    edges = {}
    for t in texts:
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # Iterating a set of strings yields a different order from one
        # interpreter run to the next (hash randomisation), which made the
        # six-word cut, and therefore the whole network, non-reproducible.
        unique = dict.fromkeys(
            w for w in clean_text(t).split()
            if len(w) > 3 and w not in STOPWORDS_ID
        )
        words = list(unique)[:6]
        for a, b in combinations(words, 2):
            key = tuple(sorted((a, b)))
            edges[key] = edges.get(key, 0) + 1
    return [
        {"source": k[0], "target": k[1], "weight": v}
        for k, v in edges.items()
        if v > 1
    ]
255
 
256
 
257
  # =========================
 
262
  if len(texts) < 5:
263
  return {"nodes": [], "edges": [], "bots": []}
264
 
265
+ X = TfidfVectorizer(max_features=300).fit_transform(texts)
266
  sim = cosine_similarity(X)
267
 
268
  G = nx.Graph()
 
269
  for i in range(len(texts)):
270
  G.add_node(i, text=texts[i])
271
 
 
275
  G.add_edge(i, j)
276
 
277
  central = nx.degree_centrality(G)
278
+ bots = [{"node": i, "score": round(s, 2), "text": texts[i]}
279
+ for i, s in central.items() if s > 0.3]
280
 
281
+ return {
282
+ "nodes": [{"id": i} for i in G.nodes()],
283
+ "edges": [{"source": u, "target": v} for u, v in G.edges()],
284
+ "bots": bots[:10]
285
+ }
 
 
 
 
 
286
  except Exception as e:
287
+ print("bot_network error:", e)
288
  return {"nodes": [], "edges": [], "bots": []}
289
 
290
 
 
293
  # =========================
def predict_trend(data, min_change=0.2):
    """Classify the direction of sentiment across the sequence of results.

    Each item is scored +1 (``"Positive"``), -1 (``"Negative"``) or 0 (any
    other label). A least-squares line is fitted to the scores against item
    position, and the fitted change from the first to the last item decides
    the label.

    Args:
        data: Sequence of dicts that each have a ``"sentiment"`` key.
        min_change: Smallest absolute fitted change over the whole sequence
            (on the -1..1 score scale) that counts as a trend. The default
            of 0.2 equals the former per-step cutoff of 0.05 at the minimum
            sample size of five.

    Returns:
        ``"Kurang Data"`` for fewer than five items, otherwise
        ``"Naik Positif"``, ``"Naik Negatif"`` or ``"Stabil"``; ``"Error"``
        when any step raises.
    """
    try:
        y = [
            1 if d["sentiment"] == "Positive"
            else -1 if d["sentiment"] == "Negative"
            else 0
            for d in data
        ]
        if len(y) < 5:
            return "Kurang Data"

        slope = np.polyfit(np.arange(len(y)), y, 1)[0]
        # For scores in [-1, 1] the per-step slope cannot exceed about 3/n,
        # so a fixed per-step cutoff of 0.05 could never be reached once
        # roughly 60 or more items were analysed. Judge the change over the
        # whole window instead, which is independent of the sample count.
        change = slope * (len(y) - 1)
        if change > min_change:
            return "Naik Positif"
        if change < -min_change:
            return "Naik Negatif"
        return "Stabil"
    except Exception as e:
        print("trend error:", e)
        return "Error"
312
 
313
 
 
319
  return render_template("index.html")
320
 
321
 
@app.route("/result")
def result():
    """Render the ``result.html`` template."""
    page = render_template("result.html")
    return page
325
+
326
+
327
  @app.route("/analyze", methods=["POST"])
328
  def analyze():
329
  try:
330
+ keyword = request.json.get("keyword", "").strip()
331
+ source = request.json.get("source", "all")
332
 
333
+ if not keyword:
334
+ return jsonify({"error": "keyword kosong", "data": []}), 400
335
 
336
+ raw = collect_data(keyword, source)
337
+ texts = [t for _, t in raw][:100]
338
+ sources = [s for s, _ in raw][:100]
339
 
340
  sentiments = predict(texts)
341
 
 
344
  for t, s, src in zip(texts, sentiments, sources)
345
  ]
346
 
347
+ # VISUAL — non-blocking
348
  generate_wordcloud(texts)
349
  generate_heatmap(result)
350
  generate_timeline(result)
351
 
352
  # ANALYSIS
353
+ top_words = get_top_words(texts)
354
+ topics = get_topics(texts)
355
+ insight = generate_insight(result)
356
+ clusters = cluster_opinions(texts)
357
+ hoax = detect_hoax(texts)
358
+ network = build_network(texts)
359
+ bot_network = detect_bot_network(texts)
360
+ trend = predict_trend(result)
361
+
362
+ # ADVANCED (optional)
363
+ bot_bert = detect_bot_bert(texts)
364
+ fake_news = detect_fake_news(texts)
365
+ gnn = run_gnn(bot_network["nodes"], bot_network["edges"])
366
+
367
+ # SAVE CSV
368
  os.makedirs("static", exist_ok=True)
369
  pd.DataFrame(result).to_csv("static/result.csv", index=False)
370
 
371
+ return jsonify({
372
+ "data": result,
373
+ "top_words": top_words,
374
+ "topics": topics,
375
+ "insight": insight,
376
+ "clusters": clusters,
377
+ "hoax": hoax,
378
+ "network": network,
379
+ "bot_network": bot_network,
380
+ "trend": trend,
381
+ "bot_bert": bot_bert,
382
+ "fake_news": fake_news,
383
+ "gnn": gnn
384
+ })
385
 
386
  except Exception as e:
387
+ print("ERROR /analyze:", e)
388
+ return jsonify({"error": str(e), "data": []}), 500
389
 
390
 
@app.route("/download")
def download():
    """Send ``static/result.csv`` as a file attachment.

    Returns:
        The CSV as a download when the file exists; otherwise a JSON error
        body with HTTP status 404.
    """
    csv_path = "static/result.csv"
    if os.path.exists(csv_path):
        return send_file(csv_path, as_attachment=True)
    return jsonify({"error": "Belum ada hasil analisis"}), 404
397
+
398
+
@app.route("/static/<path:filename>")
def static_files(filename):
    """Serve a file from the ``static`` directory.

    Args:
        filename: Path of the requested file, relative to ``static``; taken
            from the URL and therefore untrusted.

    Returns:
        The file response produced by Flask.
    """
    # Imported here because the module's flask import line is outside this
    # block.
    from flask import send_from_directory

    # The URL segment is user-controlled. send_from_directory joins it to the
    # base directory safely and refuses paths that would escape it, whereas
    # interpolating it into a send_file() path passed ".." segments through.
    # NOTE(review): Flask normally registers its own /static/<path:filename>
    # rule; confirm whether this extra route is still needed.
    return send_from_directory("static", filename)
402
 
403
 
404
  # =========================
405
  # RUN
406
  # =========================
407
  if __name__ == "__main__":
408
+ app.run(host="0.0.0.0", port=7860, debug=False)