noranisa committed on
Commit
45e75e6
Β·
verified Β·
1 Parent(s): 809e115

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +165 -136
main.py CHANGED
@@ -3,7 +3,7 @@ from services.aggregator import collect_data
3
  from services.sentiment import predict
4
 
5
  # =========================
6
- # IMPORT TAMBAHAN
7
  # =========================
8
  from collections import Counter
9
  import pandas as pd
@@ -11,22 +11,19 @@ import os
11
  import re
12
  import numpy as np
13
 
14
- # VISUAL
15
  from wordcloud import WordCloud
16
  import matplotlib.pyplot as plt
17
 
18
- # ML
19
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
20
  from sklearn.decomposition import LatentDirichletAllocation
21
  from sklearn.cluster import KMeans
22
  from sklearn.metrics.pairwise import cosine_similarity
23
  from sklearn.linear_model import LinearRegression
24
 
25
- # GRAPH
26
  import networkx as nx
27
  from itertools import combinations
28
 
29
- # OPTIONAL ADVANCED
30
  try:
31
  from services.bot_bert import detect_bot_bert
32
  except:
@@ -42,46 +39,56 @@ try:
42
  except:
43
  def run_gnn(n,e): return []
44
 
 
 
 
45
  app = Flask(__name__)
46
 
 
47
  # =========================
48
- # πŸ”₯ UTIL
49
  # =========================
50
  def clean_text(t):
51
- return re.sub(r'[^a-zA-Z\s]', '', t.lower())
52
 
53
 
54
  # =========================
55
- # πŸ”₯ TOP WORDS
56
  # =========================
57
  def get_top_words(texts):
58
  words = []
59
  for t in texts:
60
  words.extend(clean_text(t).split())
61
- return [{"word":w,"count":c} for w,c in Counter(words).most_common(10)]
62
 
63
 
64
  # =========================
65
- # πŸ”₯ WORDCLOUD
66
  # =========================
67
  def generate_wordcloud(texts):
68
  try:
69
  os.makedirs("static", exist_ok=True)
70
- texts = [t for t in texts if len(t.strip())>3]
71
- if not texts: return
72
- wc = WordCloud(width=800,height=400).generate(" ".join(texts))
 
 
 
73
  wc.to_file("static/wordcloud.png")
 
74
  except Exception as e:
75
- print("wordcloud error:",e)
76
 
77
 
78
  # =========================
79
- # πŸ”₯ HEATMAP
80
  # =========================
81
  def generate_heatmap(data):
82
  try:
83
- if not data: return
84
- labels = ["Positive","Neutral","Negative"]
 
 
85
  sources = list(set([d["source"] for d in data]))
86
 
87
  matrix = np.zeros((len(sources), len(labels)))
@@ -89,164 +96,197 @@ def generate_heatmap(data):
89
  for d in data:
90
  i = sources.index(d["source"])
91
  j = labels.index(d["sentiment"])
92
- matrix[i][j]+=1
93
 
94
- if matrix.sum()==0: return
 
95
 
96
  plt.figure()
97
  plt.imshow(matrix)
98
- plt.xticks(range(len(labels)),labels)
99
- plt.yticks(range(len(sources)),sources)
100
  plt.colorbar()
101
- os.makedirs("static",exist_ok=True)
102
  plt.savefig("static/heatmap.png")
103
  plt.close()
 
104
  except Exception as e:
105
- print("heatmap error:",e)
106
 
107
 
108
  # =========================
109
- # πŸ”₯ TIMELINE
110
  # =========================
111
  def generate_timeline(data):
112
  try:
113
- if not data: return
114
- os.makedirs("static", exist_ok=True)
 
 
115
 
116
- pos,neg,neu=[],[],[]
117
  for d in data:
118
- pos.append(1 if d["sentiment"]=="Positive" else 0)
119
- neg.append(1 if d["sentiment"]=="Negative" else 0)
120
- neu.append(1 if d["sentiment"]=="Neutral" else 0)
121
 
122
  plt.figure()
123
- plt.plot(pos,label="Positive")
124
- plt.plot(neg,label="Negative")
125
- plt.plot(neu,label="Neutral")
126
  plt.legend()
 
127
  plt.savefig("static/timeline.png")
128
  plt.close()
 
129
  except Exception as e:
130
- print("timeline error:",e)
131
 
132
 
133
  # =========================
134
- # πŸ”₯ TOPIC MODELING
135
  # =========================
136
  def get_topics(texts):
137
  try:
138
- texts = [t for t in texts if len(t)>3]
139
- if len(texts)<5: return [["data kurang"]]
 
 
140
 
141
  vec = CountVectorizer(min_df=2)
142
  X = vec.fit_transform(texts)
143
 
144
- if X.shape[1]==0: return [["kosong"]]
 
145
 
146
  lda = LatentDirichletAllocation(n_components=3)
147
  lda.fit(X)
148
 
149
  words = vec.get_feature_names_out()
150
- topics=[]
 
151
  for t in lda.components_:
152
  topics.append([words[i] for i in t.argsort()[-5:]])
153
- return topics
154
- except:
155
- return [["error"]]
156
 
 
157
 
158
- # =========================
159
- # πŸ”₯ INSIGHT
160
- # =========================
161
- def generate_insight(data):
162
- s=[d["sentiment"] for d in data]
163
- return f"Positive:{s.count('Positive')} Negative:{s.count('Negative')} Neutral:{s.count('Neutral')}"
164
 
165
 
166
  # =========================
167
- # πŸ”₯ CLUSTER
168
  # =========================
169
  def cluster_opinions(texts):
170
  try:
171
- if len(texts)<5: return []
172
- X=TfidfVectorizer(max_features=300).fit_transform(texts)
173
- k=KMeans(n_clusters=3,n_init=10).fit(X)
174
- clusters={}
175
- for i,l in enumerate(k.labels_):
176
- clusters.setdefault(l,[]).append(texts[i])
177
- return [{"cluster":k,"samples":v[:3]} for k,v in clusters.items()]
178
- except:
 
 
 
 
 
 
 
179
  return []
180
 
181
 
182
  # =========================
183
- # πŸ”₯ HOAX
184
  # =========================
185
  def detect_hoax(texts):
186
- kw=["hoax","bohong","fitnah","propaganda"]
187
- return [{"text":t,"label":"Hoax" if any(k in t.lower() for k in kw) else "Normal"} for t in texts[:10]]
 
 
 
188
 
189
 
190
  # =========================
191
- # πŸ”₯ NETWORK
192
  # =========================
193
  def build_network(texts):
194
- edges={}
 
195
  for t in texts:
196
- w=list(set(t.split()))[:5]
197
- for a,b in combinations(w,2):
198
- key=tuple(sorted([a,b]))
199
- edges[key]=edges.get(key,0)+1
200
- return [{"source":k[0],"target":k[1],"weight":v} for k,v in edges.items() if v>1]
 
201
 
202
 
203
  # =========================
204
- # πŸ”₯ BOT NETWORK
205
  # =========================
206
  def detect_bot_network(texts):
207
  try:
208
- if len(texts)<5: return {"nodes":[],"edges":[],"bots":[]}
 
 
 
 
209
 
210
- X=TfidfVectorizer(max_features=300).fit_transform(texts)
211
- sim=cosine_similarity(X)
212
 
213
- G=nx.Graph()
214
  for i in range(len(texts)):
215
- G.add_node(i,text=texts[i])
216
 
217
  for i in range(len(texts)):
218
- for j in range(i+1,len(texts)):
219
- if sim[i][j]>0.75:
220
- G.add_edge(i,j)
221
 
222
- central=nx.degree_centrality(G)
223
 
224
- bots=[{"node":i,"score":round(s,2),"text":texts[i]} for i,s in central.items() if s>0.3]
 
 
 
225
 
226
- nodes=[{"id":i} for i in G.nodes()]
227
- edges=[{"source":u,"target":v} for u,v in G.edges()]
228
 
229
- return {"nodes":nodes,"edges":edges,"bots":bots[:10]}
230
- except:
231
- return {"nodes":[],"edges":[],"bots":[]}
 
 
232
 
233
 
234
  # =========================
235
- # πŸ”₯ TREND
236
  # =========================
237
  def predict_trend(data):
238
  try:
239
- y=[1 if d["sentiment"]=="Positive" else -1 if d["sentiment"]=="Negative" else 0 for d in data]
240
- if len(y)<5: return "kurang data"
241
- X=np.arange(len(y)).reshape(-1,1)
242
- model=LinearRegression().fit(X,y)
243
- return "Naik Positif" if model.coef_[0]>0 else "Naik Negatif"
244
- except:
245
- return "error"
 
 
 
 
 
 
 
 
 
 
246
 
247
 
248
  # =========================
249
- # πŸ”₯ ROUTES
250
  # =========================
251
  @app.route("/")
252
  def home():
@@ -256,17 +296,20 @@ def home():
256
  @app.route("/analyze", methods=["POST"])
257
  def analyze():
258
  try:
259
- keyword=request.json.get("keyword")
260
- source=request.json.get("source","all")
261
 
262
- raw=collect_data(keyword,source)
263
 
264
- texts=[t for s,t in raw][:100]
265
- sources=[s for s,t in raw][:100]
266
 
267
- sentiments=predict(texts)
268
 
269
- result=[{"text":t,"sentiment":s,"source":src} for t,s,src in zip(texts,sentiments,sources)]
 
 
 
270
 
271
  # VISUAL
272
  generate_wordcloud(texts)
@@ -274,51 +317,37 @@ def analyze():
274
  generate_timeline(result)
275
 
276
  # ANALYSIS
277
- top_words=get_top_words(texts)
278
- topics=get_topics(texts)
279
- insight=generate_insight(result)
280
- clusters=cluster_opinions(texts)
281
- hoax=detect_hoax(texts)
282
- network=build_network(texts)
283
- bot_network=detect_bot_network(texts)
284
- trend=predict_trend(result)
285
-
286
- # ADVANCED
287
- bot_bert=detect_bot_bert(texts)
288
- fake_news=detect_fake_news(texts)
289
- gnn=run_gnn(bot_network["nodes"], bot_network["edges"])
290
-
291
- # SAVE CSV
292
- os.makedirs("static",exist_ok=True)
293
- pd.DataFrame(result).to_csv("static/result.csv",index=False)
294
-
295
- return jsonify({
296
- "data":result,
297
- "top_words":top_words,
298
- "topics":topics,
299
- "insight":insight,
300
- "clusters":clusters,
301
- "hoax":hoax,
302
- "network":network,
303
- "bot_network":bot_network,
304
- "trend":trend,
305
- "bot_bert":bot_bert,
306
- "fake_news":fake_news,
307
- "gnn":gnn
308
- })
309
 
310
  except Exception as e:
311
- print("ERROR:",e)
312
- return jsonify({"data":[]})
313
 
314
 
315
  @app.route("/download")
316
  def download():
317
- return send_file("static/result.csv",as_attachment=True)
318
 
319
 
320
  # =========================
321
  # RUN
322
  # =========================
323
- if __name__=="__main__":
324
- app.run(host="0.0.0.0",port=7860)
 
3
  from services.sentiment import predict
4
 
5
  # =========================
6
+ # IMPORT
7
  # =========================
8
  from collections import Counter
9
  import pandas as pd
 
11
  import re
12
  import numpy as np
13
 
 
14
  from wordcloud import WordCloud
15
  import matplotlib.pyplot as plt
16
 
 
17
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
18
  from sklearn.decomposition import LatentDirichletAllocation
19
  from sklearn.cluster import KMeans
20
  from sklearn.metrics.pairwise import cosine_similarity
21
  from sklearn.linear_model import LinearRegression
22
 
 
23
  import networkx as nx
24
  from itertools import combinations
25
 
26
+ # OPTIONAL (SAFE IMPORT)
27
  try:
28
  from services.bot_bert import detect_bot_bert
29
  except:
 
39
  except:
40
  def run_gnn(n,e): return []
41
 
42
+ # =========================
43
+ # INIT
44
+ # =========================
45
  app = Flask(__name__)
46
 
47
+
48
  # =========================
49
+ # CLEAN TEXT
50
  # =========================
51
def clean_text(t):
    """Lowercase *t* and drop every character that is not an ASCII letter or whitespace."""
    lowered = str(t).lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)
53
 
54
 
55
  # =========================
56
+ # TOP WORDS
57
  # =========================
58
def get_top_words(texts):
    """Return the 10 most frequent cleaned words as [{"word": ..., "count": ...}]."""
    freq = Counter()
    for text in texts:
        freq.update(clean_text(text).split())
    return [{"word": w, "count": c} for w, c in freq.most_common(10)]
63
 
64
 
65
  # =========================
66
+ # WORDCLOUD
67
  # =========================
68
def generate_wordcloud(texts):
    """Render a word cloud of *texts* to static/wordcloud.png.

    Best-effort: texts shorter than 4 characters are ignored, and any
    failure is logged and swallowed so the caller keeps running.
    """
    try:
        os.makedirs("static", exist_ok=True)

        usable = [t for t in texts if len(t.strip()) > 3]
        if not usable:
            return

        joined = " ".join(usable)
        cloud = WordCloud(width=800, height=400).generate(joined)
        cloud.to_file("static/wordcloud.png")

    except Exception as e:
        print("❌ wordcloud error:", e)
81
 
82
 
83
  # =========================
84
+ # HEATMAP
85
  # =========================
86
def generate_heatmap(data):
    """Render a source-by-sentiment count heatmap to static/heatmap.png.

    *data* is a list of dicts with "source" and "sentiment" keys.
    Best-effort: any failure is logged and swallowed.
    """
    try:
        if not data:
            return

        labels = ["Positive", "Neutral", "Negative"]
        sources = list(set([d["source"] for d in data]))

        matrix = np.zeros((len(sources), len(labels)))

        for d in data:
            i = sources.index(d["source"])
            j = labels.index(d["sentiment"])
            matrix[i][j] += 1

        # All-zero matrix means no sentiment matched the known labels.
        if matrix.sum() == 0:
            return

        plt.figure()
        plt.imshow(matrix)
        plt.xticks(range(len(labels)), labels)
        plt.yticks(range(len(sources)), sources)
        plt.colorbar()

        # Fix: ensure the output directory exists before saving — this
        # function must not depend on another generator creating it first.
        os.makedirs("static", exist_ok=True)
        plt.savefig("static/heatmap.png")
        plt.close()

    except Exception as e:
        print("❌ heatmap error:", e)
115
 
116
 
117
  # =========================
118
+ # TIMELINE
119
  # =========================
120
def generate_timeline(data):
    """Plot per-item Positive/Negative/Neutral indicator series to static/timeline.png.

    The x-axis is simply item order (no timestamps in *data*).
    Best-effort: any failure is logged and swallowed.
    """
    try:
        if not data:
            return

        pos, neg, neu = [], [], []

        for d in data:
            pos.append(1 if d["sentiment"] == "Positive" else 0)
            neg.append(1 if d["sentiment"] == "Negative" else 0)
            neu.append(1 if d["sentiment"] == "Neutral" else 0)

        plt.figure()
        plt.plot(pos, label="Positive")
        plt.plot(neg, label="Negative")
        plt.plot(neu, label="Neutral")
        plt.legend()

        # Fix: ensure the output directory exists before saving — this
        # function must not depend on another generator creating it first.
        os.makedirs("static", exist_ok=True)
        plt.savefig("static/timeline.png")
        plt.close()

    except Exception as e:
        print("❌ timeline error:", e)
143
 
144
 
145
  # =========================
146
+ # TOPIC MODELING
147
  # =========================
148
def get_topics(texts, n_topics=3):
    """Extract *n_topics* LDA topics from *texts*; each topic is its top-5 words.

    Returns a list of word lists. Sentinel values: [["data kurang"]] when
    fewer than 5 usable texts, [["tidak ada kata"]] when the vocabulary is
    empty, [["error"]] on any unexpected failure.
    """
    try:
        texts = [t for t in texts if len(t.strip()) > 3]

        if len(texts) < 5:
            return [["data kurang"]]

        vec = CountVectorizer(min_df=2)
        X = vec.fit_transform(texts)

        if X.shape[1] == 0:
            return [["tidak ada kata"]]

        # random_state pins LDA initialization so repeated requests on the
        # same data yield the same topics.
        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(X)

        words = vec.get_feature_names_out()

        topics = []
        for component in lda.components_:
            # argsort ascending; the last 5 indices are the heaviest words.
            topics.append([words[i] for i in component.argsort()[-5:]])

        return topics

    except Exception as e:
        print("❌ topic error:", e)
        return [["error"]]
 
 
 
175
 
176
 
177
  # =========================
178
+ # CLUSTER
179
  # =========================
180
def cluster_opinions(texts):
    """Group *texts* into 3 TF-IDF/KMeans clusters.

    Returns [{"cluster": int, "samples": [up to 3 texts]}, ...]; empty list
    when there are fewer than 5 texts or clustering fails.
    """
    try:
        if len(texts) < 5:
            return []

        X = TfidfVectorizer(max_features=300).fit_transform(texts)
        # random_state makes cluster assignment reproducible per request.
        model = KMeans(n_clusters=3, n_init=10, random_state=0)
        labels = model.fit_predict(X)

        clusters = {}
        for i, label in enumerate(labels):
            # Fix: cast numpy integer labels to int — numpy ints are not
            # JSON serializable and would break jsonify downstream.
            clusters.setdefault(int(label), []).append(texts[i])

        return [{"cluster": k, "samples": v[:3]} for k, v in clusters.items()]

    except Exception as e:
        print("❌ cluster error:", e)
        return []
198
 
199
 
200
  # =========================
201
+ # HOAX
202
  # =========================
203
def detect_hoax(texts):
    """Keyword-flag the first 10 texts as "Hoax" or "Normal"."""
    kw = ["hoax", "bohong", "fitnah", "propaganda"]

    def label_of(text):
        lowered = text.lower()
        flagged = any(k in lowered for k in kw)
        return "Hoax" if flagged else "Normal"

    return [{"text": t, "label": label_of(t)} for t in texts[:10]]
209
 
210
 
211
  # =========================
212
+ # NETWORK
213
  # =========================
214
def build_network(texts):
    """Build a word co-occurrence edge list from *texts*.

    Takes up to 5 unique words per text, counts unordered pairs, and keeps
    only edges seen more than once.
    """
    weights = Counter()

    for text in texts:
        vocab = list(set(text.split()))[:5]
        for pair in combinations(vocab, 2):
            weights[tuple(sorted(pair))] += 1

    return [
        {"source": a, "target": b, "weight": w}
        for (a, b), w in weights.items()
        if w > 1
    ]
224
 
225
 
226
  # =========================
227
+ # BOT NETWORK
228
  # =========================
229
def detect_bot_network(texts):
    """Detect coordinated (bot-like) posts via high pairwise TF-IDF similarity.

    Builds a graph with one node per text and an edge wherever cosine
    similarity exceeds 0.75; nodes with degree centrality above 0.3 are
    flagged as bots. Returns {"nodes": [...], "edges": [...], "bots": [...]}.
    """
    try:
        if len(texts) < 5:
            return {"nodes": [], "edges": [], "bots": []}

        X = TfidfVectorizer(max_features=300).fit_transform(texts)
        sim = cosine_similarity(X)

        G = nx.Graph()
        for idx, text in enumerate(texts):
            G.add_node(idx, text=text)

        count = len(texts)
        for i in range(count):
            for j in range(i + 1, count):
                if sim[i][j] > 0.75:
                    G.add_edge(i, j)

        central = nx.degree_centrality(G)

        bots = [
            {"node": node, "score": round(score, 2), "text": texts[node]}
            for node, score in central.items()
            if score > 0.3
        ]

        nodes = [{"id": node} for node in G.nodes()]
        edges = [{"source": u, "target": v} for u, v in G.edges()]

        return {"nodes": nodes, "edges": edges, "bots": bots[:10]}

    except Exception as e:
        print("❌ bot network error:", e)
        return {"nodes": [], "edges": [], "bots": []}
262
 
263
 
264
  # =========================
265
+ # TREND
266
  # =========================
267
def predict_trend(data):
    """Fit a linear trend over the sentiment sequence and describe its direction.

    Sentiments map to +1 (Positive), -1 (Negative), 0 (otherwise); the sign
    of the fitted slope decides the label.
    """
    try:
        scores = []
        for d in data:
            if d["sentiment"] == "Positive":
                scores.append(1)
            elif d["sentiment"] == "Negative":
                scores.append(-1)
            else:
                scores.append(0)

        if len(scores) < 5:
            return "Data kurang"

        X = np.arange(len(scores)).reshape(-1, 1)
        model = LinearRegression().fit(X, scores)

        slope = model.coef_[0]
        return "πŸ“ˆ Positif" if slope > 0 else "πŸ“‰ Negatif"

    except Exception as e:
        print("❌ trend error:", e)
        return "Error"
286
 
287
 
288
  # =========================
289
+ # ROUTES
290
  # =========================
291
  @app.route("/")
292
  def home():
 
296
  @app.route("/analyze", methods=["POST"])
297
  def analyze():
298
  try:
299
+ keyword = request.json.get("keyword")
300
+ source = request.json.get("source", "all")
301
 
302
+ raw = collect_data(keyword, source)
303
 
304
+ texts = [t for s, t in raw][:100]
305
+ sources = [s for s, t in raw][:100]
306
 
307
+ sentiments = predict(texts)
308
 
309
+ result = [
310
+ {"text": t, "sentiment": s, "source": src}
311
+ for t, s, src in zip(texts, sentiments, sources)
312
+ ]
313
 
314
  # VISUAL
315
  generate_wordcloud(texts)
 
317
  generate_timeline(result)
318
 
319
  # ANALYSIS
320
+ response = {
321
+ "data": result,
322
+ "top_words": get_top_words(texts),
323
+ "topics": get_topics(texts),
324
+ "clusters": cluster_opinions(texts),
325
+ "hoax": detect_hoax(texts),
326
+ "network": build_network(texts),
327
+ "bot_network": detect_bot_network(texts),
328
+ "trend": predict_trend(result),
329
+ "bot_bert": detect_bot_bert(texts),
330
+ "fake_news": detect_fake_news(texts),
331
+ "gnn": [] # πŸ”₯ DISABLE TORCH SAFE
332
+ }
333
+
334
+ os.makedirs("static", exist_ok=True)
335
+ pd.DataFrame(result).to_csv("static/result.csv", index=False)
336
+
337
+ return jsonify(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
  except Exception as e:
340
+ print("❌ ERROR:", e)
341
+ return jsonify({"data": []})
342
 
343
 
344
@app.route("/download")
def download():
    """Serve the most recent analysis results CSV as a file download."""
    csv_path = "static/result.csv"
    return send_file(csv_path, as_attachment=True)
347
 
348
 
349
# =========================
# RUN
# =========================
if __name__ == "__main__":
    # Listen on all interfaces; port 7860 — presumably the Hugging Face
    # Spaces default port, confirm against the deployment config.
    app.run(host="0.0.0.0", port=7860)