noranisa committed on
Commit
409899f
·
verified ·
1 Parent(s): 40ec3b8

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +107 -26
main.py CHANGED
@@ -7,14 +7,19 @@ import pandas as pd
7
  import os
8
  import re
9
 
 
10
  from wordcloud import WordCloud
11
  import matplotlib.pyplot as plt
12
  import numpy as np
13
 
 
14
  from sklearn.decomposition import LatentDirichletAllocation
15
- from sklearn.feature_extraction.text import CountVectorizer
16
-
17
 
 
 
 
18
  app = Flask(__name__)
19
 
20
 
@@ -23,11 +28,9 @@ app = Flask(__name__)
23
  # =========================
24
  def get_top_words(texts, top_n=10):
25
  words = []
26
-
27
  for t in texts:
28
  t = re.sub(r'[^a-zA-Z\s]', '', t.lower())
29
  words.extend(t.split())
30
-
31
  return [{"word": w, "count": c} for w, c in Counter(words).most_common(top_n)]
32
 
33
 
@@ -37,7 +40,6 @@ def get_top_words(texts, top_n=10):
37
  def generate_wordcloud(texts):
38
  try:
39
  os.makedirs("static", exist_ok=True)
40
-
41
  texts = [t for t in texts if len(t.strip()) > 3]
42
 
43
  if len(texts) == 0:
@@ -61,9 +63,6 @@ def generate_heatmap(data):
61
  labels_sent = ["Positive", "Neutral", "Negative"]
62
  labels_src = list(set([d["source"] for d in data]))
63
 
64
- if len(labels_src) == 0:
65
- return
66
-
67
  matrix = np.zeros((len(labels_src), len(labels_sent)))
68
 
69
  for d in data:
@@ -95,6 +94,40 @@ def generate_heatmap(data):
95
  print("❌ Heatmap error:", e)
96
 
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  # =========================
99
  # πŸ”₯ TOPIC MODELING (SAFE)
100
  # =========================
@@ -103,13 +136,13 @@ def get_topics(texts, n_topics=3):
103
  texts = [t for t in texts if len(t.strip()) > 3]
104
 
105
  if len(texts) < 5:
106
- return [["data kurang untuk topic modeling"]]
107
 
108
  vectorizer = CountVectorizer(min_df=2)
109
  X = vectorizer.fit_transform(texts)
110
 
111
  if X.shape[1] == 0:
112
- return [["tidak ada kata valid"]]
113
 
114
  lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
115
  lda.fit(X)
@@ -118,18 +151,17 @@ def get_topics(texts, n_topics=3):
118
 
119
  topics = []
120
  for topic in lda.components_:
121
- top_words = [words[i] for i in topic.argsort()[-5:]]
122
- topics.append(top_words)
123
 
124
  return topics
125
 
126
  except Exception as e:
127
  print("❌ LDA error:", e)
128
- return [["topic gagal dibuat"]]
129
 
130
 
131
  # =========================
132
- # πŸ€– AI INSIGHT
133
  # =========================
134
  def generate_insight(data, topics):
135
  sentiments = [d["sentiment"] for d in data]
@@ -145,7 +177,6 @@ def generate_insight(data, topics):
145
 
146
  insight = f"""
147
  Total data: {total}
148
-
149
  Positive: {pos}
150
  Negative: {neg}
151
  Neutral: {neu}
@@ -161,6 +192,55 @@ Topik utama:
161
  return insight
162
 
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  # =========================
165
  # 🌐 HOME
166
  # =========================
@@ -193,16 +273,20 @@ def analyze():
193
  "source": src
194
  })
195
 
196
- # πŸ”₯ GENERATE VISUAL
197
  generate_wordcloud(texts)
198
  generate_heatmap(result)
 
199
 
200
- # πŸ”₯ ANALYTICS
201
  top_words = get_top_words(texts)
202
  topics = get_topics(texts)
203
  insight = generate_insight(result, topics)
204
 
205
- # πŸ”₯ CSV
 
 
 
206
  os.makedirs("static", exist_ok=True)
207
  pd.DataFrame(result).to_csv("static/result.csv", index=False)
208
 
@@ -210,17 +294,14 @@ def analyze():
210
  "data": result,
211
  "top_words": top_words,
212
  "topics": topics,
213
- "insight": insight
 
 
214
  })
215
 
216
  except Exception as e:
217
- print("❌ ERROR ANALYZE:", e)
218
- return jsonify({
219
- "data": [],
220
- "top_words": [],
221
- "topics": [["error"]],
222
- "insight": "Terjadi error"
223
- })
224
 
225
 
226
  # =========================
 
7
  import os
8
  import re
9
 
10
+ # VISUAL
11
  from wordcloud import WordCloud
12
  import matplotlib.pyplot as plt
13
  import numpy as np
14
 
15
+ # ML
16
  from sklearn.decomposition import LatentDirichletAllocation
17
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
18
+ from sklearn.cluster import KMeans
19
 
20
+ # =========================
21
+ # INIT
22
+ # =========================
23
  app = Flask(__name__)
24
 
25
 
 
28
  # =========================
29
  def get_top_words(texts, top_n=10):
30
  words = []
 
31
  for t in texts:
32
  t = re.sub(r'[^a-zA-Z\s]', '', t.lower())
33
  words.extend(t.split())
 
34
  return [{"word": w, "count": c} for w, c in Counter(words).most_common(top_n)]
35
 
36
 
 
40
  def generate_wordcloud(texts):
41
  try:
42
  os.makedirs("static", exist_ok=True)
 
43
  texts = [t for t in texts if len(t.strip()) > 3]
44
 
45
  if len(texts) == 0:
 
63
  labels_sent = ["Positive", "Neutral", "Negative"]
64
  labels_src = list(set([d["source"] for d in data]))
65
 
 
 
 
66
  matrix = np.zeros((len(labels_src), len(labels_sent)))
67
 
68
  for d in data:
 
94
  print("❌ Heatmap error:", e)
95
 
96
 
97
+ # =========================
98
+ # πŸ”₯ TIMELINE
99
+ # =========================
100
+ def generate_timeline(data):
101
+ try:
102
+ if len(data) == 0:
103
+ return
104
+
105
+ os.makedirs("static", exist_ok=True)
106
+
107
+ timestamps = list(range(len(data)))
108
+
109
+ pos, neg, neu = [], [], []
110
+
111
+ for d in data:
112
+ pos.append(1 if d["sentiment"] == "Positive" else 0)
113
+ neg.append(1 if d["sentiment"] == "Negative" else 0)
114
+ neu.append(1 if d["sentiment"] == "Neutral" else 0)
115
+
116
+ plt.figure()
117
+ plt.plot(timestamps, pos, label="Positive")
118
+ plt.plot(timestamps, neg, label="Negative")
119
+ plt.plot(timestamps, neu, label="Neutral")
120
+
121
+ plt.legend()
122
+ plt.title("Sentiment Timeline")
123
+
124
+ plt.savefig("static/timeline.png")
125
+ plt.close()
126
+
127
+ except Exception as e:
128
+ print("❌ Timeline error:", e)
129
+
130
+
131
  # =========================
132
  # πŸ”₯ TOPIC MODELING (SAFE)
133
  # =========================
 
136
  texts = [t for t in texts if len(t.strip()) > 3]
137
 
138
  if len(texts) < 5:
139
+ return [["data kurang"]]
140
 
141
  vectorizer = CountVectorizer(min_df=2)
142
  X = vectorizer.fit_transform(texts)
143
 
144
  if X.shape[1] == 0:
145
+ return [["tidak ada kata"]]
146
 
147
  lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
148
  lda.fit(X)
 
151
 
152
  topics = []
153
  for topic in lda.components_:
154
+ topics.append([words[i] for i in topic.argsort()[-5:]])
 
155
 
156
  return topics
157
 
158
  except Exception as e:
159
  print("❌ LDA error:", e)
160
+ return [["topic gagal"]]
161
 
162
 
163
  # =========================
164
+ # πŸ€– AI INSIGHT (RULE SAFE)
165
  # =========================
166
  def generate_insight(data, topics):
167
  sentiments = [d["sentiment"] for d in data]
 
177
 
178
  insight = f"""
179
  Total data: {total}
 
180
  Positive: {pos}
181
  Negative: {neg}
182
  Neutral: {neu}
 
192
  return insight
193
 
194
 
195
+ # =========================
196
+ # πŸ”₯ CLUSTERING
197
+ # =========================
198
+ def cluster_opinions(texts):
199
+ try:
200
+ texts = [t for t in texts if len(t.strip()) > 5]
201
+
202
+ if len(texts) < 5:
203
+ return []
204
+
205
+ vectorizer = TfidfVectorizer(max_features=500)
206
+ X = vectorizer.fit_transform(texts)
207
+
208
+ model = KMeans(n_clusters=3, random_state=42, n_init=10)
209
+ labels = model.fit_predict(X)
210
+
211
+ clusters = {}
212
+ for i, label in enumerate(labels):
213
+ clusters.setdefault(label, []).append(texts[i])
214
+
215
+ result = []
216
+ for k, v in clusters.items():
217
+ result.append({"cluster": int(k), "samples": v[:3]})
218
+
219
+ return result
220
+
221
+ except Exception as e:
222
+ print("❌ clustering error:", e)
223
+ return []
224
+
225
+
226
+ # =========================
227
+ # 🚨 HOAX DETECTION
228
+ # =========================
229
+ def detect_hoax(texts):
230
+ keywords = ["hoax","bohong","fitnah","manipulasi","propaganda","tipu"]
231
+
232
+ result = []
233
+ for t in texts:
234
+ score = sum(1 for k in keywords if k in t.lower())
235
+ result.append({
236
+ "text": t,
237
+ "score": score,
238
+ "label": "Hoax" if score >= 2 else "Normal"
239
+ })
240
+
241
+ return result
242
+
243
+
244
  # =========================
245
  # 🌐 HOME
246
  # =========================
 
273
  "source": src
274
  })
275
 
276
+ # VISUAL
277
  generate_wordcloud(texts)
278
  generate_heatmap(result)
279
+ generate_timeline(result)
280
 
281
+ # ANALYTICS
282
  top_words = get_top_words(texts)
283
  topics = get_topics(texts)
284
  insight = generate_insight(result, topics)
285
 
286
+ clusters = cluster_opinions(texts)
287
+ hoax = detect_hoax(texts)
288
+
289
+ # CSV
290
  os.makedirs("static", exist_ok=True)
291
  pd.DataFrame(result).to_csv("static/result.csv", index=False)
292
 
 
294
  "data": result,
295
  "top_words": top_words,
296
  "topics": topics,
297
+ "insight": insight,
298
+ "clusters": clusters,
299
+ "hoax": hoax
300
  })
301
 
302
  except Exception as e:
303
+ print("❌ ERROR:", e)
304
+ return jsonify({"data": []})
 
 
 
 
 
305
 
306
 
307
  # =========================