Spaces:

noranisa
/

Sentimen-Analysis

Sleeping

App Files Files Community

noranisa committed 18 days ago

Commit

7051fa3

verified ·

1 Parent(s): 45e75e6

Update main.py

Browse files

Files changed (1) hide show

main.py +191 -136

main.py CHANGED Viewed

@@ -3,7 +3,7 @@ from services.aggregator import collect_data
 from services.sentiment import predict
 # =========================
-# IMPORT
 # =========================
 from collections import Counter
 import pandas as pd
@@ -11,55 +11,71 @@ import os
 import re
 import numpy as np
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.linear_model import LinearRegression
 import networkx as nx
 from itertools import combinations
-# OPTIONAL (SAFE IMPORT)
 try:
     from services.bot_bert import detect_bot_bert
-except:
     def detect_bot_bert(x): return []
 try:
     from services.fake_news import detect_fake_news
-except:
     def detect_fake_news(x): return []
 try:
     from services.gnn import run_gnn
-except:
-    def run_gnn(n,e): return []
-# =========================
-# INIT
-# =========================
 app = Flask(__name__)
 # =========================
-# CLEAN TEXT
 # =========================
 def clean_text(t):
-    return re.sub(r'[^a-zA-Z\s]', '', str(t).lower())
 # =========================
 # TOP WORDS
 # =========================
 def get_top_words(texts):
     words = []
     for t in texts:
-        words.extend(clean_text(t).split())
-    return [{"word": w, "count": c} for w, c in Counter(words).most_common(10)]
 # =========================
@@ -69,15 +85,19 @@ def generate_wordcloud(texts):
     try:
         os.makedirs("static", exist_ok=True)
         texts = [t for t in texts if len(t.strip()) > 3]
         if not texts:
             return
-        wc = WordCloud(width=800, height=400).generate(" ".join(texts))
         wc.to_file("static/wordcloud.png")
     except Exception as e:
-        print("❌ wordcloud error:", e)
 # =========================
@@ -87,11 +107,9 @@ def generate_heatmap(data):
     try:
         if not data:
             return
-        labels = ["Positive", "Neutral", "Negative"]
-        sources = list(set([d["source"] for d in data]))
-        matrix = np.zeros((len(sources), len(labels)))
         for d in data:
             i = sources.index(d["source"])
@@ -101,17 +119,19 @@ def generate_heatmap(data):
         if matrix.sum() == 0:
             return
-        plt.figure()
-        plt.imshow(matrix)
-        plt.xticks(range(len(labels)), labels)
-        plt.yticks(range(len(sources)), sources)
-        plt.colorbar()
-        plt.savefig("static/heatmap.png")
-        plt.close()
     except Exception as e:
-        print("❌ heatmap error:", e)
 # =========================
@@ -121,25 +141,28 @@ def generate_timeline(data):
     try:
         if not data:
             return
-        pos, neg, neu = [], [], []
-        for d in data:
-            pos.append(1 if d["sentiment"] == "Positive" else 0)
-            neg.append(1 if d["sentiment"] == "Negative" else 0)
-            neu.append(1 if d["sentiment"] == "Neutral" else 0)
-        plt.figure()
-        plt.plot(pos, label="Positive")
-        plt.plot(neg, label="Negative")
-        plt.plot(neu, label="Neutral")
-        plt.legend()
-        plt.savefig("static/timeline.png")
-        plt.close()
     except Exception as e:
-        print("❌ timeline error:", e)
 # =========================
@@ -147,65 +170,74 @@ def generate_timeline(data):
 # =========================
 def get_topics(texts):
     try:
-        texts = [t for t in texts if len(t.strip()) > 3]
         if len(texts) < 5:
             return [["data kurang"]]
-        vec = CountVectorizer(min_df=2)
-        X = vec.fit_transform(texts)
         if X.shape[1] == 0:
-            return [["tidak ada kata"]]
-        lda = LatentDirichletAllocation(n_components=3)
         lda.fit(X)
-        words = vec.get_feature_names_out()
         topics = []
         for t in lda.components_:
             topics.append([words[i] for i in t.argsort()[-5:]])
         return topics
     except Exception as e:
-        print("❌ topic error:", e)
         return [["error"]]
 # =========================
 # CLUSTER
 # =========================
 def cluster_opinions(texts):
     try:
-        if len(texts) < 5:
             return []
-        X = TfidfVectorizer(max_features=300).fit_transform(texts)
-        model = KMeans(n_clusters=3, n_init=10)
-        labels = model.fit_predict(X)
         clusters = {}
-        for i, l in enumerate(labels):
-            clusters.setdefault(l, []).append(texts[i])
-        return [{"cluster": k, "samples": v[:3]} for k, v in clusters.items()]
     except Exception as e:
-        print("❌ cluster error:", e)
         return []
 # =========================
-# HOAX
 # =========================
 def detect_hoax(texts):
-    kw = ["hoax", "bohong", "fitnah", "propaganda"]
-    return [
-        {"text": t, "label": "Hoax" if any(k in t.lower() for k in kw) else "Normal"}
-        for t in texts[:10]
-    ]
 # =========================
@@ -213,14 +245,13 @@ def detect_hoax(texts):
 # =========================
 def build_network(texts):
     edges = {}
     for t in texts:
-        words = list(set(t.split()))[:5]
         for a, b in combinations(words, 2):
             key = tuple(sorted([a, b]))
             edges[key] = edges.get(key, 0) + 1
-    return [{"source": k[0], "target": k[1], "weight": v} for k, v in edges.items() if v > 1]
 # =========================
@@ -231,11 +262,10 @@ def detect_bot_network(texts):
         if len(texts) < 5:
             return {"nodes": [], "edges": [], "bots": []}
-        X = TfidfVectorizer(max_features=300).fit_transform(texts)
         sim = cosine_similarity(X)
         G = nx.Graph()
         for i in range(len(texts)):
             G.add_node(i, text=texts[i])
@@ -245,19 +275,16 @@ def detect_bot_network(texts):
                     G.add_edge(i, j)
         central = nx.degree_centrality(G)
-        bots = [
-            {"node": i, "score": round(s, 2), "text": texts[i]}
-            for i, s in central.items() if s > 0.3
-        ]
-        nodes = [{"id": i} for i in G.nodes()]
-        edges = [{"source": u, "target": v} for u, v in G.edges()]
-        return {"nodes": nodes, "edges": edges, "bots": bots[:10]}
     except Exception as e:
-        print("❌ bot network error:", e)
         return {"nodes": [], "edges": [], "bots": []}
@@ -266,22 +293,21 @@ def detect_bot_network(texts):
 # =========================
 def predict_trend(data):
     try:
-        y = [
-            1 if d["sentiment"] == "Positive"
-            else -1 if d["sentiment"] == "Negative"
-            else 0 for d in data
-        ]
         if len(y) < 5:
-            return "Data kurang"
-        X = np.arange(len(y)).reshape(-1, 1)
-        model = LinearRegression().fit(X, y)
-        return "📈 Positif" if model.coef_[0] > 0 else "📉 Negatif"
     except Exception as e:
-        print("❌ trend error:", e)
         return "Error"
@@ -293,16 +319,23 @@ def home():
     return render_template("index.html")
 @app.route("/analyze", methods=["POST"])
 def analyze():
     try:
-        keyword = request.json.get("keyword")
-        source = request.json.get("source", "all")
-        raw = collect_data(keyword, source)
-        texts = [t for s, t in raw][:100]
-        sources = [s for s, t in raw][:100]
         sentiments = predict(texts)
@@ -311,43 +344,65 @@ def analyze():
             for t, s, src in zip(texts, sentiments, sources)
         ]
-        # VISUAL
         generate_wordcloud(texts)
         generate_heatmap(result)
         generate_timeline(result)
         # ANALYSIS
-        response = {
-            "data": result,
-            "top_words": get_top_words(texts),
-            "topics": get_topics(texts),
-            "clusters": cluster_opinions(texts),
-            "hoax": detect_hoax(texts),
-            "network": build_network(texts),
-            "bot_network": detect_bot_network(texts),
-            "trend": predict_trend(result),
-            "bot_bert": detect_bot_bert(texts),
-            "fake_news": detect_fake_news(texts),
-            "gnn": []  # 🔥 DISABLE TORCH SAFE
-        }
         os.makedirs("static", exist_ok=True)
         pd.DataFrame(result).to_csv("static/result.csv", index=False)
-        return jsonify(response)
     except Exception as e:
-        print("❌ ERROR:", e)
-        return jsonify({"data": []})
 @app.route("/download")
 def download():
-    return send_file("static/result.csv", as_attachment=True)
 # =========================
 # RUN
 # =========================
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860)

 from services.sentiment import predict
 # =========================
+# ADDITIONAL IMPORTS
 # =========================
 from collections import Counter
 import pandas as pd
 import re
 import numpy as np
+# VISUAL
 from wordcloud import WordCloud
+import matplotlib
+matplotlib.use('Agg')  # ← WAJIB: non-interactive backend untuk server
 import matplotlib.pyplot as plt
+# ML
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.linear_model import LinearRegression
+# GRAPH
 import networkx as nx
 from itertools import combinations
+# OPTIONAL ADVANCED
 try:
     from services.bot_bert import detect_bot_bert
+except Exception:
     def detect_bot_bert(x): return []
 try:
     from services.fake_news import detect_fake_news
+except Exception:
     def detect_fake_news(x): return []
 try:
     from services.gnn import run_gnn
+except Exception:
+    def run_gnn(n, e): return []
 app = Flask(__name__)
 # =========================
+# UTIL
 # =========================
 def clean_text(t):
     """Normalise raw text for word-level analysis.

     Lower-cases the input, removes URLs, replaces every character that is
     not an ASCII letter, digit or whitespace with a space, and collapses
     runs of whitespace.

     Args:
         t: Text to clean. Non-string values are converted with ``str()``
             first, so a stray number or ``None`` in the collected data
             does not raise ``AttributeError``.

     Returns:
         The cleaned, lower-case, single-spaced string.
     """
     t = str(t).lower()
     # Drop links before punctuation is stripped, otherwise their pieces
     # ("https", "co", ...) would survive as tokens.
     t = re.sub(r'http\S+', '', t)
     t = re.sub(r'[^a-zA-Z0-9\s]', ' ', t)
     t = re.sub(r'\s+', ' ', t).strip()
     return t
 # =========================
 # TOP WORDS
 # =========================
# Indonesian (plus a few English) function words that carry no topical
# meaning. Each entry is listed once; the earlier literal repeated 'tapi'
# and 'juga', which a set silently collapses.
STOPWORDS_ID = {
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'dengan', 'untuk',
    'adalah', 'ada', 'pada', 'juga', 'tidak', 'bisa', 'sudah', 'saya',
    'kamu', 'kami', 'mereka', 'kita', 'nya', 'pun', 'aja', 'gak', 'ga',
    'ya', 'yg', 'dgn', 'yah', 'dah', 'udah', 'mau', 'jadi', 'buat',
    'kalau', 'tp', 'tapi', 'banget', 'sangat', 'lebih', 'nih', 'sih',
    'dong', 'lah', 'lagi', 'terus', 'sama', 'atau', 'karena',
    'so', 'the', 'is', 'in', 'of', 'to', 'a', 'an', 'and', 'it', 'for',
    'that', 'this',
}
 def get_top_words(texts):
     """Return the 15 most frequent meaningful words across *texts*.

     Every text is normalised with ``clean_text`` and split on whitespace.
     Tokens of one or two characters and tokens found in ``STOPWORDS_ID``
     are ignored.

     Returns:
         A list of ``{"word": ..., "count": ...}`` dicts, most frequent
         first.
     """
     tokens = (
         token
         for text in texts
         for token in clean_text(text).split()
         if len(token) > 2 and token not in STOPWORDS_ID
     )
     tally = Counter(tokens)
     return [{"word": word, "count": total} for word, total in tally.most_common(15)]
 # =========================
     try:
         os.makedirs("static", exist_ok=True)
         texts = [t for t in texts if len(t.strip()) > 3]
         if not texts:
             return
+        combined = " ".join(texts)
+        wc = WordCloud(
+            width=900, height=400,
+            background_color='white',
+            max_words=80,
+            stopwords=STOPWORDS_ID,
+            colormap='Blues'
+        ).generate(combined)
         wc.to_file("static/wordcloud.png")
     except Exception as e:
+        print("wordcloud error:", e)
 # =========================
     try:
         if not data:
             return
+        labels  = ["Positive", "Neutral", "Negative"]
+        sources = sorted(set(d["source"] for d in data))
+        matrix  = np.zeros((len(sources), len(labels)))
         for d in data:
             i = sources.index(d["source"])
         if matrix.sum() == 0:
             return
+        fig, ax = plt.subplots(figsize=(6, max(2, len(sources))))
+        im = ax.imshow(matrix, cmap='Blues', aspect='auto')
+        ax.set_xticks(range(len(labels)))
+        ax.set_xticklabels(labels)
+        ax.set_yticks(range(len(sources)))
+        ax.set_yticklabels(sources)
+        plt.colorbar(im, ax=ax)
+        plt.tight_layout()
+        os.makedirs("static", exist_ok=True)
+        plt.savefig("static/heatmap.png", dpi=100)
+        plt.close(fig)
     except Exception as e:
+        print("heatmap error:", e)
 # =========================
     try:
         if not data:
             return
+        os.makedirs("static", exist_ok=True)
+        pos = [1 if d["sentiment"] == "Positive" else 0 for d in data]
+        neg = [1 if d["sentiment"] == "Negative" else 0 for d in data]
+        neu = [1 if d["sentiment"] == "Neutral"  else 0 for d in data]
+        # rolling average
+        def roll(arr, n=5):
+            return [sum(arr[max(0,i-n):i+1]) / len(arr[max(0,i-n):i+1]) for i in range(len(arr))]
+        fig, ax = plt.subplots(figsize=(10, 3))
+        ax.plot(roll(pos), label="Positive", color="#22c55e", linewidth=1.5)
+        ax.plot(roll(neg), label="Negative", color="#ef4444", linewidth=1.5)
+        ax.plot(roll(neu), label="Neutral",  color="#94a3b8", linewidth=1.0)
+        ax.legend()
+        ax.set_facecolor('#f8fafc')
+        fig.patch.set_facecolor('#f8fafc')
+        plt.tight_layout()
+        plt.savefig("static/timeline.png", dpi=100)
+        plt.close(fig)
     except Exception as e:
+        print("timeline error:", e)
 # =========================
 # =========================
 def get_topics(texts):
     """Extract up to three LDA topics from *texts*.

     Builds a bag-of-words matrix (terms must occur in at least two
     documents, ``STOPWORDS_ID`` excluded) and fits Latent Dirichlet
     Allocation on it.

     Args:
         texts: Iterable of document strings.

     Returns:
         A list of topics, each a list of at most five terms taken from the
         highest-weighted end of the topic (ordered lowest to highest weight
         within those five). Sentinel results: ``[["data kurang"]]`` when
         fewer than five usable documents remain, ``[["kosong"]]`` when the
         fitted matrix has no columns, ``[["error"]]`` on any exception.
     """
     try:
         # Measure the stripped length so whitespace-only strings do not
         # count toward the five-document minimum.
         texts = [t for t in texts if len(t.strip()) > 3]
         if len(texts) < 5:
             return [["data kurang"]]
         vec = CountVectorizer(min_df=2, stop_words=list(STOPWORDS_ID))
         X = vec.fit_transform(texts)
         if X.shape[1] == 0:
             return [["kosong"]]
         # Cap the topic count at the vocabulary size.
         n_topics = min(3, X.shape[1])
         lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
         lda.fit(X)
         words = vec.get_feature_names_out()
         return [
             [words[i] for i in weights.argsort()[-5:]]
             for weights in lda.components_
         ]
     except Exception as e:
         print("topic error:", e)
         return [["error"]]
+# =========================
+# INSIGHT
+# =========================
def generate_insight(data):
    """Summarise the sentiment label counts in *data* as one line of text.

    Args:
        data: Iterable of dicts, each with a ``"sentiment"`` key.

    Returns:
        A string of the form ``"Positive:<n> Negative:<n> Neutral:<n>"``.
        Labels other than these three are not reported.

    Raises:
        KeyError: If an item has no ``"sentiment"`` key.
    """
    # One pass over the data instead of three separate list.count() scans.
    tally = Counter(d["sentiment"] for d in data)
    return (f"Positive:{tally['Positive']} "
            f"Negative:{tally['Negative']} "
            f"Neutral:{tally['Neutral']}")
 # =========================
 # CLUSTER
 # =========================
 def cluster_opinions(texts, n_clusters=3):
     """Group *texts* into K-Means clusters over TF-IDF features.

     Args:
         texts: Sequence of document strings.
         n_clusters: Number of clusters to fit. Defaults to 3, the value
             that was previously hard-coded.

     Returns:
         A list of ``{"cluster": <int label>, "samples": [<up to 3 texts>]}``
         dicts, one per cluster that received at least one text. An empty
         list when there are fewer than ``2 * n_clusters`` texts (6 by
         default) or when anything raises.
     """
     try:
         # With this guard in place the text count always exceeds the cluster
         # count, so no extra min() clamp on n_clusters is needed.
         if len(texts) < 2 * n_clusters:
             return []
         X = TfidfVectorizer(max_features=300, stop_words=list(STOPWORDS_ID)).fit_transform(texts)
         model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X)
         clusters = {}
         for text, label in zip(texts, model.labels_):
             # int() turns the numpy label into a plain int for the dict key.
             clusters.setdefault(int(label), []).append(text)
         return [{"cluster": lbl, "samples": samples[:3]} for lbl, samples in clusters.items()]
     except Exception as e:
         print("cluster error:", e)
         return []
 # =========================
+# HOAX (keyword-based)
 # =========================
# Substrings whose presence (case-insensitive) marks a text as "Hoax".
HOAX_KW = [
    "hoax", "bohong", "fitnah", "propaganda", "palsu", "fake", "disinformasi",
    "menyesatkan", "kebohongan", "manipulasi", "adu domba", "provokasi",
]


def detect_hoax(texts, limit=15):
    """Label the first *limit* texts as ``"Hoax"`` or ``"Normal"``.

    A text is labelled ``"Hoax"`` when its lower-cased form contains any
    entry of ``HOAX_KW`` as a substring.

    Args:
        texts: Sequence of texts; only ``texts[:limit]`` is inspected.
        limit: Maximum number of texts to label. Defaults to 15, the value
            that was previously hard-coded.

    Returns:
        A list of ``{"text": <original item>, "label": "Hoax" | "Normal"}``.
    """
    results = []
    for t in texts[:limit]:
        # str() keeps a non-string item (None, a number) from raising on
        # .lower(); the original item is still returned untouched.
        lower = str(t).lower()
        label = "Hoax" if any(k in lower for k in HOAX_KW) else "Normal"
        results.append({"text": t, "label": label})
    return results
 # =========================
 # =========================
 def build_network(texts):
     """Build a word co-occurrence edge list from *texts*.

     For each text, takes the first six distinct words (after
     ``clean_text``) that are longer than three characters and not in
     ``STOPWORDS_ID``, and counts every unordered pair among them.

     Returns:
         A list of ``{"source": ..., "target": ..., "weight": ...}`` dicts
         for pairs seen in more than one text, with ``source`` sorting
         before ``target``.
     """
     edges = {}
     for t in texts:
         # dict.fromkeys de-duplicates while keeping first-occurrence order.
         # Slicing a set of strings instead picks a hash-order-dependent
         # subset, so the resulting edges changed between runs.
         distinct = dict.fromkeys(clean_text(t).split())
         words = [w for w in distinct if len(w) > 3 and w not in STOPWORDS_ID][:6]
         # Sorting first makes every emitted pair (a, b) satisfy a < b, so
         # the pair is already the canonical key.
         for pair in combinations(sorted(words), 2):
             edges[pair] = edges.get(pair, 0) + 1
     return [{"source": k[0], "target": k[1], "weight": v}
             for k, v in edges.items() if v > 1]
 # =========================
         if len(texts) < 5:
             return {"nodes": [], "edges": [], "bots": []}
+        X   = TfidfVectorizer(max_features=300).fit_transform(texts)
         sim = cosine_similarity(X)
         G = nx.Graph()
         for i in range(len(texts)):
             G.add_node(i, text=texts[i])
                     G.add_edge(i, j)
         central = nx.degree_centrality(G)
+        bots    = [{"node": i, "score": round(s, 2), "text": texts[i]}
+                   for i, s in central.items() if s > 0.3]
+        return {
+            "nodes": [{"id": i} for i in G.nodes()],
+            "edges": [{"source": u, "target": v} for u, v in G.edges()],
+            "bots":  bots[:10]
+        }
     except Exception as e:
+        print("bot_network error:", e)
         return {"nodes": [], "edges": [], "bots": []}
 # =========================
 def predict_trend(data, min_change=0.2):
     """Classify the sentiment drift across *data* as rising, falling or flat.

     Sentiments are scored +1 (``"Positive"``), -1 (``"Negative"``) or 0
     (anything else), in the order given, and a least-squares line is
     fitted through the scores.

     Args:
         data: Sequence of dicts, each with a ``"sentiment"`` key.
         min_change: Minimum fitted change in score between the first and
             the last item for the trend to count as non-flat.

     Returns:
         ``"Naik Positif"``, ``"Naik Negatif"`` or ``"Stabil"``;
         ``"Kurang Data"`` for fewer than five items; ``"Error"`` if
         anything raises.
     """
     try:
         score = {"Positive": 1, "Negative": -1}
         y = [score.get(d["sentiment"], 0) for d in data]
         if len(y) < 5:
             return "Kurang Data"
         # Fit against x in [0, 1] so the slope is the net change over the
         # whole sample. A per-index slope of values in [-1, 1] cannot
         # exceed about 3/n, so a fixed 0.05 per-index cut-off is
         # unreachable once n >= 60 and always yields "Stabil". The default
         # of 0.2 equals that old cut-off at the minimum size of five.
         x = np.linspace(0.0, 1.0, len(y))
         slope = np.polyfit(x, y, 1)[0]
         if slope > min_change:
             return "Naik Positif"
         if slope < -min_change:
             return "Naik Negatif"
         return "Stabil"
     except Exception as e:
         print("trend error:", e)
         return "Error"
     return render_template("index.html")
+@app.route("/result")
+def result():
+    """Render and return the ``result.html`` template."""
+    return render_template("result.html")
 @app.route("/analyze", methods=["POST"])
 def analyze():
     try:
+        keyword = request.json.get("keyword", "").strip()
+        source  = request.json.get("source", "all")
+        if not keyword:
+            return jsonify({"error": "keyword kosong", "data": []}), 400
+        raw     = collect_data(keyword, source)
+        texts   = [t for _, t in raw][:100]
+        sources = [s for s, _ in raw][:100]
         sentiments = predict(texts)
             for t, s, src in zip(texts, sentiments, sources)
         ]
+        # VISUAL — non-blocking
         generate_wordcloud(texts)
         generate_heatmap(result)
         generate_timeline(result)
         # ANALYSIS
+        top_words   = get_top_words(texts)
+        topics      = get_topics(texts)
+        insight     = generate_insight(result)
+        clusters    = cluster_opinions(texts)
+        hoax        = detect_hoax(texts)
+        network     = build_network(texts)
+        bot_network = detect_bot_network(texts)
+        trend       = predict_trend(result)
+        # ADVANCED (optional)
+        bot_bert  = detect_bot_bert(texts)
+        fake_news = detect_fake_news(texts)
+        gnn       = run_gnn(bot_network["nodes"], bot_network["edges"])
+        # SAVE CSV
         os.makedirs("static", exist_ok=True)
         pd.DataFrame(result).to_csv("static/result.csv", index=False)
+        return jsonify({
+            "data":        result,
+            "top_words":   top_words,
+            "topics":      topics,
+            "insight":     insight,
+            "clusters":    clusters,
+            "hoax":        hoax,
+            "network":     network,
+            "bot_network": bot_network,
+            "trend":       trend,
+            "bot_bert":    bot_bert,
+            "fake_news":   fake_news,
+            "gnn":         gnn
+        })
     except Exception as e:
+        print("ERROR /analyze:", e)
+        return jsonify({"error": str(e), "data": []}), 500
 @app.route("/download")
 def download():
     """Send the saved analysis CSV as a file attachment.

     Returns:
         The ``static/result.csv`` file as a download, or a JSON error with
         status 404 when that file does not exist.
     """
     # Resolve the path once, against the working directory, and hand the
     # absolute form to send_file. The existence check and the file actually
     # served then cannot disagree, whatever base directory Flask uses for
     # relative paths.
     path = os.path.abspath("static/result.csv")
     if not os.path.isfile(path):
         return jsonify({"error": "Belum ada hasil analisis"}), 404
     return send_file(path, as_attachment=True)
@app.route("/static/<path:filename>")
def static_files(filename):
    """Serve *filename* from the ``static`` directory.

    Args:
        filename: Path below ``static/`` taken from the request URL.

    Returns:
        The file response. ``send_from_directory`` raises a 404 when the
        resolved path would fall outside ``static/`` or does not exist.
    """
    # Imported here because the file's top-level flask import line is not
    # part of this block.
    from flask import send_from_directory

    # filename comes straight from the URL. send_from_directory joins it
    # safely, whereas interpolating it into send_file() could let a path
    # containing ".." reach files outside static/.
    # NOTE(review): Flask normally registers its own /static/<path:filename>
    # route; confirm this view is actually needed alongside it.
    return send_from_directory("static", filename)
 # =========================
 # RUN
 # =========================
 if __name__ == "__main__":
+    # Start Flask's built-in server only when this file is run as a script:
+    # listen on all interfaces, port 7860, with debug mode off.
+    app.run(host="0.0.0.0", port=7860, debug=False)