Upload app.py
app.py (ADDED)

import gradio as gr
import os
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import matplotlib.pyplot as plt
import pandas as pd
from io import BytesIO
import base64
import re

# Read the YouTube Data API key from the environment instead of hardcoding it
# in the source. The variable name YOUTUBE_API_KEY is an assumed convention;
# on Hugging Face Spaces it would be set as a repository secret.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")

model_name = "hanifnoerr/Fine-tuned-Indonesian-Sentiment-Classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

lexicon_pos = {"bagus", "luar biasa", "mantap", "terbaik", "menyenangkan", "indah", "hebat", "positif", "keren", "puas", "suka", "gokil", "bangga"}
lexicon_neg = {"buruk", "jelek", "parah", "mengecewakan", "negatif", "gagal", "benci", "marah", "sedih", "tidak suka", "jijik", "sampah"}

# Function to clean text
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emoji and non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

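# Illustrative example (not from the source): URLs, punctuation/emoji, and
# digits are stripped and the rest lowercased, but leftover whitespace is not
# collapsed:
#   clean_text("BAGUS banget!!! 👍 100%")  ->  "bagus banget  "
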
# Function to extract the video ID from a YouTube URL
def extract_video_id(url):
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None

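# Illustrative example (not from the source): both common URL shapes resolve
# to the same 11-character video ID:
#   extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") -> "dQw4w9WgXcQ"
#   extract_video_id("https://youtu.be/dQw4w9WgXcQ")                -> "dQw4w9WgXcQ"
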
# Function to fetch YouTube comments
def get_youtube_comments(url, max_comments=100):
    video_id = extract_video_id(url)
    if not video_id:
        return []
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            f"https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={API_KEY}"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        response = requests.get(api_url)
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments

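# Note: the commentThreads endpoint returns at most 100 results per page, so
# the loop above pages via nextPageToken until max_comments is collected or
# the pages run out. Each request also counts against the API key's quota.
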
# Function for lexicon-based classification
def classify_lexicon(comment):
    text = comment.lower()
    pos_count = sum(1 for word in lexicon_pos if word in text)
    neg_count = sum(1 for word in lexicon_neg if word in text)
    if pos_count > neg_count:
        return "Positive"
    elif neg_count > pos_count:
        return "Negative"
    else:
        return "Neutral"

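# Caveat (substring matching): "tidak suka" contains the positive entry
# "suka", so a comment saying only "tidak suka" scores one hit in each
# lexicon and comes out "Neutral" rather than "Negative".
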
# Function to classify comment sentiment using IndoBERT and the lexicon
def classify_sentiment(comments):
    results = []
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

    # Clean the comments before sending them to the model
    cleaned_comments = [clean_text(comment) for comment in comments]

    for comment in cleaned_comments:
        # Tokenize using IndoBERT
        inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        indo_label = label_map[predicted]
        lex_label = classify_lexicon(comment)
        results.append((comment, indo_label, lex_label, confidence))
    return results

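# Note: truncation=True caps each comment at the model's maximum sequence
# length (typically 512 tokens for BERT-family models), and confidence is
# simply the highest softmax probability for that comment.
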
# Function to generate the data visualization
def generate_visualization(results):
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))

    indo_counts = df["IndoBERT"].value_counts().reindex(["Positive", "Neutral", "Negative"], fill_value=0)
    axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%', colors=["green", "yellow", "red"])
    axs[0].set_title("IndoBERT Sentiment Distribution")

    lex_counts = df["Lexicon"].value_counts().reindex(["Positive", "Neutral", "Negative"], fill_value=0)
    axs[1].pie(lex_counts, labels=lex_counts.index, autopct='%1.1f%%', colors=["green", "yellow", "red"])
    axs[1].set_title("Lexicon Sentiment Distribution")

    axs[2].bar(["Indo-Pos", "Indo-Net", "Indo-Neg", "Lex-Pos", "Lex-Net", "Lex-Neg"],
               list(indo_counts.values) + list(lex_counts.values),
               color=["green", "yellow", "red", "green", "yellow", "red"])
    axs[2].set_title("Sentiment Comparison (Bar)")

    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f"<img src='data:image/png;base64,{encoded}'/>"

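# The figure is returned as a base64-encoded <img> data URI so the gr.HTML
# output component can render it without writing any file to disk.
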
# Main function for sentiment analysis
def analyze_sentiment(url, jumlah):
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "No comments found"
    results = classify_sentiment(comments)
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    chart = generate_visualization(results)
    return df, chart

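# The two return values of analyze_sentiment map positionally onto the two
# output components declared below (gr.Dataframe, gr.HTML).
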
gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="YouTube video URL"),
        gr.Slider(10, 200, value=50, step=10, label="Number of comments to analyze")
    ],
    outputs=[
        gr.Dataframe(label="Comment and sentiment preview"),
        gr.HTML(label="Comparative visualization")
    ],
    title="YouTube Comment Analysis 🇮🇩 with IndoBERT & Lexicon",
    description="Enter a YouTube URL and the system will fetch the comments and analyze them with two methods: fine-tuned IndoBERT and lexicon-based."
).launch()
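
A minimal dependency list for this Space, sketched from the imports in app.py (the requirements.txt name follows the usual Spaces convention; the source pins no versions):

# requirements.txt (inferred from app.py imports; unpinned)
gradio
requests
transformers
torch
matplotlib
pandas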