import base64
import os
import re
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Read the YouTube Data API key from the environment; never hardcode credentials in source.
API_KEY = os.getenv("YOUTUBE_API_KEY", "")

model_name = "hanifnoerr/Fine-tuned-Indonesian-Sentiment-Classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

lexicon_pos = {"bagus", "luar biasa", "mantap", "terbaik", "menyenangkan", "indah",
               "hebat", "positif", "keren", "puas", "suka", "gokil", "bangga"}
lexicon_neg = {"buruk", "jelek", "parah", "mengecewakan", "negatif", "gagal",
               "benci", "marah", "sedih", "tidak suka", "jijik", "sampah"}

# Clean a comment before classification.
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Lowercase the text
    text = text.lower()
    return text

# Extract the 11-character video ID from a YouTube URL.
def extract_video_id(url):
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None

# Fetch top-level YouTube comments via the Data API v3, paging until max_comments.
def get_youtube_comments(url, max_comments=100):
    video_id = extract_video_id(url)
    if not video_id:
        return []
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            f"https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={API_KEY}"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        response = requests.get(api_url)
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments

# Lexicon-based classification: count positive vs. negative keyword hits.
def classify_lexicon(comment):
    text = comment.lower()
    pos_count = sum(1 for word in lexicon_pos if word in text)
    neg_count = sum(1 for word in lexicon_neg if word in text)
    if pos_count > neg_count:
        return "Positive"
    elif neg_count > pos_count:
        return "Negative"
    else:
        return "Neutral"

# Classify comment sentiment with both IndoBERT and the lexicon.
def classify_sentiment(comments):
    results = []
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    # Clean the comments before sending them to the model.
    cleaned_comments = [clean_text(comment) for comment in comments]
    for comment in cleaned_comments:
        # Tokenize with the IndoBERT tokenizer.
        inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        indo_label = label_map[predicted]
        lex_label = classify_lexicon(comment)
        results.append((comment, indo_label, lex_label, confidence))
    return results

# Build the comparison charts and return them as an embedded base64 PNG.
def generate_visualization(results):
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))
    indo_counts = df["IndoBERT"].value_counts().reindex(
        ["Positive", "Neutral", "Negative"], fill_value=0)
    axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%',
               colors=["green", "yellow", "red"])
    axs[0].set_title("IndoBERT Sentiment Distribution")
    lex_counts = df["Lexicon"].value_counts().reindex(
        ["Positive", "Neutral", "Negative"], fill_value=0)
    axs[1].pie(lex_counts, labels=lex_counts.index, autopct='%1.1f%%',
               colors=["green", "yellow", "red"])
    axs[1].set_title("Lexicon Sentiment Distribution")
    axs[2].bar(["Indo-Pos", "Indo-Net", "Indo-Neg", "Lex-Pos", "Lex-Net", "Lex-Neg"],
               list(indo_counts.values) + list(lex_counts.values),
               color=["green", "yellow", "red", "green", "yellow", "red"])
    axs[2].set_title("Sentiment Comparison (Bar)")
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f'<img src="data:image/png;base64,{encoded}" alt="Sentiment charts"/>'

# Main entry point: fetch comments, classify them, and visualize the results.
def analyze_sentiment(url, jumlah):
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "Tidak ada komentar ditemukan"
    results = classify_sentiment(comments)
    df = pd.DataFrame(results, columns=["Komentar", "IndoBERT", "Lexicon", "Confidence"])
    chart = generate_visualization(results)
    return df, chart

gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="URL Video YouTube"),
        gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis"),
    ],
    outputs=[
        gr.Dataframe(label="Preview Komentar dan Sentimen"),
        gr.HTML(label="Visualisasi Komparatif"),
    ],
    title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT & Lexicon",
    description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya dengan 2 metode: IndoBERT Fine-Tuned dan Lexicon-Based.",
).launch()