"""Gradio app: fetch YouTube comments and classify their sentiment with IndoBERT.

The user supplies a YouTube video URL; the app pulls top-level comments via the
YouTube Data API v3, cleans them, runs them through the
``hasbigani/indobertsentiment`` model, and shows a table plus pie/bar charts.
"""

import base64
import os
import re
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# IndoBERT sentiment model with 3 output classes (see LABEL_MAP below).
MODEL_NAME = "hasbigani/indobertsentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# SECURITY: the original source embedded a live Google API key directly in the
# request URL. Read it from the environment instead — and revoke the old key.
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY", "")

# Class-index -> human-readable label for the model's logits.
LABEL_MAP = {0: "Negative", 1: "Neutral", 2: "Positive"}


def clean_text(text):
    """Normalize a comment before classification.

    Removes URLs, emoji/punctuation (anything not word/space), and digits,
    then lowercases the result.
    """
    text = re.sub(r'http\S+|www\S+', '', text)  # strip URLs
    text = re.sub(r'[^\w\s]', '', text)         # strip emoji / punctuation
    text = re.sub(r'\d+', '', text)             # strip digits
    return text.lower()


def extract_video_id(url):
    """Return the 11-character YouTube video ID from *url*, or None."""
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None


def get_youtube_comments(url, max_comments=100):
    """Fetch up to *max_comments* top-level comments for a YouTube video.

    Uses the YouTube Data API v3 ``commentThreads`` endpoint, following
    ``nextPageToken`` pagination. Returns a (possibly empty) list of comment
    strings; any HTTP error simply stops pagination (best-effort).
    """
    video_id = extract_video_id(url)
    if not video_id:
        return []

    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            "https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={YOUTUBE_API_KEY}"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        # Timeout added: the original call could hang indefinitely.
        response = requests.get(api_url, timeout=30)
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comments.append(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            )
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments


def classify_sentiment(comments):
    """Classify each comment's sentiment with IndoBERT.

    Comments are cleaned with :func:`clean_text` first. Returns a list of
    ``(cleaned_comment, label, confidence)`` tuples, where *label* is one of
    "Negative"/"Neutral"/"Positive" and *confidence* is the softmax maximum.
    """
    results = []
    cleaned_comments = [clean_text(comment) for comment in comments]
    for comment in cleaned_comments:
        inputs = tokenizer(
            comment, return_tensors="pt", truncation=True, padding=True
        )
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        results.append((comment, LABEL_MAP[predicted], confidence))
    return results


def generate_visualization(results):
    """Render pie + bar charts of the sentiment distribution.

    Returns an ``<img>`` HTML tag with the chart embedded as a base64 PNG
    data URI, suitable for a ``gr.HTML`` output.
    """
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Confidence"])
    fig, axs = plt.subplots(1, 2, figsize=(18, 5))

    indo_counts = df["IndoBERT"].value_counts().reindex(
        ["Positive", "Neutral", "Negative"], fill_value=0
    )
    axs[0].pie(
        indo_counts,
        labels=indo_counts.index,
        autopct='%1.1f%%',
        colors=["green", "yellow", "red"],
    )
    axs[0].set_title("IndoBERT Sentiment Distribution")
    axs[1].bar(
        ["Positive", "Neutral", "Negative"],
        indo_counts.values,
        color=["green", "yellow", "red"],
    )
    axs[1].set_title("Sentiment Comparison (Bar)")

    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    # Bug fix: the original returned an empty f-string (f""), so the chart
    # never appeared in the gr.HTML output. Embed the PNG as a data URI.
    return f'<img src="data:image/png;base64,{encoded}" alt="Sentiment charts"/>'


def analyze_sentiment(url, jumlah):
    """Gradio handler: fetch, classify, and visualize comments for *url*.

    Returns ``(dataframe, html_chart)``; when no comments are found, returns
    an empty DataFrame and an Indonesian "no comments found" message.
    """
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "Tidak ada komentar ditemukan"
    results = classify_sentiment(comments)
    df = pd.DataFrame(results, columns=["Komentar", "IndoBERT", "Confidence"])
    chart = generate_visualization(results)
    return df, chart


gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="URL Video YouTube"),
        gr.Slider(10, 200, value=50, step=10,
                  label="Jumlah komentar yang dianalisis"),
    ],
    outputs=[
        gr.Dataframe(label="Preview Komentar dan Sentimen"),
        gr.HTML(label="Visualisasi Sentimen"),
    ],
    title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT",
    description=(
        "Masukkan URL YouTube dan sistem akan menarik komentar dan "
        "menganalisisnya menggunakan model IndoBERT."
    ),
).launch()