import os
import re
import base64
from io import BytesIO

import gradio as gr
import requests
import torch
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# YouTube Data API v3 key, read from the environment instead of being hardcoded.
# Never commit credentials to a public Space.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
# Load the fine-tuned IndoBERT sentiment model and its tokenizer
model_name = "hanifnoerr/Fine-tuned-Indonesian-Sentiment-Classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Small hand-picked Indonesian sentiment lexicons for the rule-based classifier
lexicon_pos = {"bagus", "luar biasa", "mantap", "terbaik", "menyenangkan", "indah", "hebat", "positif", "keren", "puas", "suka", "gokil", "bangga"}
lexicon_neg = {"buruk", "jelek", "parah", "mengecewakan", "negatif", "gagal", "benci", "marah", "sedih", "tidak suka", "jijik", "sampah"}

# Clean raw comment text before classification
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Lowercase the text
    text = text.lower()
    return text
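# Illustrative example (hypothetical input):
#   clean_text("Keren! http://contoh.com/x") -> "keren! "
# (the trailing space is left by the removed URL; the tokenizer ignores it)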

# Extract the 11-character video ID from a YouTube URL
def extract_video_id(url):
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None
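# Illustrative examples: both "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# and "https://youtu.be/dQw4w9WgXcQ" yield "dQw4w9WgXcQ".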

# Fetch up to max_comments top-level comments via the YouTube Data API v3
def get_youtube_comments(url, max_comments=100):
    video_id = extract_video_id(url)
    if not video_id:
        return []
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            "https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={API_KEY}"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        response = requests.get(api_url)
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments
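# Note: commentThreads returns at most 100 results per page; nextPageToken
# drives pagination. On any non-200 response (invalid key, comments disabled,
# quota exhausted) the function simply returns whatever it has collected so far.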

# Rule-based classification: count lexicon hits in the lowercased text
def classify_lexicon(comment):
    text = comment.lower()
    pos_count = sum(1 for word in lexicon_pos if word in text)
    neg_count = sum(1 for word in lexicon_neg if word in text)
    if pos_count > neg_count:
        return "Positive"
    elif neg_count > pos_count:
        return "Negative"
    else:
        return "Neutral"

# Classify comment sentiment with both IndoBERT and the lexicon
def classify_sentiment(comments):
    results = []
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    # Clean the comments before feeding them to the model
    cleaned_comments = [clean_text(comment) for comment in comments]
    for comment in cleaned_comments:
        # Tokenize with the IndoBERT tokenizer
        inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        indo_label = label_map[predicted]
        lex_label = classify_lexicon(comment)
        results.append((comment, indo_label, lex_label, confidence))
    return results
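# Each result is a (comment, IndoBERT label, lexicon label, confidence) tuple,
# where confidence is the softmax probability of the predicted IndoBERT class.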

# Render the comparison charts and return them as an inline base64 <img> tag
def generate_visualization(results):
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))
    indo_counts = df["IndoBERT"].value_counts().reindex(["Positive", "Neutral", "Negative"], fill_value=0)
    axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%', colors=["green", "yellow", "red"])
    axs[0].set_title("IndoBERT Sentiment Distribution")
    lex_counts = df["Lexicon"].value_counts().reindex(["Positive", "Neutral", "Negative"], fill_value=0)
    axs[1].pie(lex_counts, labels=lex_counts.index, autopct='%1.1f%%', colors=["green", "yellow", "red"])
    axs[1].set_title("Lexicon Sentiment Distribution")
    axs[2].bar(["Indo-Pos", "Indo-Net", "Indo-Neg", "Lex-Pos", "Lex-Net", "Lex-Neg"],
               list(indo_counts.values) + list(lex_counts.values),
               color=["green", "yellow", "red", "green", "yellow", "red"])
    axs[2].set_title("Sentiment Comparison (Bar)")
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f"<img src='data:image/png;base64,{encoded}'/>"

# Main entry point: fetch comments and run both analyses
def analyze_sentiment(url, jumlah):
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "No comments found"
    results = classify_sentiment(comments)
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    chart = generate_visualization(results)
    return df, chart

gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="YouTube Video URL"),
        gr.Slider(10, 200, value=50, step=10, label="Number of comments to analyze")
    ],
    outputs=[
        gr.Dataframe(label="Comment and Sentiment Preview"),
        gr.HTML(label="Comparative Visualization")
    ],
    title="YouTube Comment Analysis 🇮🇩 with IndoBERT & Lexicon",
    description="Enter a YouTube URL and the system will pull the comments and analyze them with two methods: fine-tuned IndoBERT and lexicon-based."
).launch()
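# To run locally (a minimal sketch; the filename and env var name are choices
# made in this file, and package versions are not pinned):
#   pip install gradio requests transformers torch matplotlib pandas
#   export YOUTUBE_API_KEY="your-api-key"
#   python app.py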