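"""Gradio app that fetches comments from a YouTube video via the YouTube Data API
and classifies their sentiment with the IndoBERT model hasbigani/indobertsentiment."""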
import gradio as gr
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import matplotlib.pyplot as plt
import pandas as pd
from io import BytesIO
import base64
import re
# Model currently in use: hasbigani/indobertsentiment
model_name = "hasbigani/indobertsentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Clean comment text before classification
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emoji, punctuation, and other non-word characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Lowercase the text
    text = text.lower()
    return text
# Extract the video ID from a YouTube URL
def extract_video_id(url):
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None
# Fetch top-level comments for a video via the YouTube Data API v3
def get_youtube_comments(url, max_comments=100):
    video_id = extract_video_id(url)
    if not video_id:
        return []
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            f"https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key=AIzaSyCsgA_lFc6rQTHiHWWDikYQDEHU8rtbygU"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        response = requests.get(api_url)
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments
# Classify comment sentiment using IndoBERT
def classify_sentiment(comments):
    results = []
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    # Clean the comments before sending them to the model
    cleaned_comments = [clean_text(comment) for comment in comments]
    for comment in cleaned_comments:
        # Tokenize with the IndoBERT tokenizer
        inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        indo_label = label_map[predicted]
        results.append((comment, indo_label, confidence))
    return results
# Generate the sentiment charts and return them as an inline HTML image
def generate_visualization(results):
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Confidence"])
    fig, axs = plt.subplots(1, 2, figsize=(18, 5))
    indo_counts = df["IndoBERT"].value_counts().reindex(["Positive", "Neutral", "Negative"], fill_value=0)
    # Pie chart of the sentiment distribution
    axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%', colors=["green", "yellow", "red"])
    axs[0].set_title("IndoBERT Sentiment Distribution")
    # Bar chart of the same counts
    axs[1].bar(["Positive", "Neutral", "Negative"],
               indo_counts.values, color=["green", "yellow", "red"])
    axs[1].set_title("Sentiment Comparison (Bar)")
    # Encode the figure as base64 so it can be embedded in the HTML output
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f"<img src='data:image/png;base64,{encoded}'/>"
# Main pipeline: fetch comments, classify them, and build the visualization
def analyze_sentiment(url, jumlah):
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "Tidak ada komentar ditemukan"
    results = classify_sentiment(comments)
    df = pd.DataFrame(results, columns=["Komentar", "IndoBERT", "Confidence"])
    chart = generate_visualization(results)
    return df, chart
gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="URL Video YouTube"),
        gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis")
    ],
    outputs=[
        gr.Dataframe(label="Preview Komentar dan Sentimen"),
        gr.HTML(label="Visualisasi Sentimen")
    ],
    title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT",
    description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya menggunakan model IndoBERT."
).launch()