# Source: Hugging Face Space "testing" — app.py by hasbigani
# (commit 46269ce, verified). Header converted to comments so the
# file is valid Python.
# Standard library
import base64
import os
import re
from io import BytesIO

# Third-party
import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Model currently in use: hasbigani/indobertsentiment — an IndoBERT
# sequence-classification checkpoint (3 labels, mapped to
# Negative/Neutral/Positive further down in classify_sentiment).
# NOTE: loading happens at import time and downloads weights on first run.
model_name = "hasbigani/indobertsentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Text-cleaning helper applied to every comment before tokenization.
def clean_text(text):
    """Normalize a raw comment string.

    Strips URLs, then any character that is neither a word character nor
    whitespace (punctuation, emoji), then digit runs, and finally
    lowercases what remains.
    """
    # Substitutions run in this exact order; each match is deleted.
    removal_patterns = (
        r'http\S+|www\S+',  # URLs
        r'[^\w\s]',         # punctuation / emoji / other non-word symbols
        r'\d+',             # digit runs
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.lower()
# Extract the 11-character YouTube video ID from a URL.
def extract_video_id(url):
    """Return the 11-char video ID from a YouTube URL, or None if absent.

    Recognizes watch URLs (?v=...), youtu.be short links, and — as a
    backward-compatible extension — /shorts/ and /embed/ paths.
    (Also removed a redundant function-local `import re`; `re` is already
    imported at module level.)
    """
    match = re.search(r"(?:v=|youtu\.be/|shorts/|embed/)([\w-]{11})", url)
    return match.group(1) if match else None
# Fetch top-level comments for a YouTube video via the Data API v3.
def get_youtube_comments(url, max_comments=100, api_key=None):
    """Return up to `max_comments` top-level comment strings.

    Args:
        url: any YouTube URL understood by extract_video_id.
        max_comments: upper bound on comments collected.
        api_key: YouTube Data API key; when None, read from the
            YOUTUBE_API_KEY environment variable, falling back to the
            legacy embedded key for backward compatibility.

    Returns:
        A list of plain-text comments; empty when the URL is invalid,
        the request fails, or the API returns a non-200 status.
    """
    video_id = extract_video_id(url)
    if not video_id:
        return []
    if api_key is None:
        # SECURITY: the embedded fallback key is committed to a public repo
        # and should be revoked/rotated; set YOUTUBE_API_KEY instead.
        api_key = os.environ.get(
            "YOUTUBE_API_KEY", "AIzaSyCsgA_lFc6rQTHiHWWDikYQDEHU8rtbygU"
        )
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            "https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={api_key}"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        try:
            # Timeout so a stalled API call cannot hang the Gradio request.
            response = requests.get(api_url, timeout=10)
        except requests.RequestException:
            break  # best-effort: return whatever we collected so far
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comments.append(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            )
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments
# Run IndoBERT sentiment classification over a list of raw comments.
def classify_sentiment(comments):
    """Classify each comment's sentiment with the IndoBERT model.

    Returns a list of (cleaned_comment, label, confidence) tuples, where
    label is "Negative"/"Neutral"/"Positive" (model indices 0/1/2) and
    confidence is the softmax probability of the predicted class.
    """
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    results = []
    for raw_comment in comments:
        cleaned = clean_text(raw_comment)  # same preprocessing as before tokenizing
        encoded = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        predicted_index = int(probabilities.argmax(dim=1).item())
        confidence = float(probabilities.max().item())
        results.append((cleaned, label_map[predicted_index], confidence))
    return results
# Render sentiment-distribution charts and return them as an inline <img> tag.
def generate_visualization(results):
    """Build a pie + bar chart of IndoBERT sentiment counts.

    Args:
        results: iterable of (comment, label, confidence) tuples, as
            produced by classify_sentiment.

    Returns:
        An HTML <img> string embedding the chart as a base64 PNG.
    """
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Confidence"])
    # Fixed label order so colors stay aligned: green/yellow/red.
    order = ["Positive", "Neutral", "Negative"]
    colors = ["green", "yellow", "red"]
    counts = df["IndoBERT"].value_counts().reindex(order, fill_value=0)

    fig, axs = plt.subplots(1, 2, figsize=(18, 5))
    axs[0].pie(counts, labels=counts.index, autopct='%1.1f%%', colors=colors)
    axs[0].set_title("IndoBERT Sentiment Distribution")
    axs[1].bar(order, counts.values, color=colors)
    axs[1].set_title("Sentiment Comparison (Bar)")

    buf = BytesIO()
    # Operate on the figure object instead of pyplot's global "current
    # figure": concurrent Gradio requests could otherwise clobber or close
    # each other's figures. try/finally guarantees the figure is released
    # even if rendering raises, preventing a memory leak.
    try:
        fig.tight_layout()
        fig.savefig(buf, format="png")
    finally:
        plt.close(fig)
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    return f"<img src='data:image/png;base64,{encoded}'/>"
# Orchestrator wired to the Gradio interface: fetch, classify, visualize.
def analyze_sentiment(url, jumlah):
    """Fetch up to `jumlah` comments from the video at `url`, classify
    them, and return a (DataFrame, chart-HTML) pair for Gradio."""
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        # No comments found (invalid URL, API failure, or none available).
        return pd.DataFrame(), "Tidak ada komentar ditemukan"
    classified = classify_sentiment(comments)
    table = pd.DataFrame(classified, columns=["Komentar", "IndoBERT", "Confidence"])
    return table, generate_visualization(classified)
# Build and launch the Gradio web UI. launch() blocks and serves the app;
# this runs at import time of app.py (presumably a Hugging Face Space —
# confirm against the deployment setup). UI labels are intentionally in
# Indonesian and must stay as-is.
gr.Interface(
fn=analyze_sentiment,
inputs=[
# YouTube video URL to analyze.
gr.Text(label="URL Video YouTube"),
# Number of comments to fetch: 10-200 in steps of 10, default 50.
gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis")
],
outputs=[
# Table of (comment, label, confidence) rows from analyze_sentiment.
gr.Dataframe(label="Preview Komentar dan Sentimen"),
# Base64-embedded chart image produced by generate_visualization.
gr.HTML(label="Visualisasi Sentimen")
],
title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT",
description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya menggunakan model IndoBERT."
).launch()