File size: 5,651 Bytes
409b285
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

import base64
import os
import re
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# SECURITY NOTE(review): this Google API key was previously hard-coded in
# source. It is now read from the YOUTUBE_API_KEY environment variable; the
# literal fallback keeps existing deployments working, but the exposed key
# should be rotated and the fallback removed.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "AIzaSyCsgA_lFc6rQTHiHWWDikYQDEHU8rtbygU")

# Fine-tuned Indonesian sentiment model (3 classes; see label_map in
# classify_sentiment). Loading happens at import time and may download weights.
model_name = "hanifnoerr/Fine-tuned-Indonesian-Sentiment-Classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Hand-picked Indonesian polarity lexicons used by classify_lexicon.
# Matching is plain substring containment on lowercased text.
lexicon_pos = {"bagus", "luar biasa", "mantap", "terbaik", "menyenangkan", "indah", "hebat", "positif", "keren", "puas", "suka", "gokil", "bangga"}
lexicon_neg = {"buruk", "jelek", "parah", "mengecewakan", "negatif", "gagal", "benci", "marah", "sedih", "tidak suka", "jijik", "sampah"}

# Fungsi untuk membersihkan teks
def clean_text(text):
    """Normalize a comment for analysis: strip URLs, then lowercase.

    Args:
        text: Raw comment text.

    Returns:
        The text with any http/https/www URLs removed and all characters
        lowercased.
    """
    without_urls = re.sub(r'http\S+|www\S+', '', text)
    return without_urls.lower()

# Fungsi untuk mengambil ID video dari URL YouTube
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Handles both "watch?v=<id>" and "youtu.be/<id>" forms.

    Args:
        url: A YouTube video URL.

    Returns:
        The video ID string, or None when no ID can be found.
    """
    # Fix: dropped the redundant function-local `import re` — the module
    # already imports `re` at the top of the file.
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None

# Fungsi untuk mendapatkan komentar YouTube
def get_youtube_comments(url, max_comments=100):
    """Fetch up to ``max_comments`` top-level comments for a YouTube video.

    Pages through the YouTube Data API v3 ``commentThreads`` endpoint
    (best-effort: any HTTP error or network failure stops pagination and
    whatever was collected so far is returned).

    Args:
        url: YouTube video URL; the video ID is extracted from it.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of plain-text comment strings (possibly empty).
    """
    video_id = extract_video_id(url)
    if not video_id:
        return []
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        # Fix: build the query via `params=` so values are URL-encoded,
        # instead of interpolating them into the URL string by hand.
        params = {
            "part": "snippet",
            "videoId": video_id,
            "key": API_KEY,
            "textFormat": "plainText",
            "maxResults": 100,
            "pageToken": next_page_token,
        }
        try:
            # Fix: added a timeout — without one a stalled request would
            # hang the Gradio UI indefinitely.
            response = requests.get(
                "https://www.googleapis.com/youtube/v3/commentThreads",
                params=params,
                timeout=15,
            )
        except requests.RequestException:
            break  # network failure: keep the best-effort behavior
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comments.append(item["snippet"]["topLevelComment"]["snippet"]["textDisplay"])
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments

# Fungsi untuk klasifikasi berbasis lexicon
def classify_lexicon(comment):
    """Classify a comment by counting lexicon hits.

    Counts how many positive and negative lexicon terms occur as
    substrings of the lowercased comment; the larger count wins.

    Args:
        comment: The comment text.

    Returns:
        "Positive", "Negative", or "Neutral" (on a tie, including 0-0).
    """
    lowered = comment.lower()
    positives = sum(term in lowered for term in lexicon_pos)
    negatives = sum(term in lowered for term in lexicon_neg)
    if positives == negatives:
        return "Neutral"
    return "Positive" if positives > negatives else "Negative"

# Fungsi untuk mengklasifikasikan sentimen komentar menggunakan IndoBERT dan Lexicon
def classify_sentiment(comments):
    """Label every comment with both IndoBERT and lexicon sentiment.

    Each comment is cleaned (URLs stripped, lowercased) before being fed
    to the model and to the lexicon classifier.

    Args:
        comments: Iterable of raw comment strings.

    Returns:
        A list of tuples ``(cleaned_comment, indobert_label,
        lexicon_label, confidence)`` where confidence is the model's
        softmax probability for its predicted class.
    """
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    results = []

    for raw in comments:
        text = clean_text(raw)
        # Single-example inference with the fine-tuned IndoBERT model.
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**encoded).logits
        probs = torch.nn.functional.softmax(logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        results.append((text, label_map[predicted], classify_lexicon(text), confidence))

    return results

# Fungsi untuk menghasilkan visualisasi data
def generate_visualization(results):
    """Render the sentiment charts as an inline base64 ``<img>`` tag.

    Draws two pie charts (IndoBERT vs. lexicon distribution) and one bar
    chart comparing both methods side by side.

    Args:
        results: Tuples of (comment, indobert_label, lexicon_label,
            confidence) as produced by classify_sentiment.

    Returns:
        An HTML ``<img>`` string embedding the PNG as a data URI.
    """
    frame = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    order = ["Positive", "Neutral", "Negative"]
    palette = ["green", "yellow", "red"]

    fig, (ax_indo, ax_lex, ax_bar) = plt.subplots(1, 3, figsize=(18, 5))

    # Reindex so missing classes still appear (as zero) in a fixed order.
    indo_counts = frame["IndoBERT"].value_counts().reindex(order, fill_value=0)
    ax_indo.pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%', colors=palette)
    ax_indo.set_title("IndoBERT Sentiment Distribution")

    lex_counts = frame["Lexicon"].value_counts().reindex(order, fill_value=0)
    ax_lex.pie(lex_counts, labels=lex_counts.index, autopct='%1.1f%%', colors=palette)
    ax_lex.set_title("Lexicon Sentiment Distribution")

    ax_bar.bar(["Indo-Pos", "Indo-Net", "Indo-Neg", "Lex-Pos", "Lex-Net", "Lex-Neg"],
               list(indo_counts.values) + list(lex_counts.values),
               color=palette * 2)
    ax_bar.set_title("Sentiment Comparison (Bar)")

    # Serialize the figure to PNG in memory and embed it as a data URI.
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close()
    return f"<img src='data:image/png;base64,{encoded}'/>"

# Fungsi utama untuk analisis sentimen
def analyze_sentiment(url, jumlah):
    """Gradio entry point: fetch, classify, and visualize video comments.

    Args:
        url: YouTube video URL.
        jumlah: Number of comments to analyze.

    Returns:
        A ``(DataFrame, html)`` pair: the per-comment results table and an
        HTML chart. When no comments are found, the DataFrame is empty and
        the second element is an error message instead of a chart.
    """
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "Tidak ada komentar ditemukan"
    rows = classify_sentiment(comments)
    table = pd.DataFrame(rows, columns=["Komentar", "IndoBERT", "Lexicon", "Confidence"])
    return table, generate_visualization(rows)

# Gradio UI wiring: a URL textbox plus a comment-count slider feed
# analyze_sentiment; outputs are the results table and the HTML chart.
# launch() starts the web server when the module is run.
gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="URL Video YouTube"),
        gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis")
    ],
    outputs=[
        gr.Dataframe(label="Preview Komentar dan Sentimen"),
        gr.HTML(label="Visualisasi Komparatif")
    ],
    title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT & Lexicon",
    description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya dengan 2 metode: IndoBERT Fine-Tuned dan Lexicon-Based."
).launch()