hasbigani committed on
Commit
409b285
·
verified ·
1 Parent(s): 65b1a32

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import requests
4
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
5
+ import torch
6
+ import matplotlib.pyplot as plt
7
+ import pandas as pd
8
+ from io import BytesIO
9
+ import base64
10
+ import re
11
+
12
import os

# YouTube Data API key. It is read from the YOUTUBE_API_KEY environment
# variable so the credential is never stored in the source file; configure it
# as a secret in the deployment environment. Any key that was previously
# committed to the repository should be treated as leaked and revoked.
# When the variable is unset the key is an empty string.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
13
+
14
# Hugging Face Hub identifier of the fine-tuned Indonesian sentiment model.
model_name = "hanifnoerr/Fine-tuned-Indonesian-Sentiment-Classifier"

# Tokenizer and classifier are loaded once, at import time, and reused by
# classify_sentiment for every comment.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Keyword lexicons used by the rule-based classifier (classify_lexicon).
# Entries are lowercase Indonesian words or short phrases.
lexicon_pos = {
    "bagus",
    "luar biasa",
    "mantap",
    "terbaik",
    "menyenangkan",
    "indah",
    "hebat",
    "positif",
    "keren",
    "puas",
    "suka",
    "gokil",
    "bangga",
}
lexicon_neg = {
    "buruk",
    "jelek",
    "parah",
    "mengecewakan",
    "negatif",
    "gagal",
    "benci",
    "marah",
    "sedih",
    "tidak suka",
    "jijik",
    "sampah",
}
20
+
21
# Matches http(s) links and "www." hosts, case-insensitively. The word
# boundary and the literal dot keep ordinary words that merely contain "www"
# (e.g. "awwww") from being truncated.
_URL_PATTERN = re.compile(r"(?:https?://|\bwww\.)\S+", re.IGNORECASE)
# Anything that is neither a word character nor whitespace: punctuation,
# emoji and other symbols.
_SYMBOL_PATTERN = re.compile(r"[^\w\s]")
_DIGIT_PATTERN = re.compile(r"\d+")


def clean_text(text):
    """Normalise a raw comment before it is classified.

    Steps, in order:
      1. URLs are replaced by a space.
      2. Punctuation, emoji and other non-word symbols are replaced by a
         space, so words separated only by punctuation ("bagus,mantap")
         stay separate words.
      3. Digits are removed.
      4. The text is lower-cased and runs of whitespace are collapsed to a
         single space, with leading/trailing whitespace stripped.

    Args:
        text: The raw comment string.

    Returns:
        The cleaned, lower-cased string; may be empty when the comment
        contained nothing but URLs, symbols or digits.
    """
    text = _URL_PATTERN.sub(" ", text)
    text = _SYMBOL_PATTERN.sub(" ", text)
    text = _DIGIT_PATTERN.sub("", text)
    # split()/join collapses every whitespace run, including the spaces
    # introduced by the substitutions above.
    return " ".join(text.lower().split())
32
+
33
# The 11-character video ID follows one of these markers in the URL forms
# handled here: watch?v=ID, youtu.be/ID, /shorts/ID, /embed/ID and /live/ID.
_VIDEO_ID_PATTERN = re.compile(
    r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([\w-]{11})"
)


def extract_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Args:
        url: A YouTube URL string (watch, short-link, shorts, embed or live
            form). ``None`` or an empty string is accepted.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or no ID is found.
    """
    if not url:
        return None
    found = _VIDEO_ID_PATTERN.search(url)
    return found.group(1) if found else None
38
+
39
def get_youtube_comments(url, max_comments=100):
    """Fetch top-level comments of a YouTube video via the Data API v3.

    Pages through the ``commentThreads`` endpoint until ``max_comments``
    comments have been collected or no further page is available. Fetching
    is best-effort: a network error, a non-200 response or an undecodable
    body ends the loop, and whatever was collected so far is returned.

    Args:
        url: YouTube video URL; the video ID is taken from it with
            ``extract_video_id``.
        max_comments: Upper bound on the number of comments returned. It is
            converted with ``int()`` so a float value is accepted.

    Returns:
        A list of comment strings (plain text); empty when the URL has no
        recognisable video ID or nothing could be fetched.
    """
    video_id = extract_video_id(url)
    if not video_id:
        return []

    limit = int(max_comments)
    comments = []
    page_token = ""
    while len(comments) < limit:
        # Let requests encode the query string, and only ask for as many
        # results as are still needed (the request is capped at 100 per page).
        params = {
            "part": "snippet",
            "videoId": video_id,
            "key": API_KEY,
            "textFormat": "plainText",
            "maxResults": min(100, limit - len(comments)),
        }
        if page_token:
            params["pageToken"] = page_token

        try:
            response = requests.get(
                "https://www.googleapis.com/youtube/v3/commentThreads",
                params=params,
                timeout=10,  # never hang the UI on a stalled connection
            )
        except requests.RequestException:
            break
        if response.status_code != 200:
            break

        try:
            data = response.json()
        except ValueError:
            break

        for item in data.get("items", []):
            try:
                text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            except (KeyError, TypeError):
                # Skip malformed entries instead of aborting the whole fetch.
                continue
            comments.append(text)
            if len(comments) >= limit:
                break

        page_token = data.get("nextPageToken", "")
        if not page_token:
            break
    return comments
65
+
66
def classify_lexicon(comment, pos_terms=None, neg_terms=None):
    """Classify a comment by counting sentiment-lexicon hits.

    Terms are matched case-insensitively as whole words or phrases, never as
    substrings of longer words, so "suka" does not match inside "sukar".
    Longer terms are matched first and their text is consumed, so a phrase
    such as "tidak suka" is not also counted as a hit for "suka". Each
    lexicon term contributes at most one hit, however often it occurs.

    Args:
        comment: The comment text.
        pos_terms: Iterable of positive terms; defaults to the module-level
            ``lexicon_pos``.
        neg_terms: Iterable of negative terms; defaults to the module-level
            ``lexicon_neg``.

    Returns:
        "Positive", "Negative" or "Neutral" (equal counts, including zero).
    """
    positive = lexicon_pos if pos_terms is None else pos_terms
    negative = lexicon_neg if neg_terms is None else neg_terms

    candidates = [(term.lower(), "pos") for term in positive]
    candidates += [(term.lower(), "neg") for term in negative]
    # Longest first so multi-word phrases win over the words they contain;
    # the secondary key keeps the order deterministic for set inputs.
    candidates.sort(key=lambda entry: (-len(entry[0]), entry[0]))

    remaining = comment.lower()
    tally = {"pos": 0, "neg": 0}
    for term, polarity in candidates:
        pattern = rf"(?<!\w){re.escape(term)}(?!\w)"
        # Blank out matched text so shorter terms cannot re-match inside it.
        remaining, hits = re.subn(pattern, " ", remaining)
        if hits:
            tally[polarity] += 1

    if tally["pos"] > tally["neg"]:
        return "Positive"
    if tally["neg"] > tally["pos"]:
        return "Negative"
    return "Neutral"
77
+
78
def classify_sentiment(comments):
    """Label each comment with both the transformer model and the lexicon.

    Every comment is cleaned with ``clean_text`` first; the cleaned text is
    what both classifiers see and what is returned.

    Args:
        comments: Iterable of raw comment strings.

    Returns:
        A list of ``(cleaned_comment, model_label, lexicon_label,
        confidence)`` tuples, where ``confidence`` is the highest softmax
        probability produced by the model for that comment.
    """
    # NOTE(review): this index-to-label mapping is hard-coded; confirm that
    # it agrees with the loaded model's own label configuration.
    id_to_label = {0: "Negative", 1: "Neutral", 2: "Positive"}

    def _predict(text):
        # Run one comment through the tokenizer and model without tracking
        # gradients; return the predicted label and its probability.
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        best_index = torch.argmax(probabilities, dim=1).item()
        return id_to_label[best_index], torch.max(probabilities).item()

    rows = []
    for raw_comment in comments:
        cleaned = clean_text(raw_comment)
        model_label, confidence = _predict(cleaned)
        rows.append((cleaned, model_label, classify_lexicon(cleaned), confidence))
    return rows
98
+
99
def generate_visualization(results):
    """Render the sentiment distributions as an inline HTML image.

    Draws three panels side by side: a pie chart of the model labels, a pie
    chart of the lexicon labels, and a bar chart comparing the counts of
    both methods.

    Args:
        results: Rows of ``(comment, model_label, lexicon_label,
            confidence)`` as produced by ``classify_sentiment``.

    Returns:
        An ``<img>`` tag whose source is the PNG chart encoded as a base64
        data URI.
    """
    label_order = ["Positive", "Neutral", "Negative"]
    palette = ["green", "yellow", "red"]

    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    # reindex keeps a fixed label order and fills absent labels with 0 so
    # the colours always line up with the same sentiment.
    indo_counts = df["IndoBERT"].value_counts().reindex(label_order, fill_value=0)
    lex_counts = df["Lexicon"].value_counts().reindex(label_order, fill_value=0)

    fig, axs = plt.subplots(1, 3, figsize=(18, 5))
    buf = BytesIO()
    try:
        axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%', colors=palette)
        axs[0].set_title("IndoBERT Sentiment Distribution")

        axs[1].pie(lex_counts, labels=lex_counts.index, autopct='%1.1f%%', colors=palette)
        axs[1].set_title("Lexicon Sentiment Distribution")

        axs[2].bar(
            ["Indo-Pos", "Indo-Net", "Indo-Neg", "Lex-Pos", "Lex-Net", "Lex-Neg"],
            list(indo_counts.values) + list(lex_counts.values),
            color=palette + palette,
        )
        axs[2].set_title("Sentiment Comparison (Bar)")

        # Operate on this figure explicitly rather than on pyplot's implicit
        # "current figure".
        fig.tight_layout()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"<img src='data:image/png;base64,{encoded}'/>"
124
+
125
def analyze_sentiment(url, jumlah):
    """Run the full pipeline for one video: fetch, classify, visualise.

    Args:
        url: YouTube video URL.
        jumlah: Maximum number of comments to analyse.

    Returns:
        A ``(table, chart)`` pair: a DataFrame with one row per comment and
        the chart as an HTML string. When no comments were fetched, an empty
        DataFrame and a "no comments found" message are returned instead.
    """
    fetched = get_youtube_comments(url, max_comments=jumlah)
    if fetched:
        scored = classify_sentiment(fetched)
        table = pd.DataFrame(
            scored,
            columns=["Komentar", "IndoBERT", "Lexicon", "Confidence"],
        )
        return table, generate_visualization(scored)
    return pd.DataFrame(), "Tidak ada komentar ditemukan"
134
+
135
# Build the Gradio UI: a URL box and a comment-count slider feed
# analyze_sentiment, whose results are shown as a table and an HTML chart.
url_input = gr.Text(label="URL Video YouTube")
count_input = gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis")
table_output = gr.Dataframe(label="Preview Komentar dan Sentimen")
chart_output = gr.HTML(label="Visualisasi Komparatif")

demo = gr.Interface(
    fn=analyze_sentiment,
    inputs=[url_input, count_input],
    outputs=[table_output, chart_output],
    title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT & Lexicon",
    description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya dengan 2 metode: IndoBERT Fine-Tuned dan Lexicon-Based.",
)
# Start the web server as soon as the module is executed.
demo.launch()