hasbigani committed · Commit 7cfa7d7 · verified · 1 Parent(s): 7112c24

Update app.py

Files changed (1):
  app.py +128 -28
app.py CHANGED
@@ -1,36 +1,136 @@
  import gradio as gr
+ import requests
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import torch
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ import base64
+ import os
+ import re
+ from io import BytesIO

- # Replace with the name of your model repository
+ # Model currently in use: hasbigani/indobertsentiment
  model_name = "hasbigani/indobertsentiment"
-
- # Load model & tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)

- # Label mapping (adjust to match your model's labels)
- label_map = {
-     0: "Negatif",
-     1: "Netral",
-     2: "Positif"
- }
-
- # Prediction function
- def predict_sentiment(text):
-     inputs = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
-     with torch.no_grad():
-         outputs = model(**inputs)
-     pred = torch.argmax(outputs.logits, dim=-1).item()
-     return label_map[pred]
-
- # Gradio interface
- iface = gr.Interface(
-     fn=predict_sentiment,
-     inputs=gr.Textbox(lines=3, placeholder="Tulis teks di sini..."),
-     outputs=gr.Label(),
-     title="Demo Sentimen IndoBERT",
-     description="Masukkan kalimat berbahasa Indonesia untuk menguji model sentimen yang sudah diupload di Hugging Face."
- )
-
- iface.launch()
+ # Clean a comment before classification
+ def clean_text(text):
+     # Remove URLs
+     text = re.sub(r'http\S+|www\S+', '', text)
+     # Remove emoji, punctuation, and other non-word characters
+     text = re.sub(r'[^\w\s]', '', text)
+     # Remove digits
+     text = re.sub(r'\d+', '', text)
+     # Lowercase the text
+     text = text.lower()
+     return text
+
+ # Extract the 11-character video ID from a YouTube URL
+ def extract_video_id(url):
+     match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
+     return match.group(1) if match else None
+
+ # Fetch top-level comments through the YouTube Data API v3
+ def get_youtube_comments(url, max_comments=100):
+     video_id = extract_video_id(url)
+     if not video_id:
+         return []
+     # Read the API key from the environment (e.g., a Space secret)
+     # rather than hardcoding it in the source.
+     api_key = os.environ.get("YOUTUBE_API_KEY", "")
+     comments = []
+     next_page_token = ""
+     while len(comments) < max_comments:
+         api_url = (
+             f"https://www.googleapis.com/youtube/v3/commentThreads"
+             f"?part=snippet&videoId={video_id}&key={api_key}"
+             f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
+         )
+         response = requests.get(api_url)
+         if response.status_code != 200:
+             break
+         data = response.json()
+         for item in data.get("items", []):
+             comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
+             comments.append(comment)
+             if len(comments) >= max_comments:
+                 break
+         next_page_token = data.get("nextPageToken", "")
+         if not next_page_token:
+             break
+     return comments
+
+ # Classify comment sentiment with IndoBERT
+ def classify_sentiment(comments):
+     results = []
+     label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
+
+     # Clean the comments before sending them to the model
+     cleaned_comments = [clean_text(comment) for comment in comments]
+
+     for comment in cleaned_comments:
+         # Tokenize with the IndoBERT tokenizer
+         inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
+         with torch.no_grad():
+             outputs = model(**inputs)
+         probs = torch.nn.functional.softmax(outputs.logits, dim=1)
+         predicted = torch.argmax(probs, dim=1).item()
+         confidence = torch.max(probs).item()
+         indo_label = label_map[predicted]
+         results.append((comment, indo_label, confidence))
+     return results
+
+ # Render the sentiment distribution as an inline base64 <img> tag
+ def generate_visualization(results):
+     df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Confidence"])
+     fig, axs = plt.subplots(1, 2, figsize=(18, 5))
+
+     indo_counts = df["IndoBERT"].value_counts().reindex(["Positive", "Neutral", "Negative"], fill_value=0)
+     axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%', colors=["green", "yellow", "red"])
+     axs[0].set_title("IndoBERT Sentiment Distribution")
+
+     axs[1].bar(["Positive", "Neutral", "Negative"],
+                indo_counts.values, color=["green", "yellow", "red"])
+     axs[1].set_title("Sentiment Comparison (Bar)")
+
+     buf = BytesIO()
+     plt.tight_layout()
+     plt.savefig(buf, format="png")
+     buf.seek(0)
+     encoded = base64.b64encode(buf.read()).decode("utf-8")
+     plt.close(fig)
+     return f"<img src='data:image/png;base64,{encoded}'/>"
+
+ # Build the thumbnail URL for a YouTube video
+ def get_thumbnail(url):
+     video_id = extract_video_id(url)
+     if video_id:
+         return f"https://img.youtube.com/vi/{video_id}/0.jpg"
+     return None
+
+ # Main function wiring fetching, classification, and visualization together
+ def analyze_sentiment(url, jumlah):
+     comments = get_youtube_comments(url, max_comments=jumlah)
+     if not comments:
+         return pd.DataFrame(), "Tidak ada komentar ditemukan", None
+     results = classify_sentiment(comments)
+     df = pd.DataFrame(results, columns=["Komentar", "IndoBERT", "Confidence"])
+     chart = generate_visualization(results)
+     thumbnail_url = get_thumbnail(url)
+     return df, chart, thumbnail_url
+
+ demo = gr.Interface(
+     fn=analyze_sentiment,
+     inputs=[
+         gr.Text(label="URL Video YouTube"),
+         gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis")
+     ],
+     outputs=[
+         gr.Dataframe(label="Preview Komentar dan Sentimen"),
+         gr.HTML(label="Visualisasi Sentimen"),
+         # gr.Image accepts a URL string as its value; "url" is not a valid type
+         gr.Image(label="Thumbnail Video YouTube")
+     ],
+     title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT",
+     description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya menggunakan model IndoBERT."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
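To sanity-check the new pipeline without opening the UI, a minimal local sketch like the one below can be used. It assumes the file above is saved as app.py on the import path, that gradio, transformers, torch, matplotlib, pandas, and requests are installed, and that a YouTube Data API v3 key is exported as YOUTUBE_API_KEY (that variable name is this edit's convention, not part of the original commit); the video URL is a placeholder. The __main__ guard in app.py keeps the Gradio app from launching on import.

# smoke_test.py — hypothetical local check; not part of the commit.
import os

# Fail fast if the (assumed) YOUTUBE_API_KEY secret is not set.
assert os.environ.get("YOUTUBE_API_KEY"), "export YOUTUBE_API_KEY first"

from app import get_youtube_comments, classify_sentiment

# Placeholder URL: replace VIDEO_ID_XYZ with a real 11-character video ID.
comments = get_youtube_comments("https://youtu.be/VIDEO_ID_XYZ", max_comments=10)
for text, label, confidence in classify_sentiment(comments):
    print(f"{label} ({confidence:.2f}) {text[:60]}")

Importing app still loads the tokenizer and model once, so the first run downloads the checkpoint; only the interface launch is skipped.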