import base64
import os
import re
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Read the YouTube Data API key from the environment; never hardcode credentials in source.
API_KEY = os.getenv("YOUTUBE_API_KEY", "")

model_name = "hanifnoerr/Fine-tuned-Indonesian-Sentiment-Classifier"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

lexicon_pos = {"bagus", "luar biasa", "mantap", "terbaik", "menyenangkan", "indah",
               "hebat", "positif", "keren", "puas", "suka", "gokil", "bangga"}
lexicon_neg = {"buruk", "jelek", "parah", "mengecewakan", "negatif", "gagal",
               "benci", "marah", "sedih", "tidak suka", "jijik", "sampah"}

# Clean a comment before classification.
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Lowercase the text
    text = text.lower()
    return text

# Extract the 11-character video ID from a YouTube URL.
def extract_video_id(url):
    match = re.search(r"(?:v=|youtu\.be/)([\w-]{11})", url)
    return match.group(1) if match else None

# Fetch top-level YouTube comments via the Data API v3, paging until max_comments.
def get_youtube_comments(url, max_comments=100):
    video_id = extract_video_id(url)
    if not video_id:
        return []
    comments = []
    next_page_token = ""
    while len(comments) < max_comments:
        api_url = (
            f"https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={API_KEY}"
            f"&textFormat=plainText&maxResults=100&pageToken={next_page_token}"
        )
        response = requests.get(api_url)
        if response.status_code != 200:
            break
        data = response.json()
        for item in data.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
            if len(comments) >= max_comments:
                break
        next_page_token = data.get("nextPageToken", "")
        if not next_page_token:
            break
    return comments

# Lexicon-based classification: count positive vs. negative keyword hits.
def classify_lexicon(comment):
    text = comment.lower()
    pos_count = sum(1 for word in lexicon_pos if word in text)
    neg_count = sum(1 for word in lexicon_neg if word in text)
    if pos_count > neg_count:
        return "Positive"
    elif neg_count > pos_count:
        return "Negative"
    else:
        return "Neutral"

# Classify comment sentiment with both IndoBERT and the lexicon.
def classify_sentiment(comments):
    results = []
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    # Clean the comments before sending them to the model.
    cleaned_comments = [clean_text(comment) for comment in comments]
    for comment in cleaned_comments:
        # Tokenize with the IndoBERT tokenizer.
        inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
        confidence = torch.max(probs).item()
        indo_label = label_map[predicted]
        lex_label = classify_lexicon(comment)
        results.append((comment, indo_label, lex_label, confidence))
    return results

# Build the comparison charts and return them as an embedded base64 PNG.
def generate_visualization(results):
    df = pd.DataFrame(results, columns=["Comment", "IndoBERT", "Lexicon", "Confidence"])
    fig, axs = plt.subplots(1, 3, figsize=(18, 5))
    indo_counts = df["IndoBERT"].value_counts().reindex(
        ["Positive", "Neutral", "Negative"], fill_value=0)
    axs[0].pie(indo_counts, labels=indo_counts.index, autopct='%1.1f%%',
               colors=["green", "yellow", "red"])
    axs[0].set_title("IndoBERT Sentiment Distribution")
    lex_counts = df["Lexicon"].value_counts().reindex(
        ["Positive", "Neutral", "Negative"], fill_value=0)
    axs[1].pie(lex_counts, labels=lex_counts.index, autopct='%1.1f%%',
               colors=["green", "yellow", "red"])
    axs[1].set_title("Lexicon Sentiment Distribution")
    axs[2].bar(["Indo-Pos", "Indo-Net", "Indo-Neg", "Lex-Pos", "Lex-Net", "Lex-Neg"],
               list(indo_counts.values) + list(lex_counts.values),
               color=["green", "yellow", "red", "green", "yellow", "red"])
    axs[2].set_title("Sentiment Comparison (Bar)")
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f'<img src="data:image/png;base64,{encoded}" alt="Sentiment charts"/>'

# Main entry point: fetch comments, classify them, and visualize the results.
def analyze_sentiment(url, jumlah):
    comments = get_youtube_comments(url, max_comments=jumlah)
    if not comments:
        return pd.DataFrame(), "Tidak ada komentar ditemukan"
    results = classify_sentiment(comments)
    df = pd.DataFrame(results, columns=["Komentar", "IndoBERT", "Lexicon", "Confidence"])
    chart = generate_visualization(results)
    return df, chart

gr.Interface(
    fn=analyze_sentiment,
    inputs=[
        gr.Text(label="URL Video YouTube"),
        gr.Slider(10, 200, value=50, step=10, label="Jumlah komentar yang dianalisis"),
    ],
    outputs=[
        gr.Dataframe(label="Preview Komentar dan Sentimen"),
        gr.HTML(label="Visualisasi Komparatif"),
    ],
    title="Analisis Komentar YouTube 🇮🇩 dengan IndoBERT & Lexicon",
    description="Masukkan URL YouTube dan sistem akan menarik komentar dan menganalisisnya dengan 2 metode: IndoBERT Fine-Tuned dan Lexicon-Based.",
).launch()