import re
import torch
import torch.nn.functional as F
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Local directory holding the model and tokenizer files
# (the folder uploaded to the Space).
MODEL_DIR = "bert_classifier"

# Token limit handed to the tokenizer; longer inputs are truncated.
MAX_LENGTH = 128

# Predicted class index -> label shown to the user.
LABELS = {
    0: "🎭 Meme",
    1: "📰 Real Event",
}

# Patterns used by clean_tweet; they are applied to already-lowercased text.
# http(s):// links and bare "www." links.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# @username mentions.
_MENTION_RE = re.compile(r"@\w+")
# Only the '#' sign itself; the hashtag word is kept.
_HASHTAG_RE = re.compile(r"#")
# Anything that is not a lowercase ASCII letter, a digit, or whitespace.
_NON_WORD_RE = re.compile(r"[^a-z0-9\s]")
# Runs of whitespace, collapsed to a single space by clean_tweet.
_WS_RE = re.compile(r"\s+")

def clean_tweet(text: str) -> str:
    """Normalize raw tweet text before tokenization.

    The text is lowercased, then URLs, @mentions, '#' signs and every
    character other than a-z, 0-9 or whitespace are each replaced by a
    space. Finally, whitespace runs are collapsed to single spaces and the
    result is stripped.

    Args:
        text: The raw tweet.

    Returns:
        The cleaned text; may be empty if nothing survives the filters.
    """
    cleaned = text.lower()
    # Order matters: URLs and mentions must be removed while their
    # punctuation ("://", "@") is still present, and whitespace is collapsed
    # last because every earlier substitution inserts spaces.
    ordered_patterns = (_URL_RE, _MENTION_RE, _HASHTAG_RE, _NON_WORD_RE, _WS_RE)
    for pattern in ordered_patterns:
        cleaned = pattern.sub(" ", cleaned)
    return cleaned.strip()

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and classifier are both loaded once, at import time, from the
# same local directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)
model.eval()  # evaluation mode: this app only performs inference

@torch.no_grad()
def classify(text: str) -> dict:
    """Classify a tweet as a meme or a real event.

    Args:
        text: Raw tweet text as entered in the UI.

    Returns:
        A dict for the ``gr.JSON`` output. On success it contains
        ``"Label"`` (the predicted label), ``"Confidence"`` (probability of
        the predicted class) and ``"P(meme)"`` / ``"P(real)"`` (per-class
        probabilities), all formatted as strings. When there is nothing to
        classify it contains a single ``"Error"`` key with a message.
    """
    # Always return a dict here: gr.JSON tries to parse a returned *string*
    # as JSON, so a bare message would fail to decode instead of being shown.
    # `not text` also covers a None value.
    if not text or not text.strip():
        return {"Error": "Please enter a tweet."}

    cleaned = clean_tweet(text)
    if not cleaned:
        # Input consisted solely of URLs, mentions, punctuation or emoji;
        # scoring an empty string would yield a meaningless prediction.
        return {"Error": "No classifiable text left after cleaning."}

    enc = tokenizer(
        cleaned,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(device)
    # A single text is passed in, so take the first (only) row of logits.
    logits = model(**enc).logits[0]
    probs = F.softmax(logits, dim=-1).cpu().numpy()
    pred = int(probs.argmax())
    return {
        "Label":      LABELS[pred],
        "Confidence": f"{probs[pred]:.1%}",
        "P(meme)":    f"{probs[0]:.1%}",
        "P(real)":    f"{probs[1]:.1%}",
    }

# Sample inputs offered as clickable examples under the textbox.
_examples = [
    ["Massive 6.5 earthquake just hit Istanbul, buildings swaying"],
    ["skibidi toilet ohio rizz level 9000 fr fr 💀"],
    ["AWS us-east-1 throwing 500s across the board"],
]

# One textbox in, one JSON panel out, backed by classify().
demo = gr.Interface(
    fn=classify,
    inputs=gr.Textbox(lines=3, placeholder="Paste a tweet here..."),
    outputs=gr.JSON(),
    title="Meme vs Real Event Classifier",
    examples=_examples,
)
demo.launch()