# -*- coding: utf-8 -*-
"""
ml/topic_model.py
=================
Pure keyword/rule-based topic classifier for YouTube live-chat comments.
No ML models are loaded — classification is entirely keyword/regex-based.

Topics
------
  Appreciation      — praise, thanks, love, encouragement
  Question          — direct questions and doubts/confusion
  Request/Feedback  — content requests, faculty requests, feedback, suggestions
  Promo             — self-promotion, links, "check my channel"
  Spam              — repeated noise, irrelevant flood, gibberish
  MCQ Answer        — single letter answers (a/b/c/d/e)
  General           — anything that doesn't fit the above (fallback)
"""

from __future__ import annotations

import re

# ── Topic labels ───────────────────────────────────────────────────────────────
# The complete set of labels used by this module.
VALID_TOPICS = {
    "Appreciation",
    "Question",
    "Request/Feedback",
    "Promo",
    "Spam",
    "MCQ Answer",
    "General",
}

# ── Keyword tables (matched against whole word tokens) ────────────────────────
# Praise / gratitude vocabulary.
_APPRECIATION_KW = {
    # English praise
    "amazing", "awesome", "best", "bestest", "brilliant", "excellent",
    "fantastic", "fruitful", "happy", "helpful", "informative", "love",
    "loved", "lovely", "motivating", "nice", "perfect", "superb", "wonderful",
    # "thank you" and its chat abbreviations
    "thank", "thanks", "thankyou", "thanku", "thnk", "thnks", "thnku",
    "thnq", "thnx", "thx", "tqsm", "tysm",
    # romanised Hindi / regional-language praise and gratitude
    "badhiya", "bindaas", "dhansu", "dhanyawad", "jhakaas", "kamaal",
    "khush", "lajawaab", "lajawab", "mast", "pyaar", "shandar", "shukriya",
    "waah", "wah", "zabardast", "semma",
    # slang
    "awsm", "osm", "fire", "goated", "mass", "solid",
}

# Question / doubt vocabulary.
_QUESTION_KW = {
    # English wh-words
    "how", "what", "when", "where", "which", "who", "why",
    # romanised Hindi wh-words and related short forms
    "kab", "kahan", "kaise", "kaun", "kb", "kitna", "kitne", "kon",
    "konsa", "konsi", "kya", "kyun", "kyunki",
    # asking to be told or have something explained
    "bata", "batao", "bataye", "explain", "tell",
    # doubt, confusion, "what does it mean"
    "confused", "confusion", "doubt", "unclear",
    "matalab", "matlab", "samajh", "samjha", "samjhe", "samjhi",
    "smjh", "smjha",
}

# Request/Feedback, part 1: requests for new videos, topics or series.
# "class" and "session" are deliberately absent — they proved too generic.
_RF_CONTENT_REQUEST_KW = {
    # make / do / bring verbs
    "banado", "banaiye", "banana", "banao", "banaye",
    "karaiye", "kardo", "kariye", "karo",
    "laiye", "lao", "layiye",
    # starting or resuming something
    "launch", "resume", "shuru", "start",
    # content nouns
    "series", "topic", "video",
    # dedicated / additional coverage
    "add", "akele", "alag", "cover", "include", "separate", "single",
    # wanting / requesting
    "chahiye", "chahta", "chahte", "chahti", "request", "requesting",
}

# Request/Feedback, part 2: study resources (PDFs, notes, downloads).
_RF_ACADEMIC_KW = {
    # the material itself
    "handout", "infographic", "material", "note", "notes", "pdf", "ppt",
    "resource", "slides",
    # planning documents
    "schedule", "syllabus", "timeline", "timetable",
    # where / how it is delivered
    "download", "drive", "google", "link", "upload",
    # asking for it to be handed over
    "dedijiye", "dedo", "provide", "send", "share",
    # "where will I get it" — specific to resource queries
    "milega", "milegi", "milenge",
}

# Request/Feedback, part 3: language / medium of instruction.
_RF_LANGUAGE_KW = {
    "english", "hindi", "language", "medium", "translate", "translation",
}

# Request/Feedback, part 4: feedback and suggestions.
_RF_FEEDBACK_KW = {
    # on-screen visibility
    "dikhai", "dikhta", "screen", "side",
    # pointing out errors
    "correct", "correction", "galat", "mistake", "wrong",
    # suggestions and opinions
    "better", "feedback", "improve", "opinion", "review", "suggest",
    "suggestion",
    # apologies
    "apology", "maafi", "sorry",
    # polite asks
    "please", "pls", "plss", "plz", "plzz",
    # give / need
    "dedo", "dena", "dijie", "dijiye",
    "chahiye", "need", "zaroorat",
}

# Request/Feedback, part 5: product / app features.
_RF_PRODUCT_KW = {
    "alert", "app", "button", "feature", "notification", "option",
    "platform", "portal", "reminder", "setting", "website",
}

# Union of all five Request/Feedback tables above.
_RF_ALL_KW = set().union(
    _RF_CONTENT_REQUEST_KW,
    _RF_ACADEMIC_KW,
    _RF_LANGUAGE_KW,
    _RF_FEEDBACK_KW,
    _RF_PRODUCT_KW,
)

# ── Regex tables ───────────────────────────────────────────────────────────────
# Multi-word phrasings treated as a strong Request/Feedback signal.
_RF_PHRASES = [
    # "please <word> karo/upload/…" and the "pls" spelling
    r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    # "sir please …"
    r"\bsir\s+(please|pls|plz)\b",
    # "<resource> <give-verb>"
    r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
    # "separate video/session/…"
    r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
    # "hindi medium", "english notes", …
    r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
    # "side ho jaiye"-style screen feedback
    r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
    # an error word followed by any word
    r"\b(correction|galat|wrong)\s+\w+\b",
    # "request hai / kar …"
    r"\brequest\s+(hai|he|h|kar|karna)\b",
    # a "want" word followed by any word
    r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]

# Spam shapes.  Order matters: the final entry (gibberish run) is addressed
# by index and applied separately from the others.
_SPAM_PATTERNS = [
    r"^(.)\1{3,}$",                    # one character repeated 4+ times
    r"^[^a-zA-Z\u0900-\u097F]{0,3}$",  # at most 3 chars, none Latin/Devanagari
    r"https?://\S+",                   # any URL
    r"_{4,}",                          # long underscore run
    r"(?:\b[a-z0-9]{6,}\b\s*){6,}",    # 6+ long tokens in a row (was 3; too greedy on real sentences)
]

# Handles / link fragments that mark a comment as spam wherever they appear.
_SPAM_KW_SUBSTRINGS = {
    "gatewallah_official",
    "kuldeepsir_pw",
    "onelink",
    "pwappweb",
    "zazb",
}

# Self-promotion markers.
_PROMO_KW = {
    "channel", "check", "click", "instagram", "link", "subscribe",
    "telegram", "visit",
    ".com", "http", "https", "www",
}

# Comments shorter than this many characters skip the keyword rules.
_MIN_FASTPATH_LEN = 4


# ── Classification ─────────────────────────────────────────────────────────────

def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a comment into a topic category.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Rules run in a fixed priority order and the first match wins:
      MCQ Answer, Spam (known substrings), URL (Promo or Spam), Spam
      patterns, Promo keywords, short-text fallback, Appreciation,
      Question, Request/Feedback, General.
    - Keyword tables are matched against punctuation-free word tokens, so
      "thanks!" or "notes," still count as keyword hits.
    - A comment carrying a question signal *and* two or more
      Request/Feedback keywords is classified as Request/Feedback.
    - Anything that matches no rule falls back to "General".
    """
    if not text or not text.strip():
        return "General", 0.50

    t = text.strip().lower()
    # Drop ":emoji_shortcode:" tokens, then collapse whitespace runs.
    t_clean = re.sub(r":[a-z_]+:", " ", t).strip()
    t_clean = re.sub(r"\s+", " ", t_clean).strip()

    # ── MCQ Answer ──
    # One option letter (optionally repeated: "a", "bbb"), or several such
    # groups separated by "," or "/" ("a,b", "c / d").  The single-letter
    # case is the zero-separator case of this one pattern.
    if re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", t_clean):
        return "MCQ Answer", 0.95

    # ── Spam: known spam substrings ──
    if any(kw in t_clean for kw in _SPAM_KW_SUBSTRINGS):
        return "Spam", 0.90

    # ── Spam/Promo: URL present ──
    if re.search(r"https?://\S+", t_clean):
        if any(kw in t_clean for kw in _PROMO_KW):
            return "Promo", 0.85
        return "Spam", 0.85

    # ── Spam: repeated chars / gibberish ──
    # Every pattern except the last applies unconditionally; the last
    # (gibberish run) is only trusted on longer comments.
    for pat in _SPAM_PATTERNS[:-1]:
        if re.search(pat, t_clean):
            return "Spam", 0.85
    if len(t_clean) > 20 and re.search(_SPAM_PATTERNS[-1], t_clean):
        return "Spam", 0.82

    # ── Promo ── (substring match, so ".com" / "www" fragments are caught)
    if any(kw in t_clean for kw in _PROMO_KW):
        return "Promo", 0.80

    # Too short for the keyword rules to be meaningful.
    if len(t_clean) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    # Tokenise on word characters rather than whitespace so that attached
    # punctuation ("thanks!", "pdf,", "please.") does not hide a keyword.
    words = set(re.findall(r"\w+", t_clean))
    has_question_mark = "?" in text

    question_hits     = len(words & _QUESTION_KW)
    appreciation_hits = len(words & _APPRECIATION_KW)
    rf_hits           = len(words & _RF_ALL_KW)

    # Check Request/Feedback phrase patterns (strong signal)
    rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)

    # ── Appreciation ──
    # A single praise word is enough, provided nothing in the comment
    # points to a question or a request.
    if (appreciation_hits >= 1
            and question_hits == 0
            and not has_question_mark
            and rf_hits == 0
            and not rf_phrase_match):
        return "Appreciation", min(0.72 + 0.05 * appreciation_hits, 0.92)

    # ── Question ──
    # Yields to Request/Feedback when the request signal is strong
    # (two or more RF keywords, or an RF phrase).
    if (has_question_mark or question_hits >= 1) and rf_hits < 2 and not rf_phrase_match:
        return "Question", min(0.75 + 0.04 * question_hits, 0.92)

    # ── Request/Feedback: phrase match ──
    if rf_phrase_match:
        return "Request/Feedback", 0.85

    # ── Request/Feedback: keyword hits ──
    # Short comments need two keyword hits; longer ones need only one.
    # No question guard here: any comment with a question signal that
    # reaches this point was passed over by the Question rule because it
    # has >= 2 RF keywords, so it belongs here instead of falling through
    # to "General".
    min_rf_hits = 1 if len(t_clean) >= 20 else 2
    if rf_hits >= min_rf_hits:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # ── Fallback ──
    return "General", 0.55