File size: 8,428 Bytes
7d815fe
11a0fc5
 
 
7d815fe
 
11a0fc5
 
 
7d815fe
 
 
 
 
 
 
11a0fc5
 
 
 
 
 
7d815fe
 
11a0fc5
 
 
7d815fe
11a0fc5
 
 
7d815fe
 
 
 
5a13d2c
7d815fe
11a0fc5
 
 
67899d6
11a0fc5
 
7d815fe
11a0fc5
7d815fe
 
 
 
 
 
 
 
 
67899d6
7d815fe
 
 
 
11a0fc5
 
7d815fe
 
 
 
 
 
 
5a13d2c
7d815fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a0fc5
7d815fe
 
 
 
5a13d2c
11a0fc5
 
7d815fe
 
 
 
 
11a0fc5
67899d6
11a0fc5
7d815fe
11a0fc5
 
7d815fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a0fc5
 
7d815fe
 
11a0fc5
7d815fe
 
a4612d4
7d815fe
a4612d4
 
7d815fe
 
 
 
 
 
 
 
 
 
 
 
 
11a0fc5
7d815fe
 
11a0fc5
7d815fe
 
11a0fc5
 
7d815fe
 
11a0fc5
7d815fe
11a0fc5
 
7d815fe
11a0fc5
7d815fe
 
 
 
 
 
5a13d2c
 
7d815fe
 
 
 
 
11a0fc5
 
7d815fe
 
11a0fc5
 
7d815fe
 
 
11a0fc5
7d815fe
 
 
 
11a0fc5
7d815fe
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# -*- coding: utf-8 -*-
"""
ml/topic_model.py
=================
Pure keyword/rule-based topic classifier for YouTube live-chat comments.
No ML models are loaded β€” classification is entirely keyword/regex-based.

Topics
------
  Appreciation      β€” praise, thanks, love, encouragement
  Question          β€” direct questions and doubts/confusion
  Request/Feedback  β€” content requests, faculty requests, feedback, suggestions
  Promo             β€” self-promotion, links, "check my channel"
  Spam              β€” repeated noise, irrelevant flood, gibberish
  MCQ Answer        β€” single letter answers (a/b/c/d/e)
  General           β€” anything that doesn't fit the above (fallback)
"""

from __future__ import annotations

import re

# ── Valid topics ───────────────────────────────────────────────────────────────
# Closed label set: predict_topic() only ever returns one of these strings.
VALID_TOPICS: set[str] = {"Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"}

# ── Keyword fast-path ──────────────────────────────────────────────────────────
# Whole-word hit sets. predict_topic() intersects each set with the comment's
# whitespace-split token set, so every entry must be a single lowercase token
# (English words, romanised Hindi, and common chat/SMS spellings).
_APPRECIATION_KW: set[str] = {
    "love", "thanks", "thank", "superb", "amazing", "excellent",
    "awesome", "wonderful", "brilliant", "fantastic", "best", "perfect",
    "mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab",
    "waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm",
    "dhansu", "pyaar", "bindaas", "khush", "happy",
    "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
    "tysm", "tqsm", "thx",
    "informative", "fruitful", "motivating", "lovely",
    "bestest", "loved", "nice", "helpful",
    "semma", "mass", "solid", "fire", "goated",
}

# Interrogatives (English + romanised Hindi) and confusion/doubt markers.
_QUESTION_KW: set[str] = {
    "kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi",
    "kaise", "kyun", "kyunki",
    "what", "when", "where", "who", "which", "how", "why",
    "bata", "batao", "bataye", "tell", "explain",
    "samajh", "confused", "confusion", "doubt", "unclear",
    "matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha",
}

# Content requests — asking for new videos, topics, sessions
_RF_CONTENT_REQUEST_KW: set[str] = {
    "banao", "banana", "banaye", "banaiye", "banado",
    "karo", "kariye", "karaiye", "kardo",
    "lao", "laiye", "layiye",
    "start", "shuru", "launch", "resume",
    "video", "series",   # removed "class" and "session" — too generic
    "separate", "alag", "akele", "single",
    "cover", "include", "add", "topic",
    "chahiye", "chahte", "chahta", "chahti",
    "request", "requesting",
}

# Academic/resource requests — PDFs, notes, downloads
_RF_ACADEMIC_KW: set[str] = {
    "pdf", "notes", "note", "download", "upload",
    "drive", "google", "link", "material", "resource",
    "timeline", "schedule", "timetable", "syllabus",
    "infographic", "slides", "ppt", "handout",
    "provide", "share", "send", "dedo", "dedijiye",
    "milega", "milegi", "milenge",   # "where to find" — specific to resource queries
}

# Language requests
_RF_LANGUAGE_KW: set[str] = {
    "hindi", "english", "medium", "language",
    "translate", "translation",
}

# Feedback/suggestion keywords
_RF_FEEDBACK_KW: set[str] = {
    "side", "screen", "dikhta", "dikhai",
    "correction", "correct", "galat", "wrong", "mistake",
    "suggestion", "suggest", "improve", "better",
    "feedback", "review", "opinion",
    "sorry", "maafi", "apology",
    "please", "plz", "pls", "plss", "plzz",
    "dijiye", "dijie", "dena", "dedo",
    "chahiye", "zaroorat", "need",
}

# Product/app feature requests
_RF_PRODUCT_KW: set[str] = {
    "app", "feature", "option", "button", "setting",
    "notification", "reminder", "alert",
    "website", "portal", "platform",
}

# Combined RF keyword set — union of all Request/Feedback sub-categories;
# predict_topic() only counts hits against this combined set.
_RF_ALL_KW: set[str] = (
    _RF_CONTENT_REQUEST_KW
    | _RF_ACADEMIC_KW
    | _RF_LANGUAGE_KW
    | _RF_FEEDBACK_KW
    | _RF_PRODUCT_KW
)

# Phrases that strongly indicate Request/Feedback (multi-word)
# Regex patterns searched against the cleaned lowercase text; any match
# overrides both the Appreciation and Question branches in predict_topic().
_RF_PHRASES: list[str] = [
    r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    r"\bsir\s+(please|pls|plz)\b",
    r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
    r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
    r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
    r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
    r"\b(correction|galat|wrong)\s+\w+\b",
    r"\brequest\s+(hai|he|h|kar|karna)\b",
    r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]

# Spam regexes. Order matters: predict_topic() applies all but the LAST
# unconditionally, and the last (gibberish flood) only to texts > 20 chars.
_SPAM_PATTERNS: list[str] = [
    r"^(.)\1{3,}$",                   # one character repeated 4+ times ("aaaa")
    r"^[^a-zA-Z\u0900-\u097F]{0,3}$", # <=3 chars, no Latin/Devanagari letters (punct/emoji residue)
    r"https?://\S+",                  # bare URL (also checked earlier for the Promo split)
    r"_{4,}",                         # underscore runs / ASCII-art filler
    r"(?:\b[a-z0-9]{6,}\b\s*){6,}",   # raised from 3 to 6 — avoids catching real sentences
]

# Substring (not whole-word) markers of known spam handles/links.
_SPAM_KW_SUBSTRINGS: set[str] = {
    "onelink", "zazb", "gatewallah_official", "pwappweb",
    "kuldeepsir_pw",
}

# Substring markers of self-promotion; checked both with and without a URL.
_PROMO_KW: set[str] = {
    "subscribe", "channel", "link", "instagram",
    "check", "visit", "click", "http", "www", ".com", "telegram",
    "https",
}

# Texts shorter than this (after cleaning) skip keyword scoring → "General".
_MIN_FASTPATH_LEN: int = 4


# ── Classification ─────────────────────────────────────────────────────────────

def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a live-chat comment into one topic category.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Checks run as an ordered cascade: MCQ → Spam → Promo → Appreciation
      → Question → Request/Feedback; anything unmatched is "General".
    """
    if not text or not text.strip():
        return "General", 0.50

    lowered = text.strip().lower()
    # Strip :emoji_code: shortcodes, then collapse runs of whitespace.
    cleaned = re.sub(r"\s+", " ", re.sub(r":[a-z_]+:", " ", lowered)).strip()

    # ── MCQ Answer: single/repeated letter(s), optionally comma- or
    #    slash-separated ("a", "bbb", "a, c") ──
    if (re.fullmatch(r"[a-e]", cleaned)
            or re.fullmatch(r"([a-e])\1*", cleaned)
            or re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", cleaned)):
        return "MCQ Answer", 0.95

    # ── Spam: known spam handle/link substrings ──
    for marker in _SPAM_KW_SUBSTRINGS:
        if marker in cleaned:
            return "Spam", 0.90

    # ── URL present: promo wording decides Promo vs plain Spam ──
    if re.search(r"https?://\S+", cleaned):
        is_promo = any(kw in cleaned for kw in _PROMO_KW)
        return ("Promo" if is_promo else "Spam"), 0.85

    # ── Spam: repeated chars / short non-letter noise / underscores;
    #    the final (gibberish-flood) pattern only applies to longer texts ──
    *cheap_spam, gibberish = _SPAM_PATTERNS
    for pattern in cheap_spam:
        if re.search(pattern, cleaned):
            return "Spam", 0.85
    if len(cleaned) > 20 and re.search(gibberish, cleaned):
        return "Spam", 0.82

    # ── Promo without a URL ──
    for kw in _PROMO_KW:
        if kw in cleaned:
            return "Promo", 0.80

    # Too short for meaningful keyword scoring.
    if len(cleaned) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    tokens = set(cleaned.split())
    asks = "?" in text  # question mark is checked on the RAW text

    q_hits = len(tokens & _QUESTION_KW)
    appr_hits = len(tokens & _APPRECIATION_KW)
    rf_hits = len(tokens & _RF_ALL_KW)

    # Multi-word Request/Feedback phrases are a strong signal that
    # overrides both Appreciation and Question below.
    rf_phrase = any(re.search(p, cleaned) for p in _RF_PHRASES)

    # ── Appreciation: one strong word suffices, but only with no
    #    competing question or request signal ──
    if appr_hits and not (asks or q_hits or rf_hits or rf_phrase):
        return "Appreciation", min(0.72 + 0.05 * appr_hits, 0.92)

    # ── Question: "?" or an interrogative word, unless RF dominates ──
    if (asks or q_hits) and rf_hits < 2 and not rf_phrase:
        return "Question", min(0.75 + 0.04 * q_hits, 0.92)

    # ── Request/Feedback: phrase match ──
    if rf_phrase:
        return "Request/Feedback", 0.85

    # ── Request/Feedback: keyword hits (longer texts need fewer hits) ──
    needed = 1 if len(cleaned) >= 20 else 2
    if rf_hits >= needed and not q_hits and not asks:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # ── Fallback ──
    return "General", 0.55