File size: 8,428 Bytes
7d815fe
11a0fc5
 
 
7d815fe
 
11a0fc5
 
 
7d815fe
 
 
 
 
 
 
11a0fc5
 
 
 
 
 
7d815fe
 
11a0fc5
 
 
7d815fe
11a0fc5
 
 
7d815fe
 
 
 
5a13d2c
7d815fe
11a0fc5
 
 
67899d6
11a0fc5
 
7d815fe
11a0fc5
7d815fe
 
 
 
 
 
 
 
 
67899d6
7d815fe
 
 
 
11a0fc5
 
7d815fe
 
 
 
 
 
 
5a13d2c
7d815fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a0fc5
7d815fe
 
 
 
5a13d2c
11a0fc5
 
7d815fe
 
 
 
 
11a0fc5
67899d6
11a0fc5
7d815fe
11a0fc5
 
7d815fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11a0fc5
 
7d815fe
 
11a0fc5
7d815fe
 
a4612d4
7d815fe
a4612d4
 
7d815fe
 
 
 
 
 
 
 
 
 
 
 
 
11a0fc5
7d815fe
 
11a0fc5
7d815fe
 
11a0fc5
 
7d815fe
 
11a0fc5
7d815fe
11a0fc5
 
7d815fe
11a0fc5
7d815fe
 
 
 
 
 
5a13d2c
 
7d815fe
 
 
 
 
11a0fc5
 
7d815fe
 
11a0fc5
 
7d815fe
 
 
11a0fc5
7d815fe
 
 
 
11a0fc5
7d815fe
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
# -*- coding: utf-8 -*-
"""
ml/topic_model.py
=================
Pure keyword/rule-based topic classifier for YouTube live-chat comments.
No ML models are loaded β€” classification is entirely keyword/regex-based.

Topics
------
  Appreciation      β€” praise, thanks, love, encouragement
  Question          β€” direct questions and doubts/confusion
  Request/Feedback  β€” content requests, faculty requests, feedback, suggestions
  Promo             β€” self-promotion, links, "check my channel"
  Spam              β€” repeated noise, irrelevant flood, gibberish
  MCQ Answer        β€” single letter answers (a/b/c/d/e)
  General           β€” anything that doesn't fit the above (fallback)
"""

from __future__ import annotations

import re

# ── Valid topics ───────────────────────────────────────────────────────────────
# Closed label set: predict_topic() only ever returns one of these strings.
VALID_TOPICS: set[str] = {"Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"}

# ── Keyword fast-path ──────────────────────────────────────────────────────────
# Whole-word hit sets. predict_topic() intersects each set with the comment's
# whitespace-split token set, so every entry must be a single lowercase token
# (English words, romanised Hindi, and common chat/SMS spellings).
_APPRECIATION_KW: set[str] = {
    "love", "thanks", "thank", "superb", "amazing", "excellent",
    "awesome", "wonderful", "brilliant", "fantastic", "best", "perfect",
    "mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab",
    "waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm",
    "dhansu", "pyaar", "bindaas", "khush", "happy",
    "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
    "tysm", "tqsm", "thx",
    "informative", "fruitful", "motivating", "lovely",
    "bestest", "loved", "nice", "helpful",
    "semma", "mass", "solid", "fire", "goated",
}

# Interrogatives (English + romanised Hindi) and confusion/doubt markers.
_QUESTION_KW: set[str] = {
    "kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi",
    "kaise", "kyun", "kyunki",
    "what", "when", "where", "who", "which", "how", "why",
    "bata", "batao", "bataye", "tell", "explain",
    "samajh", "confused", "confusion", "doubt", "unclear",
    "matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha",
}

# Content requests — asking for new videos, topics, sessions
_RF_CONTENT_REQUEST_KW: set[str] = {
    "banao", "banana", "banaye", "banaiye", "banado",
    "karo", "kariye", "karaiye", "kardo",
    "lao", "laiye", "layiye",
    "start", "shuru", "launch", "resume",
    "video", "series",   # removed "class" and "session" — too generic
    "separate", "alag", "akele", "single",
    "cover", "include", "add", "topic",
    "chahiye", "chahte", "chahta", "chahti",
    "request", "requesting",
}

# Academic/resource requests — PDFs, notes, downloads
_RF_ACADEMIC_KW: set[str] = {
    "pdf", "notes", "note", "download", "upload",
    "drive", "google", "link", "material", "resource",
    "timeline", "schedule", "timetable", "syllabus",
    "infographic", "slides", "ppt", "handout",
    "provide", "share", "send", "dedo", "dedijiye",
    "milega", "milegi", "milenge",   # "where to find" — specific to resource queries
}

# Language requests
_RF_LANGUAGE_KW: set[str] = {
    "hindi", "english", "medium", "language",
    "translate", "translation",
}

# Feedback/suggestion keywords
_RF_FEEDBACK_KW: set[str] = {
    "side", "screen", "dikhta", "dikhai",
    "correction", "correct", "galat", "wrong", "mistake",
    "suggestion", "suggest", "improve", "better",
    "feedback", "review", "opinion",
    "sorry", "maafi", "apology",
    "please", "plz", "pls", "plss", "plzz",
    "dijiye", "dijie", "dena", "dedo",
    "chahiye", "zaroorat", "need",
}

# Product/app feature requests
_RF_PRODUCT_KW: set[str] = {
    "app", "feature", "option", "button", "setting",
    "notification", "reminder", "alert",
    "website", "portal", "platform",
}

# Combined RF keyword set — union of all Request/Feedback sub-categories;
# predict_topic() only counts hits against this combined set.
_RF_ALL_KW: set[str] = (
    _RF_CONTENT_REQUEST_KW
    | _RF_ACADEMIC_KW
    | _RF_LANGUAGE_KW
    | _RF_FEEDBACK_KW
    | _RF_PRODUCT_KW
)

# Phrases that strongly indicate Request/Feedback (multi-word)
# Regex patterns searched against the cleaned lowercase text; any match
# overrides both the Appreciation and Question branches in predict_topic().
_RF_PHRASES: list[str] = [
    r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    r"\bsir\s+(please|pls|plz)\b",
    r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
    r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
    r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
    r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
    r"\b(correction|galat|wrong)\s+\w+\b",
    r"\brequest\s+(hai|he|h|kar|karna)\b",
    r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]

# Spam regexes. Order matters: predict_topic() applies all but the LAST
# unconditionally, and the last (gibberish flood) only to texts > 20 chars.
_SPAM_PATTERNS: list[str] = [
    r"^(.)\1{3,}$",                   # one character repeated 4+ times ("aaaa")
    r"^[^a-zA-Z\u0900-\u097F]{0,3}$", # <=3 chars, no Latin/Devanagari letters (punct/emoji residue)
    r"https?://\S+",                  # bare URL (also checked earlier for the Promo split)
    r"_{4,}",                         # underscore runs / ASCII-art filler
    r"(?:\b[a-z0-9]{6,}\b\s*){6,}",   # raised from 3 to 6 — avoids catching real sentences
]

# Substring (not whole-word) markers of known spam handles/links.
_SPAM_KW_SUBSTRINGS: set[str] = {
    "onelink", "zazb", "gatewallah_official", "pwappweb",
    "kuldeepsir_pw",
}

# Substring markers of self-promotion; checked both with and without a URL.
_PROMO_KW: set[str] = {
    "subscribe", "channel", "link", "instagram",
    "check", "visit", "click", "http", "www", ".com", "telegram",
    "https",
}

# Texts shorter than this (after cleaning) skip keyword scoring → "General".
_MIN_FASTPATH_LEN: int = 4


# ── Classification ─────────────────────────────────────────────────────────────

def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a live-chat comment into one topic category.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Checks run as an ordered cascade: MCQ → Spam → Promo → Appreciation
      → Question → Request/Feedback; anything unmatched is "General".
    """
    if not text or not text.strip():
        return "General", 0.50

    lowered = text.strip().lower()
    # Strip :emoji_code: shortcodes, then collapse runs of whitespace.
    cleaned = re.sub(r"\s+", " ", re.sub(r":[a-z_]+:", " ", lowered)).strip()

    # ── MCQ Answer: single/repeated letter(s), optionally comma- or
    #    slash-separated ("a", "bbb", "a, c") ──
    if (re.fullmatch(r"[a-e]", cleaned)
            or re.fullmatch(r"([a-e])\1*", cleaned)
            or re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", cleaned)):
        return "MCQ Answer", 0.95

    # ── Spam: known spam handle/link substrings ──
    for marker in _SPAM_KW_SUBSTRINGS:
        if marker in cleaned:
            return "Spam", 0.90

    # ── URL present: promo wording decides Promo vs plain Spam ──
    if re.search(r"https?://\S+", cleaned):
        is_promo = any(kw in cleaned for kw in _PROMO_KW)
        return ("Promo" if is_promo else "Spam"), 0.85

    # ── Spam: repeated chars / short non-letter noise / underscores;
    #    the final (gibberish-flood) pattern only applies to longer texts ──
    *cheap_spam, gibberish = _SPAM_PATTERNS
    for pattern in cheap_spam:
        if re.search(pattern, cleaned):
            return "Spam", 0.85
    if len(cleaned) > 20 and re.search(gibberish, cleaned):
        return "Spam", 0.82

    # ── Promo without a URL ──
    for kw in _PROMO_KW:
        if kw in cleaned:
            return "Promo", 0.80

    # Too short for meaningful keyword scoring.
    if len(cleaned) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    tokens = set(cleaned.split())
    asks = "?" in text  # question mark is checked on the RAW text

    q_hits = len(tokens & _QUESTION_KW)
    appr_hits = len(tokens & _APPRECIATION_KW)
    rf_hits = len(tokens & _RF_ALL_KW)

    # Multi-word Request/Feedback phrases are a strong signal that
    # overrides both Appreciation and Question below.
    rf_phrase = any(re.search(p, cleaned) for p in _RF_PHRASES)

    # ── Appreciation: one strong word suffices, but only with no
    #    competing question or request signal ──
    if appr_hits and not (asks or q_hits or rf_hits or rf_phrase):
        return "Appreciation", min(0.72 + 0.05 * appr_hits, 0.92)

    # ── Question: "?" or an interrogative word, unless RF dominates ──
    if (asks or q_hits) and rf_hits < 2 and not rf_phrase:
        return "Question", min(0.75 + 0.04 * q_hits, 0.92)

    # ── Request/Feedback: phrase match ──
    if rf_phrase:
        return "Request/Feedback", 0.85

    # ── Request/Feedback: keyword hits (longer texts need fewer hits) ──
    needed = 1 if len(cleaned) >= 20 else 2
    if rf_hits >= needed and not q_hits and not asks:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # ── Fallback ──
    return "General", 0.55