DivYonko committed on
Commit ·
5a13d2c
1
Parent(s): 6b26039
Improve keyword accuracy from CSV analysis + gate action_type on topic
Browse files
- app.py +5 -1
- backend/scraper.py +26 -3
- ml/action_type_model.py +15 -22
- ml/sentiment_model.py +21 -2
- ml/topic_model.py +7 -5
- shared.py +5 -1
app.py
CHANGED
|
@@ -292,7 +292,11 @@ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Even
|
|
| 292 |
try:
|
| 293 |
sentiment, s_conf = _safe_sentiment(text)
|
| 294 |
topic, t_conf = _safe_topic(text)
|
| 295 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
except Exception as exc:
|
| 297 |
logger.error("ML inference failed for text=%r: %s", text[:50], exc)
|
| 298 |
sentiment, s_conf = "Neutral", 0.5
|
|
|
|
| 292 |
try:
|
| 293 |
sentiment, s_conf = _safe_sentiment(text)
|
| 294 |
topic, t_conf = _safe_topic(text)
|
| 295 |
+
# Only classify action type for Question/Request topics
|
| 296 |
+
if topic in ("Question", "Request/Feedback"):
|
| 297 |
+
action_type, at_conf = _safe_action_type(text)
|
| 298 |
+
else:
|
| 299 |
+
action_type, at_conf = "N/A", 0.50
|
| 300 |
except Exception as exc:
|
| 301 |
logger.error("ML inference failed for text=%r: %s", text[:50], exc)
|
| 302 |
sentiment, s_conf = "Neutral", 0.5
|
backend/scraper.py
CHANGED
|
@@ -27,6 +27,7 @@ from backend.config import (
|
|
| 27 |
)
|
| 28 |
from ml.sentiment_model import predict_sentiment
|
| 29 |
from ml.topic_model import predict_topic, VALID_TOPICS
|
|
|
|
| 30 |
|
| 31 |
logging.basicConfig(
|
| 32 |
level=logging.INFO,
|
|
@@ -35,7 +36,7 @@ logging.basicConfig(
|
|
| 35 |
)
|
| 36 |
logger = logging.getLogger("scraper")
|
| 37 |
|
| 38 |
-
MAX_REDIS_MESSAGES =
|
| 39 |
|
| 40 |
|
| 41 |
def _safe_sentiment(text: str) -> tuple[str, float]:
|
|
@@ -57,6 +58,17 @@ def _safe_topic(text: str) -> tuple[str, float]:
|
|
| 57 |
return "General", 0.50
|
| 58 |
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
def run(video_id: str, redis_key: str) -> None:
|
| 61 |
r = redis.Redis(
|
| 62 |
host=REDIS_HOST,
|
|
@@ -84,13 +96,21 @@ def run(video_id: str, redis_key: str) -> None:
|
|
| 84 |
while chat.is_alive():
|
| 85 |
try:
|
| 86 |
for c in chat.get().sync_items():
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
| 88 |
author = c.author.name
|
| 89 |
if not text:
|
| 90 |
continue
|
| 91 |
|
| 92 |
sentiment, s_conf = _safe_sentiment(text)
|
| 93 |
topic, t_conf = _safe_topic(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
message_data = {
|
| 96 |
"author": author,
|
|
@@ -99,6 +119,8 @@ def run(video_id: str, redis_key: str) -> None:
|
|
| 99 |
"confidence": round(s_conf, 3),
|
| 100 |
"topic": topic,
|
| 101 |
"topic_conf": round(t_conf, 3),
|
|
|
|
|
|
|
| 102 |
"time": datetime.now().isoformat(),
|
| 103 |
}
|
| 104 |
|
|
@@ -108,11 +130,12 @@ def run(video_id: str, redis_key: str) -> None:
|
|
| 108 |
pipe.execute()
|
| 109 |
|
| 110 |
logger.info(
|
| 111 |
-
"[%s] %s | %s(%.2f) %s(%.2f) | %r",
|
| 112 |
message_data["time"][11:19],
|
| 113 |
author[:20],
|
| 114 |
sentiment, s_conf,
|
| 115 |
topic, t_conf,
|
|
|
|
| 116 |
text[:60],
|
| 117 |
)
|
| 118 |
|
|
|
|
| 27 |
)
|
| 28 |
from ml.sentiment_model import predict_sentiment
|
| 29 |
from ml.topic_model import predict_topic, VALID_TOPICS
|
| 30 |
+
from ml.action_type_model import predict_action_type, VALID_ACTION_TYPES
|
| 31 |
|
| 32 |
logging.basicConfig(
|
| 33 |
level=logging.INFO,
|
|
|
|
| 36 |
)
|
| 37 |
logger = logging.getLogger("scraper")
|
| 38 |
|
| 39 |
+
MAX_REDIS_MESSAGES = 40000
|
| 40 |
|
| 41 |
|
| 42 |
def _safe_sentiment(text: str) -> tuple[str, float]:
|
|
|
|
| 58 |
return "General", 0.50
|
| 59 |
|
| 60 |
|
| 61 |
+
def _safe_action_type(text: str) -> tuple[str, float]:
|
| 62 |
+
try:
|
| 63 |
+
action_type, conf = predict_action_type(text)
|
| 64 |
+
if action_type not in VALID_ACTION_TYPES:
|
| 65 |
+
return "N/A", 0.50
|
| 66 |
+
return action_type, conf
|
| 67 |
+
except Exception as exc:
|
| 68 |
+
logger.error("predict_action_type failed for %r: %s", text[:60], exc)
|
| 69 |
+
return "N/A", 0.50
|
| 70 |
+
|
| 71 |
+
|
| 72 |
def run(video_id: str, redis_key: str) -> None:
|
| 73 |
r = redis.Redis(
|
| 74 |
host=REDIS_HOST,
|
|
|
|
| 96 |
while chat.is_alive():
|
| 97 |
try:
|
| 98 |
for c in chat.get().sync_items():
|
| 99 |
+
# pytchat converts emoji to :name: codes — convert back to actual characters
|
| 100 |
+
import emoji as _emoji
|
| 101 |
+
raw_text = c.message.strip()
|
| 102 |
+
text = _emoji.emojize(raw_text, language="alias")
|
| 103 |
author = c.author.name
|
| 104 |
if not text:
|
| 105 |
continue
|
| 106 |
|
| 107 |
sentiment, s_conf = _safe_sentiment(text)
|
| 108 |
topic, t_conf = _safe_topic(text)
|
| 109 |
+
# Only classify action type for Question/Request topics
|
| 110 |
+
if topic in ("Question", "Request/Feedback"):
|
| 111 |
+
action_type, at_conf = _safe_action_type(text)
|
| 112 |
+
else:
|
| 113 |
+
action_type, at_conf = "N/A", 0.50
|
| 114 |
|
| 115 |
message_data = {
|
| 116 |
"author": author,
|
|
|
|
| 119 |
"confidence": round(s_conf, 3),
|
| 120 |
"topic": topic,
|
| 121 |
"topic_conf": round(t_conf, 3),
|
| 122 |
+
"action_type": action_type,
|
| 123 |
+
"action_type_conf": round(at_conf, 3),
|
| 124 |
"time": datetime.now().isoformat(),
|
| 125 |
}
|
| 126 |
|
|
|
|
| 130 |
pipe.execute()
|
| 131 |
|
| 132 |
logger.info(
|
| 133 |
+
"[%s] %s | %s(%.2f) %s(%.2f) %s(%.2f) | %r",
|
| 134 |
message_data["time"][11:19],
|
| 135 |
author[:20],
|
| 136 |
sentiment, s_conf,
|
| 137 |
topic, t_conf,
|
| 138 |
+
action_type, at_conf,
|
| 139 |
text[:60],
|
| 140 |
)
|
| 141 |
|
ml/action_type_model.py
CHANGED
|
@@ -271,18 +271,16 @@ _PRICING_KW: set[str] = {
|
|
| 271 |
|
| 272 |
# Fees + Financial Queries — how to purchase, payment, stipend
|
| 273 |
_FEES_KW: set[str] = {
|
| 274 |
-
# Purchase / payment
|
| 275 |
"purchase", "buy", "kharidna", "kharide", "kharido",
|
| 276 |
"payment", "pay", "paid",
|
| 277 |
-
"kaise", "kaha", "kahan", "milega", "milegi",
|
| 278 |
# Financial
|
| 279 |
"stipend", "salary", "income", "earn", "earning",
|
| 280 |
"emi", "installment", "loan",
|
| 281 |
# Batch purchase
|
| 282 |
-
"
|
| 283 |
"register", "registration",
|
| 284 |
-
# Hinglish
|
| 285 |
-
"lena", "lena hai", "chahiye", "chahta", "chahti",
|
| 286 |
"pw", "physics wallah", "umeed",
|
| 287 |
}
|
| 288 |
|
|
@@ -322,41 +320,38 @@ _BATCH_KW: set[str] = {
|
|
| 322 |
"worth", "value",
|
| 323 |
# Faculty in batch
|
| 324 |
"faculty", "teacher", "sir", "mam",
|
| 325 |
-
"included", "include",
|
| 326 |
}
|
| 327 |
|
| 328 |
# ── Keyword sets: remaining categories ───────────────────────────────────────
|
| 329 |
|
| 330 |
# Information- Exam — exam rules, cutoffs, forms, reservations, percentages
|
| 331 |
_EXAM_INFO_KW: set[str] = {
|
| 332 |
-
# Exam process
|
| 333 |
"form", "bhara", "bharana", "apply", "application",
|
| 334 |
-
"notification", "vacancy",
|
| 335 |
# Exam results / cutoffs
|
| 336 |
"cutoff", "cut off", "marks", "percentage", "percent",
|
| 337 |
"prelims", "mains", "interview", "daf",
|
| 338 |
"clear", "cleared", "qualify", "qualified",
|
| 339 |
# Reservation / rules
|
| 340 |
-
"reservation", "sc", "st", "obc", "ews",
|
| 341 |
"rule", "rules", "regulation", "norms",
|
| 342 |
"upsc", "ssc", "ias", "ips", "ifs",
|
| 343 |
# Exam statistics
|
| 344 |
"attempt", "attempts", "age", "limit",
|
| 345 |
-
"seats",
|
| 346 |
-
# Hinglish
|
| 347 |
-
"ata", "aata", "kitna", "kitne", "kya", "hai",
|
| 348 |
"dono", "both",
|
| 349 |
}
|
| 350 |
|
| 351 |
# Guidance — study strategy, what to study, life advice
|
| 352 |
_GUIDANCE_KW: set[str] = {
|
| 353 |
-
# Study strategy
|
| 354 |
-
"
|
| 355 |
-
"where", "how", "which", "what",
|
| 356 |
-
"se", "kre", "karo", "karein", "start", "shuru",
|
| 357 |
"strategy", "plan", "approach",
|
| 358 |
# Subject selection
|
| 359 |
-
"optional", "subject", "choose", "select",
|
| 360 |
"economy", "geography", "history", "polity",
|
| 361 |
"physics", "chemistry", "biology", "maths",
|
| 362 |
# Life / personal advice
|
|
@@ -364,9 +359,9 @@ _GUIDANCE_KW: set[str] = {
|
|
| 364 |
"control", "manage", "balance",
|
| 365 |
"motivation", "motivate", "inspired",
|
| 366 |
"chhod", "chhodna", "drop", "leave",
|
| 367 |
-
# Hinglish
|
| 368 |
-
"kya
|
| 369 |
-
"
|
| 370 |
"ioc", "baaki", "rest",
|
| 371 |
}
|
| 372 |
|
|
@@ -721,7 +716,6 @@ def _fast_path(t: str, words: set[str], has_q: bool) -> tuple[str, float] | None
|
|
| 721 |
"link", "telegram", "channel", "group", "invite",
|
| 722 |
"help", "support", "contact",
|
| 723 |
"refund", "cancel",
|
| 724 |
-
"chal", "chalta", "kaam", "karta",
|
| 725 |
"khul", "khulta",
|
| 726 |
}
|
| 727 |
if len(words & _ACCESS_SUPPORT_CORE_KW) >= 1:
|
|
@@ -778,7 +772,6 @@ def _rule_chain(t: str, words: set[str], has_q: bool) -> tuple[str, float] | Non
|
|
| 778 |
"kab", "when", "bje", "baje", "time", "timing", "schedule",
|
| 779 |
"aayega", "aayegi", "aata", "aati", "ata", "ati",
|
| 780 |
"next", "agla", "agli",
|
| 781 |
-
"end", "khatam", "finish",
|
| 782 |
"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
|
| 783 |
"somvar", "mangalvar", "budhvar", "guruvar", "shukravar", "shanivar", "ravivar",
|
| 784 |
"kb",
|
|
|
|
| 271 |
|
| 272 |
# Fees + Financial Queries — how to purchase, payment, stipend
|
| 273 |
_FEES_KW: set[str] = {
|
| 274 |
+
# Purchase / payment — specific financial terms only
|
| 275 |
"purchase", "buy", "kharidna", "kharide", "kharido",
|
| 276 |
"payment", "pay", "paid",
|
|
|
|
| 277 |
# Financial
|
| 278 |
"stipend", "salary", "income", "earn", "earning",
|
| 279 |
"emi", "installment", "loan",
|
| 280 |
# Batch purchase
|
| 281 |
+
"enroll", "enrollment", "admission",
|
| 282 |
"register", "registration",
|
| 283 |
+
# Hinglish — only specific ones
|
|
|
|
| 284 |
"pw", "physics wallah", "umeed",
|
| 285 |
}
|
| 286 |
|
|
|
|
| 320 |
"worth", "value",
|
| 321 |
# Faculty in batch
|
| 322 |
"faculty", "teacher", "sir", "mam",
|
| 323 |
+
"included", "include",
|
| 324 |
}
|
| 325 |
|
| 326 |
# ── Keyword sets: remaining categories ───────────────────────────────────────
|
| 327 |
|
| 328 |
# Information- Exam — exam rules, cutoffs, forms, reservations, percentages
|
| 329 |
_EXAM_INFO_KW: set[str] = {
|
| 330 |
+
# Exam process — specific exam terms only
|
| 331 |
"form", "bhara", "bharana", "apply", "application",
|
| 332 |
+
"notification", "vacancy",
|
| 333 |
# Exam results / cutoffs
|
| 334 |
"cutoff", "cut off", "marks", "percentage", "percent",
|
| 335 |
"prelims", "mains", "interview", "daf",
|
| 336 |
"clear", "cleared", "qualify", "qualified",
|
| 337 |
# Reservation / rules
|
| 338 |
+
"reservation", "sc", "st", "obc", "ews",
|
| 339 |
"rule", "rules", "regulation", "norms",
|
| 340 |
"upsc", "ssc", "ias", "ips", "ifs",
|
| 341 |
# Exam statistics
|
| 342 |
"attempt", "attempts", "age", "limit",
|
| 343 |
+
"seats",
|
| 344 |
+
# Hinglish — only specific exam-related ones
|
|
|
|
| 345 |
"dono", "both",
|
| 346 |
}
|
| 347 |
|
| 348 |
# Guidance — study strategy, what to study, life advice
|
| 349 |
_GUIDANCE_KW: set[str] = {
|
| 350 |
+
# Study strategy — specific guidance words only
|
| 351 |
+
"konsa", "konsi",
|
|
|
|
|
|
|
| 352 |
"strategy", "plan", "approach",
|
| 353 |
# Subject selection
|
| 354 |
+
"optional", "subject", "choose", "select",
|
| 355 |
"economy", "geography", "history", "polity",
|
| 356 |
"physics", "chemistry", "biology", "maths",
|
| 357 |
# Life / personal advice
|
|
|
|
| 359 |
"control", "manage", "balance",
|
| 360 |
"motivation", "motivate", "inspired",
|
| 361 |
"chhod", "chhodna", "drop", "leave",
|
| 362 |
+
# Hinglish — specific guidance phrases
|
| 363 |
+
"kya kre", "kya karu", "kya karun",
|
| 364 |
+
"suggest",
|
| 365 |
"ioc", "baaki", "rest",
|
| 366 |
}
|
| 367 |
|
|
|
|
| 716 |
"link", "telegram", "channel", "group", "invite",
|
| 717 |
"help", "support", "contact",
|
| 718 |
"refund", "cancel",
|
|
|
|
| 719 |
"khul", "khulta",
|
| 720 |
}
|
| 721 |
if len(words & _ACCESS_SUPPORT_CORE_KW) >= 1:
|
|
|
|
| 772 |
"kab", "when", "bje", "baje", "time", "timing", "schedule",
|
| 773 |
"aayega", "aayegi", "aata", "aati", "ata", "ati",
|
| 774 |
"next", "agla", "agli",
|
|
|
|
| 775 |
"monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
|
| 776 |
"somvar", "mangalvar", "budhvar", "guruvar", "shukravar", "shanivar", "ravivar",
|
| 777 |
"kb",
|
ml/sentiment_model.py
CHANGED
|
@@ -203,9 +203,28 @@ _POS_WORDS: set[str] = {
|
|
| 203 |
# ── Common live chat positives ──
|
| 204 |
"woww", "wowww", "woah", "whoa", "yay", "yayy",
|
| 205 |
"haha", "hahaha", "lol", "lmao", # laughter = positive
|
| 206 |
-
"clap", "claps", "bravo", "chappal",
|
| 207 |
"heart", "hearts",
|
| 208 |
-
"100", "1000",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
}
|
| 210 |
|
| 211 |
# ── Negative keyword set ───────────────────────────────────────────────────────
|
|
|
|
| 203 |
# ── Common live chat positives ──
|
| 204 |
"woww", "wowww", "woah", "whoa", "yay", "yayy",
|
| 205 |
"haha", "hahaha", "lol", "lmao", # laughter = positive
|
| 206 |
+
"clap", "claps", "bravo", "chappal",
|
| 207 |
"heart", "hearts",
|
| 208 |
+
"100", "1000",
|
| 209 |
+
|
| 210 |
+
# ── Greetings / blessings (common in Indian live chats) ──
|
| 211 |
+
"pranam", "pranaam", "namaskar", "namaste", "namasthe",
|
| 212 |
+
"assalamualaikum", "walaikum", "walekum", "waalaikum",
|
| 213 |
+
"jai hind", "jai ho",
|
| 214 |
+
"gm", "gn", "ge",
|
| 215 |
+
"mubarak", "mubarakho",
|
| 216 |
+
"atb",
|
| 217 |
+
"god bless", "stay blessed", "stay safe",
|
| 218 |
+
"welcome", "wlcm", "wlc",
|
| 219 |
+
"congratulations", "congrats",
|
| 220 |
+
"well done", "keep it up", "keep going",
|
| 221 |
+
"proud", "proudly",
|
| 222 |
+
"maza aa gaya", "maza aaya", "maja aa gaya",
|
| 223 |
+
"khyal rakhna",
|
| 224 |
+
"take care",
|
| 225 |
+
"luck", # "good luck", "best of luck" β "luck" alone is positive context
|
| 226 |
+
"morning", # "good morning" β "morning" alone in greeting context
|
| 227 |
+
"evening", # "good evening"
|
| 228 |
}
|
| 229 |
|
| 230 |
# ── Negative keyword set ───────────────────────────────────────────────────────
|
ml/topic_model.py
CHANGED
|
@@ -33,12 +33,12 @@ _APPRECIATION_KW = {
|
|
| 33 |
"thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
|
| 34 |
"tysm", "tqsm", "thx",
|
| 35 |
"informative", "fruitful", "motivating", "lovely",
|
| 36 |
-
"bestest", "loved", "
|
| 37 |
"semma", "mass", "solid", "fire", "goated",
|
| 38 |
}
|
| 39 |
|
| 40 |
_QUESTION_KW = {
|
| 41 |
-
"kya", "kab", "kahan", "kaun", "kitna", "kitne", "konsa", "konsi",
|
| 42 |
"kaise", "kyun", "kyunki",
|
| 43 |
"what", "when", "where", "who", "which", "how", "why",
|
| 44 |
"bata", "batao", "bataye", "tell", "explain",
|
|
@@ -52,7 +52,7 @@ _RF_CONTENT_REQUEST_KW = {
|
|
| 52 |
"karo", "kariye", "karaiye", "kardo",
|
| 53 |
"lao", "laiye", "layiye",
|
| 54 |
"start", "shuru", "launch", "resume",
|
| 55 |
-
"video", "
|
| 56 |
"separate", "alag", "akele", "single",
|
| 57 |
"cover", "include", "add", "topic",
|
| 58 |
"chahiye", "chahte", "chahta", "chahti",
|
|
@@ -66,6 +66,7 @@ _RF_ACADEMIC_KW = {
|
|
| 66 |
"timeline", "schedule", "timetable", "syllabus",
|
| 67 |
"infographic", "slides", "ppt", "handout",
|
| 68 |
"provide", "share", "send", "dedo", "dedijiye",
|
|
|
|
| 69 |
}
|
| 70 |
|
| 71 |
# Language requests
|
|
@@ -121,7 +122,7 @@ _SPAM_PATTERNS = [
|
|
| 121 |
r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
|
| 122 |
r"https?://\S+",
|
| 123 |
r"_{4,}",
|
| 124 |
-
r"(?:\b[a-z0-9]{6,}\b\s*){
|
| 125 |
]
|
| 126 |
|
| 127 |
_SPAM_KW_SUBSTRINGS = {
|
|
@@ -209,7 +210,8 @@ def predict_topic(text: str) -> tuple[str, float]:
|
|
| 209 |
rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)
|
| 210 |
|
| 211 |
# ── Appreciation ──
|
| 212 |
-
|
|
|
|
| 213 |
if (appreciation_hits >= min_appr_hits
|
| 214 |
and question_hits == 0
|
| 215 |
and not has_question_mark
|
|
|
|
| 33 |
"thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
|
| 34 |
"tysm", "tqsm", "thx",
|
| 35 |
"informative", "fruitful", "motivating", "lovely",
|
| 36 |
+
"bestest", "loved", "nice", "helpful",
|
| 37 |
"semma", "mass", "solid", "fire", "goated",
|
| 38 |
}
|
| 39 |
|
| 40 |
_QUESTION_KW = {
|
| 41 |
+
"kya", "kab", "kb", "kahan", "kaun", "kitna", "kitne", "konsa", "konsi",
|
| 42 |
"kaise", "kyun", "kyunki",
|
| 43 |
"what", "when", "where", "who", "which", "how", "why",
|
| 44 |
"bata", "batao", "bataye", "tell", "explain",
|
|
|
|
| 52 |
"karo", "kariye", "karaiye", "kardo",
|
| 53 |
"lao", "laiye", "layiye",
|
| 54 |
"start", "shuru", "launch", "resume",
|
| 55 |
+
"video", "class", "series", # removed "session" and "lecture" β too generic
|
| 56 |
"separate", "alag", "akele", "single",
|
| 57 |
"cover", "include", "add", "topic",
|
| 58 |
"chahiye", "chahte", "chahta", "chahti",
|
|
|
|
| 66 |
"timeline", "schedule", "timetable", "syllabus",
|
| 67 |
"infographic", "slides", "ppt", "handout",
|
| 68 |
"provide", "share", "send", "dedo", "dedijiye",
|
| 69 |
+
"milega", "milegi", "milenge", # "where to find" β specific to resource queries
|
| 70 |
}
|
| 71 |
|
| 72 |
# Language requests
|
|
|
|
| 122 |
r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
|
| 123 |
r"https?://\S+",
|
| 124 |
r"_{4,}",
|
| 125 |
+
r"(?:\b[a-z0-9]{6,}\b\s*){6,}", # raised from 3 to 6 β avoids catching real sentences
|
| 126 |
]
|
| 127 |
|
| 128 |
_SPAM_KW_SUBSTRINGS = {
|
|
|
|
| 210 |
rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)
|
| 211 |
|
| 212 |
# ── Appreciation ──
|
| 213 |
+
# Single strong appreciation word is enough regardless of length
|
| 214 |
+
min_appr_hits = 1
|
| 215 |
if (appreciation_hits >= min_appr_hits
|
| 216 |
and question_hits == 0
|
| 217 |
and not has_question_mark
|
shared.py
CHANGED
|
@@ -268,7 +268,11 @@ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Even
|
|
| 268 |
try:
|
| 269 |
sentiment, s_conf = _safe_sentiment(text)
|
| 270 |
topic, t_conf = _safe_topic(text)
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
except Exception as exc:
|
| 273 |
logger.error("ML inference failed: %s", exc)
|
| 274 |
sentiment, s_conf = "Neutral", 0.5
|
|
|
|
| 268 |
try:
|
| 269 |
sentiment, s_conf = _safe_sentiment(text)
|
| 270 |
topic, t_conf = _safe_topic(text)
|
| 271 |
+
# Only classify action type for Question/Request topics
|
| 272 |
+
if topic in ("Question", "Request/Feedback"):
|
| 273 |
+
action_type, at_conf = _safe_action_type(text)
|
| 274 |
+
else:
|
| 275 |
+
action_type, at_conf = "N/A", 0.50
|
| 276 |
except Exception as exc:
|
| 277 |
logger.error("ML inference failed: %s", exc)
|
| 278 |
sentiment, s_conf = "Neutral", 0.5
|