Spaces:
Sleeping
Sleeping
Commit ·
af22fb0
1
Parent(s): 676b0cc
push
Browse files- utils/llm_handler.py +99 -23
utils/llm_handler.py
CHANGED
|
@@ -239,22 +239,74 @@ Include:
|
|
| 239 |
|
| 240 |
def check_educational(self, transcript: str, title: str = "") -> bool:
|
| 241 |
"""
|
| 242 |
-
|
| 243 |
-
|
|
|
|
| 244 |
"""
|
| 245 |
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
'recipe', 'cooking', 'ingredient', 'tablespoon', 'teaspoon',
|
| 249 |
-
'karahi', 'biryani', 'pakana', 'khana banana',
|
| 250 |
-
|
| 251 |
-
'
|
| 252 |
-
'
|
| 253 |
-
'funny prank', 'comedy sketch',
|
| 254 |
-
'gaming gameplay', 'game stream',
|
| 255 |
]
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
-
#
|
| 258 |
hard_accept_keywords = [
|
| 259 |
'lecture', 'lesson', 'chapter', 'tutorial', 'course',
|
| 260 |
'university', 'college', 'school', 'class',
|
|
@@ -267,20 +319,44 @@ Include:
|
|
| 267 |
'accounting', 'finance', 'networking', 'compiler',
|
| 268 |
'parh', 'seekhna', 'samajhna', 'taleem', 'ilm', 'sabaq',
|
| 269 |
'teacher', 'student', 'syllabus', 'notes',
|
| 270 |
-
'explain', 'definition', 'concept', 'theory',
|
| 271 |
]
|
| 272 |
-
|
| 273 |
-
text = (transcript[:3000] + " " + title).lower()
|
| 274 |
-
|
| 275 |
-
# Koi bhi hard_accept mila → turant allow
|
| 276 |
for kw in hard_accept_keywords:
|
| 277 |
if kw in text:
|
| 278 |
return True
|
| 279 |
|
| 280 |
-
#
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
def check_educational(self, transcript: str, title: str = "") -> bool:
|
| 241 |
"""
|
| 242 |
+
Step 1: Title se check karo (agar available ho)
|
| 243 |
+
Step 2: Transcript keywords check karo
|
| 244 |
+
Step 3: Har case mein AI se final confirm karo
|
| 245 |
"""
|
| 246 |
|
| 247 |
+
title_lower = title.lower().strip()
|
| 248 |
+
text = (transcript[:3000] + " " + title).lower()
|
| 249 |
+
|
| 250 |
+
# ─── STEP 1: TITLE check (sirf tab jab title available ho) ───
|
| 251 |
+
if title_lower:
|
| 252 |
+
title_reject_patterns = [
|
| 253 |
+
# Songs / Music
|
| 254 |
+
'official music video', 'official video', 'official audio',
|
| 255 |
+
'lyrics video', 'lyric video', 'music video',
|
| 256 |
+
'live performance', 'live at', 'concert', 'official mv',
|
| 257 |
+
'(mv)', 'audio song', 'full song', 'new song',
|
| 258 |
+
# Vlogs
|
| 259 |
+
'vlog', 'day in my life', 'daily routine', 'morning routine',
|
| 260 |
+
'night routine', 'week in my life', 'come with me',
|
| 261 |
+
'grwm', 'get ready with me',
|
| 262 |
+
# Cartoons / Entertainment
|
| 263 |
+
'cartoon', 'anime', 'episode', 'season', 'animated series',
|
| 264 |
+
'full episode', 'dubbed', 'subtitled',
|
| 265 |
+
# Other entertainment
|
| 266 |
+
'reaction video', 'reacting to', 'prank', 'challenge',
|
| 267 |
+
'funny moments', 'highlights', 'compilation',
|
| 268 |
+
'trailer', 'teaser', 'behind the scenes',
|
| 269 |
+
]
|
| 270 |
+
for pattern in title_reject_patterns:
|
| 271 |
+
if pattern in title_lower:
|
| 272 |
+
return False
|
| 273 |
+
|
| 274 |
+
# ─── STEP 2: TRANSCRIPT keyword check ───
|
| 275 |
+
|
| 276 |
+
# Ye hon to → TURANT REJECT (1 bhi kafi hai)
|
| 277 |
+
strong_reject_single = [
|
| 278 |
+
'official music video', 'music video',
|
| 279 |
+
'verse 1', 'verse 2', 'chorus', 'pre-chorus', 'bridge', # song structure
|
| 280 |
+
'subscribe to my channel', 'like share subscribe',
|
| 281 |
+
'aaj ka vlog', 'aaj ki vlog', 'today\'s vlog',
|
| 282 |
+
'cartoon network', 'nickelodeon',
|
| 283 |
+
]
|
| 284 |
+
for kw in strong_reject_single:
|
| 285 |
+
if kw in text:
|
| 286 |
+
return False
|
| 287 |
+
|
| 288 |
+
# Ye 2+ milein to → REJECT
|
| 289 |
+
soft_reject_keywords = [
|
| 290 |
+
# Song related
|
| 291 |
+
'lyrics', 'singer', 'hook', 'beat drop', 'rap', 'singing',
|
| 292 |
+
# Vlog related
|
| 293 |
+
'vlog', 'daily routine', 'morning routine', 'outfit', 'haul',
|
| 294 |
+
'grwm', 'storytime', 'q&a', 'q and a',
|
| 295 |
+
# Cartoon related
|
| 296 |
+
'cartoon', 'anime', 'animation', 'animated', 'episode', 'character',
|
| 297 |
+
'villain', 'superhero', 'pixar', 'disney',
|
| 298 |
+
# Cooking
|
| 299 |
'recipe', 'cooking', 'ingredient', 'tablespoon', 'teaspoon',
|
| 300 |
+
'karahi', 'biryani', 'pakana', 'khana banana', 'masala',
|
| 301 |
+
# Other
|
| 302 |
+
'makeup', 'skincare', 'prank', 'challenge', 'funny',
|
| 303 |
+
'gameplay', 'game stream', 'drama serial',
|
|
|
|
|
|
|
| 304 |
]
|
| 305 |
+
reject_count = sum(1 for kw in soft_reject_keywords if kw in text)
|
| 306 |
+
if reject_count >= 2:
|
| 307 |
+
return False
|
| 308 |
|
| 309 |
+
# ─── STEP 3: Educational keywords → ALLOW ───
|
| 310 |
hard_accept_keywords = [
|
| 311 |
'lecture', 'lesson', 'chapter', 'tutorial', 'course',
|
| 312 |
'university', 'college', 'school', 'class',
|
|
|
|
| 319 |
'accounting', 'finance', 'networking', 'compiler',
|
| 320 |
'parh', 'seekhna', 'samajhna', 'taleem', 'ilm', 'sabaq',
|
| 321 |
'teacher', 'student', 'syllabus', 'notes',
|
| 322 |
+
'explain', 'definition', 'concept', 'theory', 'topic',
|
| 323 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
for kw in hard_accept_keywords:
|
| 325 |
if kw in text:
|
| 326 |
return True
|
| 327 |
|
| 328 |
+
# ─── STEP 4: Kuch clear nahi → AI se ZAROOR pocho ───
|
| 329 |
+
# Title ho ya na ho β AI transcript se analyse karega
|
| 330 |
+
try:
|
| 331 |
+
title_info = f"Title: {title}" if title_lower else "Title: (not available)"
|
| 332 |
+
system_prompt = """You are a strict content classifier. Return ONLY 'yes' or 'no'. No explanation.
|
| 333 |
+
|
| 334 |
+
RETURN 'yes' ONLY IF the content is clearly one of:
|
| 335 |
+
- University or school lecture
|
| 336 |
+
- Coding or programming tutorial
|
| 337 |
+
- Academic subject explanation (math, science, history, etc.)
|
| 338 |
+
- Professional skill training
|
| 339 |
+
|
| 340 |
+
RETURN 'no' IF the content is:
|
| 341 |
+
- Song, music video, lyrics
|
| 342 |
+
- Vlog, daily routine, lifestyle content
|
| 343 |
+
- Cartoon, anime, animated show
|
| 344 |
+
- Cooking, recipe
|
| 345 |
+
- Drama, TV show, movie
|
| 346 |
+
- Gaming, gameplay
|
| 347 |
+
- Comedy, prank, challenge
|
| 348 |
+
- News, entertainment
|
| 349 |
+
- Fashion, beauty, makeup
|
| 350 |
+
- Travel, tourism"""
|
| 351 |
+
|
| 352 |
+
user_prompt = f"""{title_info}
|
| 353 |
+
Transcript (first 1000 chars):
|
| 354 |
+
{transcript[:1000]}
|
| 355 |
+
|
| 356 |
+
Is this educational content? Answer yes or no only."""
|
| 357 |
+
|
| 358 |
+
response = self._call_llm(system_prompt, user_prompt, max_tokens=5)
|
| 359 |
+
return 'yes' in response.lower().strip()
|
| 360 |
+
except:
|
| 361 |
+
# AI fail ho jaye to safe side → reject
|
| 362 |
+
return False
|