eshameo045 committed on
Commit
af22fb0
·
1 Parent(s): 676b0cc
Files changed (1) hide show
  1. utils/llm_handler.py +99 -23
utils/llm_handler.py CHANGED
@@ -239,22 +239,74 @@ Include:
239
 
240
  def check_educational(self, transcript: str, title: str = "") -> bool:
241
  """
242
- FIX: Pehle sirf obvious non-educational content reject karo.
243
- Doubt ho to ALLOW karo — reject mat karo.
 
244
  """
245
 
246
- # Sirf ye cheezein clearly reject karo
247
- hard_reject_keywords = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  'recipe', 'cooking', 'ingredient', 'tablespoon', 'teaspoon',
249
- 'karahi', 'biryani', 'pakana', 'khana banana',
250
- 'drama serial', 'episode dekho', 'actor', 'actress',
251
- 'song lyrics', 'music video', 'concert',
252
- 'makeup tutorial', 'skincare routine', 'fashion haul',
253
- 'funny prank', 'comedy sketch',
254
- 'gaming gameplay', 'game stream',
255
  ]
 
 
 
256
 
257
- # Ye hain to definitely educational hai
258
  hard_accept_keywords = [
259
  'lecture', 'lesson', 'chapter', 'tutorial', 'course',
260
  'university', 'college', 'school', 'class',
@@ -267,20 +319,44 @@ Include:
267
  'accounting', 'finance', 'networking', 'compiler',
268
  'parh', 'seekhna', 'samajhna', 'taleem', 'ilm', 'sabaq',
269
  'teacher', 'student', 'syllabus', 'notes',
270
- 'explain', 'definition', 'concept', 'theory',
271
  ]
272
-
273
- text = (transcript[:3000] + " " + title).lower()
274
-
275
- # Koi bhi hard_accept mila → turant allow
276
  for kw in hard_accept_keywords:
277
  if kw in text:
278
  return True
279
 
280
- # Sirf tab reject karo jab 3+ hard_reject keywords hon
281
- reject_count = sum(1 for kw in hard_reject_keywords if kw in text)
282
- if reject_count >= 3:
283
- return False
284
-
285
- # Baaki sab cases mein → ALLOW (doubt ka faida user ko do)
286
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  def check_educational(self, transcript: str, title: str = "") -> bool:
241
  """
242
+ Step 1: Title se check karo (agar available ho)
243
+ Step 2: Transcript keywords check karo
244
+ Step 3: Har case mein AI se final confirm karo
245
  """
246
 
247
+ title_lower = title.lower().strip()
248
+ text = (transcript[:3000] + " " + title).lower()
249
+
250
+ # ─── STEP 1: TITLE check (sirf tab jab title available ho) ───
251
+ if title_lower:
252
+ title_reject_patterns = [
253
+ # Songs / Music
254
+ 'official music video', 'official video', 'official audio',
255
+ 'lyrics video', 'lyric video', 'music video',
256
+ 'live performance', 'live at', 'concert', 'official mv',
257
+ '(mv)', 'audio song', 'full song', 'new song',
258
+ # Vlogs
259
+ 'vlog', 'day in my life', 'daily routine', 'morning routine',
260
+ 'night routine', 'week in my life', 'come with me',
261
+ 'grwm', 'get ready with me',
262
+ # Cartoons / Entertainment
263
+ 'cartoon', 'anime', 'episode', 'season', 'animated series',
264
+ 'full episode', 'dubbed', 'subtitled',
265
+ # Other entertainment
266
+ 'reaction video', 'reacting to', 'prank', 'challenge',
267
+ 'funny moments', 'highlights', 'compilation',
268
+ 'trailer', 'teaser', 'behind the scenes',
269
+ ]
270
+ for pattern in title_reject_patterns:
271
+ if pattern in title_lower:
272
+ return False
273
+
274
+ # ─── STEP 2: TRANSCRIPT keyword check ───
275
+
276
+ # Ye hon to → TURANT REJECT (1 bhi kafi hai)
277
+ strong_reject_single = [
278
+ 'official music video', 'music video',
279
+ 'verse 1', 'verse 2', 'chorus', 'pre-chorus', 'bridge', # song structure
280
+ 'subscribe to my channel', 'like share subscribe',
281
+ 'aaj ka vlog', 'aaj ki vlog', 'today\'s vlog',
282
+ 'cartoon network', 'nickelodeon',
283
+ ]
284
+ for kw in strong_reject_single:
285
+ if kw in text:
286
+ return False
287
+
288
+ # Ye 2+ milein to → REJECT
289
+ soft_reject_keywords = [
290
+ # Song related
291
+ 'lyrics', 'singer', 'hook', 'beat drop', 'rap', 'singing',
292
+ # Vlog related
293
+ 'vlog', 'daily routine', 'morning routine', 'outfit', 'haul',
294
+ 'grwm', 'storytime', 'q&a', 'q and a',
295
+ # Cartoon related
296
+ 'cartoon', 'anime', 'animation', 'animated', 'episode', 'character',
297
+ 'villain', 'superhero', 'pixar', 'disney',
298
+ # Cooking
299
  'recipe', 'cooking', 'ingredient', 'tablespoon', 'teaspoon',
300
+ 'karahi', 'biryani', 'pakana', 'khana banana', 'masala',
301
+ # Other
302
+ 'makeup', 'skincare', 'prank', 'challenge', 'funny',
303
+ 'gameplay', 'game stream', 'drama serial',
 
 
304
  ]
305
+ reject_count = sum(1 for kw in soft_reject_keywords if kw in text)
306
+ if reject_count >= 2:
307
+ return False
308
 
309
+ # ─── STEP 3: Educational keywords → ALLOW ───
310
  hard_accept_keywords = [
311
  'lecture', 'lesson', 'chapter', 'tutorial', 'course',
312
  'university', 'college', 'school', 'class',
 
319
  'accounting', 'finance', 'networking', 'compiler',
320
  'parh', 'seekhna', 'samajhna', 'taleem', 'ilm', 'sabaq',
321
  'teacher', 'student', 'syllabus', 'notes',
322
+ 'explain', 'definition', 'concept', 'theory', 'topic',
323
  ]
 
 
 
 
324
  for kw in hard_accept_keywords:
325
  if kw in text:
326
  return True
327
 
328
+ # ─── STEP 4: Kuch clear nahi → AI se ZAROOR pocho ───
329
+ # Title ho ya na ho — AI transcript se analyse karega
330
+ try:
331
+ title_info = f"Title: {title}" if title_lower else "Title: (not available)"
332
+ system_prompt = """You are a strict content classifier. Return ONLY 'yes' or 'no'. No explanation.
333
+
334
+ RETURN 'yes' ONLY IF the content is clearly one of:
335
+ - University or school lecture
336
+ - Coding or programming tutorial
337
+ - Academic subject explanation (math, science, history, etc.)
338
+ - Professional skill training
339
+
340
+ RETURN 'no' IF the content is:
341
+ - Song, music video, lyrics
342
+ - Vlog, daily routine, lifestyle content
343
+ - Cartoon, anime, animated show
344
+ - Cooking, recipe
345
+ - Drama, TV show, movie
346
+ - Gaming, gameplay
347
+ - Comedy, prank, challenge
348
+ - News, entertainment
349
+ - Fashion, beauty, makeup
350
+ - Travel, tourism"""
351
+
352
+ user_prompt = f"""{title_info}
353
+ Transcript (first 1000 chars):
354
+ {transcript[:1000]}
355
+
356
+ Is this educational content? Answer yes or no only."""
357
+
358
+ response = self._call_llm(system_prompt, user_prompt, max_tokens=5)
359
+ return 'yes' in response.lower().strip()
360
+ except:
361
+ # AI fail ho jaye to safe side β†’ reject
362
+ return False