Chaitanya-aitf committed on
Commit
9d6c396
·
verified ·
1 Parent(s): 70442bb

Update scoring/podcast_context.py

Browse files
Files changed (1) hide show
  1. scoring/podcast_context.py +143 -34
scoring/podcast_context.py CHANGED
@@ -56,9 +56,11 @@ class ContextAnalysis:
56
  # Scores
57
  self_contained_score: float # 0-1: How understandable is this clip alone?
58
  context_clarity_score: float # 0-1: How clear is the context?
 
59
 
60
- # Issues found
61
- issues: List[Tuple[ContextIssue, str, float]] = field(default_factory=list) # (issue, detail, timestamp)
 
62
 
63
  # Strengths found
64
  strengths: List[Tuple[ContextStrength, str, float]] = field(default_factory=list)
@@ -166,6 +168,14 @@ STRONG_OPENER_PATTERNS = [
166
  r'^(?:the\s+(?:real|actual|honest)\s+(?:answer|truth|reason))',
167
  ]
168
 
 
 
 
 
 
 
 
 
169
 
170
  class PodcastContextAnalyzer:
171
  """
@@ -219,12 +229,22 @@ class PodcastContextAnalyzer:
219
  # Calculate scores
220
  analysis.self_contained_score = self._calculate_self_contained_score(analysis)
221
  analysis.context_clarity_score = self._calculate_clarity_score(analysis)
 
222
 
223
  # Find question if this looks like an answer
224
- if full_transcript and self._looks_like_answer(clip_transcript):
225
- self._find_preceding_question(
226
- clip_start, full_transcript, analysis
227
- )
 
 
 
 
 
 
 
 
 
228
 
229
  # Determine if expansion or intro is needed
230
  self._recommend_adjustments(analysis, full_transcript)
@@ -245,6 +265,7 @@ class PodcastContextAnalyzer:
245
  ) -> None:
246
  """Detect context issues in the clip text."""
247
  text_lower = text.lower()
 
248
  sentences = self._split_sentences(text)
249
 
250
  # Check first sentence specifically (most important for hooks)
@@ -252,23 +273,34 @@ class PodcastContextAnalyzer:
252
  first_lower = first_sentence.lower()
253
 
254
  # 1. Pronouns without antecedents (especially at start)
255
- for pattern in PRONOUN_PATTERNS:
256
- matches = re.findall(pattern, first_lower, re.IGNORECASE)
257
- if matches:
258
- analysis.issues.append((
259
- ContextIssue.PRONOUN_WITHOUT_ANTECEDENT,
260
- f"Pronoun '{matches[0]}' at start without clear reference",
261
- start_time
262
- ))
 
 
 
 
 
 
 
263
 
264
  # 2. References to earlier content
265
  for pattern in REFERENCE_TO_EARLIER_PATTERNS:
266
- if re.search(pattern, text_lower, re.IGNORECASE):
 
 
267
  analysis.issues.append((
268
  ContextIssue.REFERENCE_TO_EARLIER,
269
  f"Reference to earlier content: {pattern}",
270
- start_time
 
271
  ))
 
272
 
273
  # 3. Mid-argument start detection
274
  mid_argument_starters = [
@@ -281,7 +313,8 @@ class PodcastContextAnalyzer:
281
  analysis.issues.append((
282
  ContextIssue.MID_ARGUMENT_START,
283
  "Clip starts mid-argument/mid-thought",
284
- start_time
 
285
  ))
286
  break
287
 
@@ -290,7 +323,8 @@ class PodcastContextAnalyzer:
290
  analysis.issues.append((
291
  ContextIssue.INCOMPLETE_THOUGHT,
292
  "Clip ends with incomplete thought",
293
- start_time
 
294
  ))
295
 
296
  def _detect_context_strengths(
@@ -344,20 +378,36 @@ class PodcastContextAnalyzer:
344
  start_time
345
  ))
346
 
 
 
 
 
 
 
 
 
 
 
347
  def _calculate_self_contained_score(self, analysis: ContextAnalysis) -> float:
348
  """Calculate how self-contained the clip is."""
349
  score = 1.0
350
 
351
- # Apply penalties
352
- for issue, detail, _ in analysis.issues:
 
 
 
 
353
  if issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
354
- score -= self.config.pronoun_penalty
355
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
356
- score -= self.config.reference_penalty
357
  elif issue == ContextIssue.MID_ARGUMENT_START:
358
- score -= self.config.mid_argument_penalty
359
  elif issue == ContextIssue.INCOMPLETE_THOUGHT:
360
- score -= 0.10
 
 
361
 
362
  # Apply bonuses
363
  for strength, detail, _ in analysis.strengths:
@@ -369,6 +419,8 @@ class PodcastContextAnalyzer:
369
  score += self.config.cause_effect_bonus
370
  elif strength == ContextStrength.QUESTION_ANSWER_PAIR:
371
  score += self.config.question_answer_bonus
 
 
372
 
373
  return max(0.0, min(1.0, score))
374
 
@@ -376,14 +428,18 @@ class PodcastContextAnalyzer:
376
  """Calculate how clear the context is."""
377
  score = 1.0
378
 
379
- # Heavy penalty for mid-argument starts
380
- for issue, _, _ in analysis.issues:
 
 
381
  if issue == ContextIssue.MID_ARGUMENT_START:
382
- score -= 0.30
383
  elif issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
384
- score -= 0.15
385
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
386
- score -= 0.25
 
 
387
 
388
  # Bonus for explicit context
389
  for strength, _, _ in analysis.strengths:
@@ -394,6 +450,41 @@ class PodcastContextAnalyzer:
394
 
395
  return max(0.0, min(1.0, score))
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def _looks_like_answer(self, text: str) -> bool:
398
  """Check if text looks like an answer to a question."""
399
  text_lower = text.lower().strip()
@@ -460,7 +551,7 @@ class PodcastContextAnalyzer:
460
  # Check if expansion is needed
461
  needs_expansion = False
462
 
463
- for issue, _, _ in analysis.issues:
464
  if issue == ContextIssue.MID_ARGUMENT_START and self.config.expand_on_mid_argument:
465
  needs_expansion = True
466
  elif issue == ContextIssue.MISSING_QUESTION and self.config.expand_on_missing_question:
@@ -547,10 +638,24 @@ class PodcastContextAnalyzer:
547
  return best_start
548
 
549
  def _split_sentences(self, text: str) -> List[str]:
550
- """Split text into sentences."""
551
  # Simple sentence splitter
552
  sentences = re.split(r'(?<=[.!?])\s+', text)
553
- return [s.strip() for s in sentences if s.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
  def score_clip_for_reranking(
556
  self,
@@ -585,6 +690,7 @@ class PodcastContextAnalyzer:
585
 
586
  # Don't penalize too harshly - context issues can be fixed
587
  # But do reward good context
 
588
  if context_factor >= 0.7:
589
  # Good context: slight boost
590
  adjustment = 1.0 + (context_factor - 0.7) * 0.3 # Up to 1.09x
@@ -592,8 +698,11 @@ class PodcastContextAnalyzer:
592
  # Okay context: neutral
593
  adjustment = 1.0
594
  else:
595
- # Poor context: penalize
596
- adjustment = 0.8 + context_factor * 0.4 # 0.8x to 1.0x
 
 
 
597
 
598
  adjusted_score = original_hype_score * adjustment
599
 
 
56
  # Scores
57
  self_contained_score: float # 0-1: How understandable is this clip alone?
58
  context_clarity_score: float # 0-1: How clear is the context?
59
+ fixability_score: float = 1.0 # 0-1: How easy to fix issues (1.0 = easy/no issues)
60
 
61
+ # Issues found - (issue, detail, timestamp, position_ratio)
62
+ # position_ratio: 0.0 = start of clip, 1.0 = end of clip
63
+ issues: List[Tuple[ContextIssue, str, float, float]] = field(default_factory=list)
64
 
65
  # Strengths found
66
  strengths: List[Tuple[ContextStrength, str, float]] = field(default_factory=list)
 
168
  r'^(?:the\s+(?:real|actual|honest)\s+(?:answer|truth|reason))',
169
  ]
170
 
171
# Openers that signal a self-explanatory claim ("the reason is...", "in other
# words...") — clips starting this way tend to carry their own context, so
# matching one of these counts as a context strength, not an issue.
# All patterns are anchored at the start of the (lowercased) first sentence.
SELF_CONTAINED_CLAIM_PATTERNS = [
    r'^(?:the\s+reason\s+(?:is|was|why))',
    r'^(?:this\s+means\s+that)',
    r'^(?:what\s+this\s+(?:shows|proves|demonstrates)\s+is)',
    r'^(?:the\s+(?:bottom\s+line|takeaway|conclusion)\s+is)',
    r'^(?:in\s+other\s+words)',
]
178
+
179
 
180
  class PodcastContextAnalyzer:
181
  """
 
229
  # Calculate scores
230
  analysis.self_contained_score = self._calculate_self_contained_score(analysis)
231
  analysis.context_clarity_score = self._calculate_clarity_score(analysis)
232
+ analysis.fixability_score = self._calculate_fixability_score(analysis)
233
 
234
  # Find question if this looks like an answer
235
+ if self._looks_like_answer(clip_transcript):
236
+ if full_transcript:
237
+ self._find_preceding_question(
238
+ clip_start, full_transcript, analysis
239
+ )
240
+ # Bug fix: Add MISSING_QUESTION issue if answer detected but no question found
241
+ if not analysis.question_timestamp:
242
+ analysis.issues.append((
243
+ ContextIssue.MISSING_QUESTION,
244
+ "Answer without preceding question found",
245
+ clip_start,
246
+ 0.0 # Effectively a "start" issue since context is missing
247
+ ))
248
 
249
  # Determine if expansion or intro is needed
250
  self._recommend_adjustments(analysis, full_transcript)
 
265
  ) -> None:
266
  """Detect context issues in the clip text."""
267
  text_lower = text.lower()
268
+ text_len = len(text) if text else 1
269
  sentences = self._split_sentences(text)
270
 
271
  # Check first sentence specifically (most important for hooks)
 
273
  first_lower = first_sentence.lower()
274
 
275
  # 1. Pronouns without antecedents (especially at start)
276
+ # Only penalize if pronoun appears before any explicit noun in the clip
277
+ has_explicit_noun_early = bool(re.match(r'^[^.!?]*\b[A-Z][a-z]+', first_sentence))
278
+ if not has_explicit_noun_early:
279
+ for pattern in PRONOUN_PATTERNS:
280
+ match = re.search(pattern, first_lower, re.IGNORECASE)
281
+ if match:
282
+ # Calculate position ratio (0.0 = start, 1.0 = end)
283
+ position_ratio = match.start() / text_len
284
+ analysis.issues.append((
285
+ ContextIssue.PRONOUN_WITHOUT_ANTECEDENT,
286
+ f"Pronoun '{match.group()}' at start without clear reference",
287
+ start_time,
288
+ position_ratio
289
+ ))
290
+ break # Only add one pronoun issue per clip
291
 
292
  # 2. References to earlier content
293
  for pattern in REFERENCE_TO_EARLIER_PATTERNS:
294
+ match = re.search(pattern, text_lower, re.IGNORECASE)
295
+ if match:
296
+ position_ratio = match.start() / text_len
297
  analysis.issues.append((
298
  ContextIssue.REFERENCE_TO_EARLIER,
299
  f"Reference to earlier content: {pattern}",
300
+ start_time,
301
+ position_ratio
302
  ))
303
+ break # One reference issue is enough
304
 
305
  # 3. Mid-argument start detection
306
  mid_argument_starters = [
 
313
  analysis.issues.append((
314
  ContextIssue.MID_ARGUMENT_START,
315
  "Clip starts mid-argument/mid-thought",
316
+ start_time,
317
+ 0.0 # Always at start
318
  ))
319
  break
320
 
 
323
  analysis.issues.append((
324
  ContextIssue.INCOMPLETE_THOUGHT,
325
  "Clip ends with incomplete thought",
326
+ start_time,
327
+ 1.0 # Always at end
328
  ))
329
 
330
  def _detect_context_strengths(
 
378
  start_time
379
  ))
380
 
381
+ # 5. Self-contained claims (explains itself)
382
+ for pattern in SELF_CONTAINED_CLAIM_PATTERNS:
383
+ if re.match(pattern, first_lower, re.IGNORECASE):
384
+ analysis.strengths.append((
385
+ ContextStrength.SELF_CONTAINED_CLAIM,
386
+ "Self-explanatory claim structure",
387
+ start_time
388
+ ))
389
+ break
390
+
391
  def _calculate_self_contained_score(self, analysis: ContextAnalysis) -> float:
392
  """Calculate how self-contained the clip is."""
393
  score = 1.0
394
 
395
+ # Apply penalties with position weighting
396
+ # Issues at start (position_ratio ~0) get full penalty
397
+ # Issues later (position_ratio ~1) get reduced penalty
398
+ for issue, detail, _, position_ratio in analysis.issues:
399
+ position_weight = 1.0 - (position_ratio * 0.7) # 1.0 at start, 0.3 at end
400
+
401
  if issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
402
+ score -= self.config.pronoun_penalty * position_weight
403
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
404
+ score -= self.config.reference_penalty * position_weight
405
  elif issue == ContextIssue.MID_ARGUMENT_START:
406
+ score -= self.config.mid_argument_penalty # Full penalty (always at start)
407
  elif issue == ContextIssue.INCOMPLETE_THOUGHT:
408
+ score -= 0.10 # End issues don't need position weighting
409
+ elif issue == ContextIssue.MISSING_QUESTION:
410
+ score -= 0.15 * position_weight
411
 
412
  # Apply bonuses
413
  for strength, detail, _ in analysis.strengths:
 
419
  score += self.config.cause_effect_bonus
420
  elif strength == ContextStrength.QUESTION_ANSWER_PAIR:
421
  score += self.config.question_answer_bonus
422
+ elif strength == ContextStrength.SELF_CONTAINED_CLAIM:
423
+ score += 0.10
424
 
425
  return max(0.0, min(1.0, score))
426
 
 
428
  """Calculate how clear the context is."""
429
  score = 1.0
430
 
431
+ # Heavy penalty for mid-argument starts, with position weighting
432
+ for issue, _, _, position_ratio in analysis.issues:
433
+ position_weight = 1.0 - (position_ratio * 0.7)
434
+
435
  if issue == ContextIssue.MID_ARGUMENT_START:
436
+ score -= 0.30 # Full penalty (always at start)
437
  elif issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
438
+ score -= 0.15 * position_weight
439
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
440
+ score -= 0.25 * position_weight
441
+ elif issue == ContextIssue.MISSING_QUESTION:
442
+ score -= 0.20
443
 
444
  # Bonus for explicit context
445
  for strength, _, _ in analysis.strengths:
 
450
 
451
  return max(0.0, min(1.0, score))
452
 
453
+ def _calculate_fixability_score(self, analysis: ContextAnalysis) -> float:
454
+ """
455
+ Calculate how easy it is to fix context issues.
456
+
457
+ Hard issues: MID_ARGUMENT_START, REFERENCE_TO_EARLIER (require expansion)
458
+ Easy issues: PRONOUN_WITHOUT_ANTECEDENT, MISSING_QUESTION (can add intro)
459
+ """
460
+ if not analysis.issues:
461
+ return 1.0
462
+
463
+ # Categorize issues by difficulty
464
+ hard_issues = {
465
+ ContextIssue.MID_ARGUMENT_START,
466
+ ContextIssue.REFERENCE_TO_EARLIER,
467
+ ContextIssue.ASSUMES_PRIOR_KNOWLEDGE,
468
+ }
469
+ easy_issues = {
470
+ ContextIssue.PRONOUN_WITHOUT_ANTECEDENT,
471
+ ContextIssue.MISSING_QUESTION,
472
+ ContextIssue.INCOMPLETE_THOUGHT,
473
+ }
474
+
475
+ hard_count = sum(1 for issue, _, _, _ in analysis.issues if issue in hard_issues)
476
+ easy_count = sum(1 for issue, _, _, _ in analysis.issues if issue in easy_issues)
477
+ total = hard_count + easy_count
478
+
479
+ if total == 0:
480
+ return 1.0
481
+
482
+ # Fixability: 1.0 if all easy, 0.0 if all hard
483
+ # Easy issues contribute 0.8 to fixability, hard issues contribute 0.2
484
+ fixability = (easy_count * 0.8 + hard_count * 0.2) / total
485
+
486
+ return max(0.0, min(1.0, fixability))
487
+
488
  def _looks_like_answer(self, text: str) -> bool:
489
  """Check if text looks like an answer to a question."""
490
  text_lower = text.lower().strip()
 
551
  # Check if expansion is needed
552
  needs_expansion = False
553
 
554
+ for issue, _, _, _ in analysis.issues:
555
  if issue == ContextIssue.MID_ARGUMENT_START and self.config.expand_on_mid_argument:
556
  needs_expansion = True
557
  elif issue == ContextIssue.MISSING_QUESTION and self.config.expand_on_missing_question:
 
638
  return best_start
639
 
640
  def _split_sentences(self, text: str) -> List[str]:
641
+ """Split text into sentences with fallback for unpunctuated transcripts."""
642
  # Simple sentence splitter
643
  sentences = re.split(r'(?<=[.!?])\s+', text)
644
+ sentences = [s.strip() for s in sentences if s.strip()]
645
+
646
+ # Fallback: if only one "sentence" and it's very long (likely unpunctuated transcript),
647
+ # treat the first ~200 chars as the "first sentence" for analysis purposes
648
+ if len(sentences) == 1 and len(sentences[0]) > 200:
649
+ # Try to find a natural break point (comma, conjunction)
650
+ first_chunk = sentences[0][:200]
651
+ break_match = re.search(r'[,;]\s+|\s+(?:and|but|so|because)\s+', first_chunk[100:])
652
+ if break_match:
653
+ cut_point = 100 + break_match.start()
654
+ sentences = [sentences[0][:cut_point].strip(), sentences[0][cut_point:].strip()]
655
+ else:
656
+ sentences = [first_chunk.strip()]
657
+
658
+ return sentences
659
 
660
  def score_clip_for_reranking(
661
  self,
 
690
 
691
  # Don't penalize too harshly - context issues can be fixed
692
  # But do reward good context
693
+ # Use fixability to soften penalties for easily-fixable clips
694
  if context_factor >= 0.7:
695
  # Good context: slight boost
696
  adjustment = 1.0 + (context_factor - 0.7) * 0.3 # Up to 1.09x
 
698
  # Okay context: neutral
699
  adjustment = 1.0
700
  else:
701
+ # Poor context: penalize, but less if fixable
702
+ base_penalty = 0.8 + context_factor * 0.4 # 0.8x to 1.0x
703
+ # Fixable clips get softer penalty
704
+ fixability_boost = analysis.fixability_score * 0.15 # Up to +0.15
705
+ adjustment = min(1.0, base_penalty + fixability_boost)
706
 
707
  adjusted_score = original_hype_score * adjustment
708