Chaitanya-aitf committed on
Commit
9d6c396
·
verified ·
1 Parent(s): 70442bb

Update scoring/podcast_context.py

Browse files
Files changed (1) hide show
  1. scoring/podcast_context.py +143 -34
scoring/podcast_context.py CHANGED
@@ -56,9 +56,11 @@ class ContextAnalysis:
56
  # Scores
57
  self_contained_score: float # 0-1: How understandable is this clip alone?
58
  context_clarity_score: float # 0-1: How clear is the context?
 
59
 
60
- # Issues found
61
- issues: List[Tuple[ContextIssue, str, float]] = field(default_factory=list) # (issue, detail, timestamp)
 
62
 
63
  # Strengths found
64
  strengths: List[Tuple[ContextStrength, str, float]] = field(default_factory=list)
@@ -166,6 +168,14 @@ STRONG_OPENER_PATTERNS = [
166
  r'^(?:the\s+(?:real|actual|honest)\s+(?:answer|truth|reason))',
167
  ]
168
 
 
 
 
 
 
 
 
 
169
 
170
  class PodcastContextAnalyzer:
171
  """
@@ -219,12 +229,22 @@ class PodcastContextAnalyzer:
219
  # Calculate scores
220
  analysis.self_contained_score = self._calculate_self_contained_score(analysis)
221
  analysis.context_clarity_score = self._calculate_clarity_score(analysis)
 
222
 
223
  # Find question if this looks like an answer
224
- if full_transcript and self._looks_like_answer(clip_transcript):
225
- self._find_preceding_question(
226
- clip_start, full_transcript, analysis
227
- )
 
 
 
 
 
 
 
 
 
228
 
229
  # Determine if expansion or intro is needed
230
  self._recommend_adjustments(analysis, full_transcript)
@@ -245,6 +265,7 @@ class PodcastContextAnalyzer:
245
  ) -> None:
246
  """Detect context issues in the clip text."""
247
  text_lower = text.lower()
 
248
  sentences = self._split_sentences(text)
249
 
250
  # Check first sentence specifically (most important for hooks)
@@ -252,23 +273,34 @@ class PodcastContextAnalyzer:
252
  first_lower = first_sentence.lower()
253
 
254
  # 1. Pronouns without antecedents (especially at start)
255
- for pattern in PRONOUN_PATTERNS:
256
- matches = re.findall(pattern, first_lower, re.IGNORECASE)
257
- if matches:
258
- analysis.issues.append((
259
- ContextIssue.PRONOUN_WITHOUT_ANTECEDENT,
260
- f"Pronoun '{matches[0]}' at start without clear reference",
261
- start_time
262
- ))
 
 
 
 
 
 
 
263
 
264
  # 2. References to earlier content
265
  for pattern in REFERENCE_TO_EARLIER_PATTERNS:
266
- if re.search(pattern, text_lower, re.IGNORECASE):
 
 
267
  analysis.issues.append((
268
  ContextIssue.REFERENCE_TO_EARLIER,
269
  f"Reference to earlier content: {pattern}",
270
- start_time
 
271
  ))
 
272
 
273
  # 3. Mid-argument start detection
274
  mid_argument_starters = [
@@ -281,7 +313,8 @@ class PodcastContextAnalyzer:
281
  analysis.issues.append((
282
  ContextIssue.MID_ARGUMENT_START,
283
  "Clip starts mid-argument/mid-thought",
284
- start_time
 
285
  ))
286
  break
287
 
@@ -290,7 +323,8 @@ class PodcastContextAnalyzer:
290
  analysis.issues.append((
291
  ContextIssue.INCOMPLETE_THOUGHT,
292
  "Clip ends with incomplete thought",
293
- start_time
 
294
  ))
295
 
296
  def _detect_context_strengths(
@@ -344,20 +378,36 @@ class PodcastContextAnalyzer:
344
  start_time
345
  ))
346
 
 
 
 
 
 
 
 
 
 
 
347
  def _calculate_self_contained_score(self, analysis: ContextAnalysis) -> float:
348
  """Calculate how self-contained the clip is."""
349
  score = 1.0
350
 
351
- # Apply penalties
352
- for issue, detail, _ in analysis.issues:
 
 
 
 
353
  if issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
354
- score -= self.config.pronoun_penalty
355
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
356
- score -= self.config.reference_penalty
357
  elif issue == ContextIssue.MID_ARGUMENT_START:
358
- score -= self.config.mid_argument_penalty
359
  elif issue == ContextIssue.INCOMPLETE_THOUGHT:
360
- score -= 0.10
 
 
361
 
362
  # Apply bonuses
363
  for strength, detail, _ in analysis.strengths:
@@ -369,6 +419,8 @@ class PodcastContextAnalyzer:
369
  score += self.config.cause_effect_bonus
370
  elif strength == ContextStrength.QUESTION_ANSWER_PAIR:
371
  score += self.config.question_answer_bonus
 
 
372
 
373
  return max(0.0, min(1.0, score))
374
 
@@ -376,14 +428,18 @@ class PodcastContextAnalyzer:
376
  """Calculate how clear the context is."""
377
  score = 1.0
378
 
379
- # Heavy penalty for mid-argument starts
380
- for issue, _, _ in analysis.issues:
 
 
381
  if issue == ContextIssue.MID_ARGUMENT_START:
382
- score -= 0.30
383
  elif issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
384
- score -= 0.15
385
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
386
- score -= 0.25
 
 
387
 
388
  # Bonus for explicit context
389
  for strength, _, _ in analysis.strengths:
@@ -394,6 +450,41 @@ class PodcastContextAnalyzer:
394
 
395
  return max(0.0, min(1.0, score))
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  def _looks_like_answer(self, text: str) -> bool:
398
  """Check if text looks like an answer to a question."""
399
  text_lower = text.lower().strip()
@@ -460,7 +551,7 @@ class PodcastContextAnalyzer:
460
  # Check if expansion is needed
461
  needs_expansion = False
462
 
463
- for issue, _, _ in analysis.issues:
464
  if issue == ContextIssue.MID_ARGUMENT_START and self.config.expand_on_mid_argument:
465
  needs_expansion = True
466
  elif issue == ContextIssue.MISSING_QUESTION and self.config.expand_on_missing_question:
@@ -547,10 +638,24 @@ class PodcastContextAnalyzer:
547
  return best_start
548
 
549
  def _split_sentences(self, text: str) -> List[str]:
550
- """Split text into sentences."""
551
  # Simple sentence splitter
552
  sentences = re.split(r'(?<=[.!?])\s+', text)
553
- return [s.strip() for s in sentences if s.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
 
555
  def score_clip_for_reranking(
556
  self,
@@ -585,6 +690,7 @@ class PodcastContextAnalyzer:
585
 
586
  # Don't penalize too harshly - context issues can be fixed
587
  # But do reward good context
 
588
  if context_factor >= 0.7:
589
  # Good context: slight boost
590
  adjustment = 1.0 + (context_factor - 0.7) * 0.3 # Up to 1.09x
@@ -592,8 +698,11 @@ class PodcastContextAnalyzer:
592
  # Okay context: neutral
593
  adjustment = 1.0
594
  else:
595
- # Poor context: penalize
596
- adjustment = 0.8 + context_factor * 0.4 # 0.8x to 1.0x
 
 
 
597
 
598
  adjusted_score = original_hype_score * adjustment
599
 
 
56
  # Scores
57
  self_contained_score: float # 0-1: How understandable is this clip alone?
58
  context_clarity_score: float # 0-1: How clear is the context?
59
+ fixability_score: float = 1.0 # 0-1: How easy to fix issues (1.0 = easy/no issues)
60
 
61
+ # Issues found - (issue, detail, timestamp, position_ratio)
62
+ # position_ratio: 0.0 = start of clip, 1.0 = end of clip
63
+ issues: List[Tuple[ContextIssue, str, float, float]] = field(default_factory=list)
64
 
65
  # Strengths found
66
  strengths: List[Tuple[ContextStrength, str, float]] = field(default_factory=list)
 
168
  r'^(?:the\s+(?:real|actual|honest)\s+(?:answer|truth|reason))',
169
  ]
170
 
171
# Openers that signal a self-explanatory claim ("the reason is...", "in other
# words...") — clips starting this way tend to carry their own context, so
# matching one of these counts as a context strength, not an issue.
# All patterns are anchored at the start of the (lowercased) first sentence.
SELF_CONTAINED_CLAIM_PATTERNS = [
    r'^(?:the\s+reason\s+(?:is|was|why))',
    r'^(?:this\s+means\s+that)',
    r'^(?:what\s+this\s+(?:shows|proves|demonstrates)\s+is)',
    r'^(?:the\s+(?:bottom\s+line|takeaway|conclusion)\s+is)',
    r'^(?:in\s+other\s+words)',
]
178
+
179
 
180
  class PodcastContextAnalyzer:
181
  """
 
229
  # Calculate scores
230
  analysis.self_contained_score = self._calculate_self_contained_score(analysis)
231
  analysis.context_clarity_score = self._calculate_clarity_score(analysis)
232
+ analysis.fixability_score = self._calculate_fixability_score(analysis)
233
 
234
  # Find question if this looks like an answer
235
+ if self._looks_like_answer(clip_transcript):
236
+ if full_transcript:
237
+ self._find_preceding_question(
238
+ clip_start, full_transcript, analysis
239
+ )
240
+ # Bug fix: Add MISSING_QUESTION issue if answer detected but no question found
241
+ if not analysis.question_timestamp:
242
+ analysis.issues.append((
243
+ ContextIssue.MISSING_QUESTION,
244
+ "Answer without preceding question found",
245
+ clip_start,
246
+ 0.0 # Effectively a "start" issue since context is missing
247
+ ))
248
 
249
  # Determine if expansion or intro is needed
250
  self._recommend_adjustments(analysis, full_transcript)
 
265
  ) -> None:
266
  """Detect context issues in the clip text."""
267
  text_lower = text.lower()
268
+ text_len = len(text) if text else 1
269
  sentences = self._split_sentences(text)
270
 
271
  # Check first sentence specifically (most important for hooks)
 
273
  first_lower = first_sentence.lower()
274
 
275
  # 1. Pronouns without antecedents (especially at start)
276
+ # Only penalize if pronoun appears before any explicit noun in the clip
277
+ has_explicit_noun_early = bool(re.match(r'^[^.!?]*\b[A-Z][a-z]+', first_sentence))
278
+ if not has_explicit_noun_early:
279
+ for pattern in PRONOUN_PATTERNS:
280
+ match = re.search(pattern, first_lower, re.IGNORECASE)
281
+ if match:
282
+ # Calculate position ratio (0.0 = start, 1.0 = end)
283
+ position_ratio = match.start() / text_len
284
+ analysis.issues.append((
285
+ ContextIssue.PRONOUN_WITHOUT_ANTECEDENT,
286
+ f"Pronoun '{match.group()}' at start without clear reference",
287
+ start_time,
288
+ position_ratio
289
+ ))
290
+ break # Only add one pronoun issue per clip
291
 
292
  # 2. References to earlier content
293
  for pattern in REFERENCE_TO_EARLIER_PATTERNS:
294
+ match = re.search(pattern, text_lower, re.IGNORECASE)
295
+ if match:
296
+ position_ratio = match.start() / text_len
297
  analysis.issues.append((
298
  ContextIssue.REFERENCE_TO_EARLIER,
299
  f"Reference to earlier content: {pattern}",
300
+ start_time,
301
+ position_ratio
302
  ))
303
+ break # One reference issue is enough
304
 
305
  # 3. Mid-argument start detection
306
  mid_argument_starters = [
 
313
  analysis.issues.append((
314
  ContextIssue.MID_ARGUMENT_START,
315
  "Clip starts mid-argument/mid-thought",
316
+ start_time,
317
+ 0.0 # Always at start
318
  ))
319
  break
320
 
 
323
  analysis.issues.append((
324
  ContextIssue.INCOMPLETE_THOUGHT,
325
  "Clip ends with incomplete thought",
326
+ start_time,
327
+ 1.0 # Always at end
328
  ))
329
 
330
  def _detect_context_strengths(
 
378
  start_time
379
  ))
380
 
381
+ # 5. Self-contained claims (explains itself)
382
+ for pattern in SELF_CONTAINED_CLAIM_PATTERNS:
383
+ if re.match(pattern, first_lower, re.IGNORECASE):
384
+ analysis.strengths.append((
385
+ ContextStrength.SELF_CONTAINED_CLAIM,
386
+ "Self-explanatory claim structure",
387
+ start_time
388
+ ))
389
+ break
390
+
391
  def _calculate_self_contained_score(self, analysis: ContextAnalysis) -> float:
392
  """Calculate how self-contained the clip is."""
393
  score = 1.0
394
 
395
+ # Apply penalties with position weighting
396
+ # Issues at start (position_ratio ~0) get full penalty
397
+ # Issues later (position_ratio ~1) get reduced penalty
398
+ for issue, detail, _, position_ratio in analysis.issues:
399
+ position_weight = 1.0 - (position_ratio * 0.7) # 1.0 at start, 0.3 at end
400
+
401
  if issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
402
+ score -= self.config.pronoun_penalty * position_weight
403
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
404
+ score -= self.config.reference_penalty * position_weight
405
  elif issue == ContextIssue.MID_ARGUMENT_START:
406
+ score -= self.config.mid_argument_penalty # Full penalty (always at start)
407
  elif issue == ContextIssue.INCOMPLETE_THOUGHT:
408
+ score -= 0.10 # End issues don't need position weighting
409
+ elif issue == ContextIssue.MISSING_QUESTION:
410
+ score -= 0.15 * position_weight
411
 
412
  # Apply bonuses
413
  for strength, detail, _ in analysis.strengths:
 
419
  score += self.config.cause_effect_bonus
420
  elif strength == ContextStrength.QUESTION_ANSWER_PAIR:
421
  score += self.config.question_answer_bonus
422
+ elif strength == ContextStrength.SELF_CONTAINED_CLAIM:
423
+ score += 0.10
424
 
425
  return max(0.0, min(1.0, score))
426
 
 
428
  """Calculate how clear the context is."""
429
  score = 1.0
430
 
431
+ # Heavy penalty for mid-argument starts, with position weighting
432
+ for issue, _, _, position_ratio in analysis.issues:
433
+ position_weight = 1.0 - (position_ratio * 0.7)
434
+
435
  if issue == ContextIssue.MID_ARGUMENT_START:
436
+ score -= 0.30 # Full penalty (always at start)
437
  elif issue == ContextIssue.PRONOUN_WITHOUT_ANTECEDENT:
438
+ score -= 0.15 * position_weight
439
  elif issue == ContextIssue.REFERENCE_TO_EARLIER:
440
+ score -= 0.25 * position_weight
441
+ elif issue == ContextIssue.MISSING_QUESTION:
442
+ score -= 0.20
443
 
444
  # Bonus for explicit context
445
  for strength, _, _ in analysis.strengths:
 
450
 
451
  return max(0.0, min(1.0, score))
452
 
453
+ def _calculate_fixability_score(self, analysis: ContextAnalysis) -> float:
454
+ """
455
+ Calculate how easy it is to fix context issues.
456
+
457
+ Hard issues: MID_ARGUMENT_START, REFERENCE_TO_EARLIER (require expansion)
458
+ Easy issues: PRONOUN_WITHOUT_ANTECEDENT, MISSING_QUESTION (can add intro)
459
+ """
460
+ if not analysis.issues:
461
+ return 1.0
462
+
463
+ # Categorize issues by difficulty
464
+ hard_issues = {
465
+ ContextIssue.MID_ARGUMENT_START,
466
+ ContextIssue.REFERENCE_TO_EARLIER,
467
+ ContextIssue.ASSUMES_PRIOR_KNOWLEDGE,
468
+ }
469
+ easy_issues = {
470
+ ContextIssue.PRONOUN_WITHOUT_ANTECEDENT,
471
+ ContextIssue.MISSING_QUESTION,
472
+ ContextIssue.INCOMPLETE_THOUGHT,
473
+ }
474
+
475
+ hard_count = sum(1 for issue, _, _, _ in analysis.issues if issue in hard_issues)
476
+ easy_count = sum(1 for issue, _, _, _ in analysis.issues if issue in easy_issues)
477
+ total = hard_count + easy_count
478
+
479
+ if total == 0:
480
+ return 1.0
481
+
482
+ # Fixability: 1.0 if all easy, 0.0 if all hard
483
+ # Easy issues contribute 0.8 to fixability, hard issues contribute 0.2
484
+ fixability = (easy_count * 0.8 + hard_count * 0.2) / total
485
+
486
+ return max(0.0, min(1.0, fixability))
487
+
488
  def _looks_like_answer(self, text: str) -> bool:
489
  """Check if text looks like an answer to a question."""
490
  text_lower = text.lower().strip()
 
551
  # Check if expansion is needed
552
  needs_expansion = False
553
 
554
+ for issue, _, _, _ in analysis.issues:
555
  if issue == ContextIssue.MID_ARGUMENT_START and self.config.expand_on_mid_argument:
556
  needs_expansion = True
557
  elif issue == ContextIssue.MISSING_QUESTION and self.config.expand_on_missing_question:
 
638
  return best_start
639
 
640
  def _split_sentences(self, text: str) -> List[str]:
641
+ """Split text into sentences with fallback for unpunctuated transcripts."""
642
  # Simple sentence splitter
643
  sentences = re.split(r'(?<=[.!?])\s+', text)
644
+ sentences = [s.strip() for s in sentences if s.strip()]
645
+
646
+ # Fallback: if only one "sentence" and it's very long (likely unpunctuated transcript),
647
+ # treat the first ~200 chars as the "first sentence" for analysis purposes
648
+ if len(sentences) == 1 and len(sentences[0]) > 200:
649
+ # Try to find a natural break point (comma, conjunction)
650
+ first_chunk = sentences[0][:200]
651
+ break_match = re.search(r'[,;]\s+|\s+(?:and|but|so|because)\s+', first_chunk[100:])
652
+ if break_match:
653
+ cut_point = 100 + break_match.start()
654
+ sentences = [sentences[0][:cut_point].strip(), sentences[0][cut_point:].strip()]
655
+ else:
656
+ sentences = [first_chunk.strip()]
657
+
658
+ return sentences
659
 
660
  def score_clip_for_reranking(
661
  self,
 
690
 
691
  # Don't penalize too harshly - context issues can be fixed
692
  # But do reward good context
693
+ # Use fixability to soften penalties for easily-fixable clips
694
  if context_factor >= 0.7:
695
  # Good context: slight boost
696
  adjustment = 1.0 + (context_factor - 0.7) * 0.3 # Up to 1.09x
 
698
  # Okay context: neutral
699
  adjustment = 1.0
700
  else:
701
+ # Poor context: penalize, but less if fixable
702
+ base_penalty = 0.8 + context_factor * 0.4 # 0.8x to 1.0x
703
+ # Fixable clips get softer penalty
704
+ fixability_boost = analysis.fixability_score * 0.15 # Up to +0.15
705
+ adjustment = min(1.0, base_penalty + fixability_boost)
706
 
707
  adjusted_score = original_hype_score * adjustment
708