Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Community

shwethd committed on Nov 14, 2025

Commit

6b1dee4

verified ·

1 Parent(s): dfe500d

Upload app.py

Browse files

Files changed (1) hide show

app.py +50 -16

app.py CHANGED Viewed

@@ -584,16 +584,20 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
         # Handle mixed case speaker names that should be all caps
         lines = generated_text.split('\n')
         normalized_lines = []
         for i, line in enumerate(lines):
             line_stripped = line.strip()
             # Check if line is a potential speaker name (title case or mixed case, 2+ words)
-            # Pattern: "Romeo and juliet", "Romeo And Juliet", etc.
             speaker_pattern = r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+)\s*:?\s*$'
             match = re.match(speaker_pattern, line_stripped)
             if match:
                 # Check if next line is dialogue (not another speaker)
                 is_speaker = False
@@ -610,6 +614,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                     speaker_name = match.group(1).upper()
                     normalized_lines.append(speaker_name + ':')
                     continue
             normalized_lines.append(line)
@@ -671,25 +683,25 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         generated_text = '\n'.join(fixed_dialogue_lines)
         # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
-        # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
         # Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
         lines = generated_text.split('\n')
         cleaned_lines = []
-        speaker_history = []  # Track recent speakers with their line numbers
         for i, line in enumerate(lines):
             line_stripped = line.strip()
-            # Check if this line is a speaker name
             speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if speaker_match:
                 speaker = speaker_match.group(1).strip()
                 speaker_upper = speaker.upper()  # For case-insensitive comparison
-                # Check if this speaker appeared recently (within last 3 lines - more aggressive)
                 # Check both exact match and case-insensitive match
                 recent_speaker = False
-                for hist_speaker, hist_line_num in speaker_history[-3:]:
                     hist_speaker_upper = hist_speaker.upper()
                     if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
                         recent_speaker = True
@@ -699,10 +711,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                     # Skip this duplicate speaker
                     continue
-                # Add to history
-                speaker_history.append((speaker, i))
-                # Keep only last 10 speakers in history
-                if len(speaker_history) > 10:
                     speaker_history.pop(0)
                 cleaned_lines.append(line)
@@ -755,12 +767,34 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
         # Handle both exact duplicates and case-insensitive duplicates
-        generated_text = re.sub(
-            r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
-            r'\1:\n',
-            generated_text,
-            flags=re.MULTILINE
-        )
         # Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
         # Pattern: word followed by same word (case-insensitive)

         # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
         # Handle mixed case speaker names that should be all caps
+        # Also handle "First Citizen:" -> "FIRST CITIZEN:"
         lines = generated_text.split('\n')
         normalized_lines = []
         for i, line in enumerate(lines):
             line_stripped = line.strip()
             # Check if line is a potential speaker name (title case or mixed case, 2+ words)
+            # Pattern: "Romeo and juliet", "Romeo And Juliet", "First Citizen", etc.
             speaker_pattern = r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+)\s*:?\s*$'
             match = re.match(speaker_pattern, line_stripped)
+            # Also check for all-caps speaker names (already normalized)
+            all_caps_speaker = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if match:
                 # Check if next line is dialogue (not another speaker)
                 is_speaker = False
                     speaker_name = match.group(1).upper()
                     normalized_lines.append(speaker_name + ':')
                     continue
+            elif all_caps_speaker:
+                # Already all caps, just ensure it has colon
+                speaker_name = all_caps_speaker.group(1).strip()
+                if not line_stripped.endswith(':'):
+                    normalized_lines.append(speaker_name + ':')
+                else:
+                    normalized_lines.append(line)
+                continue
             normalized_lines.append(line)
         generated_text = '\n'.join(fixed_dialogue_lines)
         # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
+        # More aggressive: remove same speaker if it appears within 5 lines (expanded window for empty lines)
         # Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
         lines = generated_text.split('\n')
         cleaned_lines = []
+        speaker_history = []  # Track recent speakers with their line numbers (case-insensitive)
         for i, line in enumerate(lines):
             line_stripped = line.strip()
+            # Check if this line is a speaker name (all caps after normalization)
             speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
             if speaker_match:
                 speaker = speaker_match.group(1).strip()
                 speaker_upper = speaker.upper()  # For case-insensitive comparison
+                # Check if this speaker appeared recently (within last 5 lines - expanded for empty lines)
                 # Check both exact match and case-insensitive match
                 recent_speaker = False
+                for hist_speaker, hist_line_num in speaker_history[-5:]:  # Check last 5 speakers
                     hist_speaker_upper = hist_speaker.upper()
                     if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
                         recent_speaker = True
                     # Skip this duplicate speaker
                     continue
+                # Add to history (store uppercase version for consistent comparison)
+                speaker_history.append((speaker_upper, i))
+                # Keep only last 15 speakers in history (expanded)
+                if len(speaker_history) > 15:
                     speaker_history.pop(0)
                 cleaned_lines.append(line)
         # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
         # Handle both exact duplicates and case-insensitive duplicates
+        # This handles cases like "FIRST CITIZEN:\n\nFIRST CITIZEN:" -> "FIRST CITIZEN:"
+        lines = generated_text.split('\n')
+        final_cleaned_lines = []
+        last_speaker_upper = None
+        for i, line in enumerate(lines):
+            line_stripped = line.strip()
+            speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
+            if speaker_match:
+                speaker = speaker_match.group(1).strip()
+                speaker_upper = speaker.upper()
+                # If this is the same speaker as the last one (case-insensitive), skip it
+                if speaker_upper == last_speaker_upper:
+                    continue
+                last_speaker_upper = speaker_upper
+                final_cleaned_lines.append(line)
+            else:
+                # Reset speaker tracking on non-speaker lines (but keep last_speaker for nearby duplicates)
+                # Only reset if we have substantial content (not just empty lines)
+                if line_stripped:  # Non-empty line
+                    # Keep last_speaker for a few lines in case of empty lines
+                    pass
+                final_cleaned_lines.append(line)
+        generated_text = '\n'.join(final_cleaned_lines)
         # Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
         # Pattern: word followed by same word (case-insensitive)