Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

dfe500d

verified ·

1 Parent(s): 9908f05

Upload app.py

Browse files

Files changed (1) hide show

app.py +29 -1

app.py CHANGED Viewed

@@ -513,6 +513,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             (r'\bth\s+an\b', 'than'),
             # Note: "stuff'd" is NOT rewritten to "stuffed" - it is a valid Shakespearean
             # spelling, so it is deliberately left unchanged.
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
@@ -666,6 +672,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
         # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
         lines = generated_text.split('\n')
         cleaned_lines = []
         speaker_history = []  # Track recent speakers with their line numbers
@@ -677,11 +684,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             if speaker_match:
                 speaker = speaker_match.group(1).strip()
                 # Check if this speaker is among the last 3 entries of speaker_history (tight window - more aggressive)
                 recent_speaker = False
                 for hist_speaker, hist_line_num in speaker_history[-3:]:
-                    if speaker == hist_speaker:
                         recent_speaker = True
                         break
@@ -744,6 +754,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
         # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
         generated_text = re.sub(
             r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
             r'\1:\n',
@@ -751,6 +762,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             flags=re.MULTILINE
         )
         # Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
         # This happens when the model hits the token limit mid-generation
         if generated_text.strip():

             (r'\bth\s+an\b', 'than'),
             # Note: "stuff'd" is NOT rewritten to "stuffed" - it is a valid Shakespearean
             # spelling, so it is deliberately left unchanged.
+            # Fix duplicate words: "if it be it possible" -> "if it be possible"
+            (r'\bif it be it\b', 'if it be'),
+            (r'\bIf it be it\b', 'If it be'),
+            # Fix duplicate "belike" -> remove one
+            (r'\bbelike\s+that\s+you\s+were\s+right\s+gentle\s+exercise,\s+belike\b', 'belike that you were right gentle exercise'),
+            (r'\bBelike\s+that\s+you\s+were\s+right\s+gentle\s+exercise,\s+belike\b', 'Belike that you were right gentle exercise'),
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
         # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
         # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
+        # Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
         lines = generated_text.split('\n')
         cleaned_lines = []
         speaker_history = []  # Track recent speakers with their line numbers
             if speaker_match:
                 speaker = speaker_match.group(1).strip()
+                speaker_upper = speaker.upper()  # For case-insensitive comparison
                 # Check if this speaker is among the last 3 entries of speaker_history (tight window - more aggressive)
+                # Check both exact match and case-insensitive match
                 recent_speaker = False
                 for hist_speaker, hist_line_num in speaker_history[-3:]:
+                    hist_speaker_upper = hist_speaker.upper()
+                    if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
                         recent_speaker = True
                         break
         generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
         # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
+        # Handle both exact duplicates and case-insensitive duplicates
         generated_text = re.sub(
             r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
             r'\1:\n',
             flags=re.MULTILINE
         )
+        # Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
+        # Pattern: word followed by same word (case-insensitive)
+        # But be careful not to remove valid repetitions like "very very" or "more more"
+        # Only remove common function words that shouldn't repeat
+        duplicate_word_patterns = [
+            (r'\b(it)\s+\1\b', r'\1'),  # "it it" -> "it"
+            (r'\b(the)\s+\1\b', r'\1'),  # "the the" -> "the"
+            (r'\b(a)\s+\1\b', r'\1'),  # "a a" -> "a"
+            (r'\b(an)\s+\1\b', r'\1'),  # "an an" -> "an"
+            (r'\b(is)\s+\1\b', r'\1'),  # "is is" -> "is"
+            (r'\b(was)\s+\1\b', r'\1'),  # "was was" -> "was"
+            (r'\b(are)\s+\1\b', r'\1'),  # "are are" -> "are"
+            (r'\b(be)\s+\1\b', r'\1'),  # "be be" -> "be"
+        ]
+        for pattern, replacement in duplicate_word_patterns:
+            generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
         # Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
         # This happens when the model hits the token limit mid-generation
         if generated_text.strip():