Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

9908f05

verified ·

1 Parent(s): 78c94b5

Upload app.py

Browse files

Files changed (1) hide show

app.py +82 -6

app.py CHANGED Viewed

@@ -349,8 +349,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                                 # Just remove the orphaned first line, don't add a speaker
                                 generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
-        # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
         # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
         # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
@@ -371,7 +373,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
             'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
             'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
-            'again', 'government', 'honour', 'light', 'stands', 'fly'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
@@ -421,7 +428,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
         # Handle cases where a word got split into multiple parts
-        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly']
         for word in multi_split_words:
             word_lower = word.lower()
             # Create pattern for word split into individual letters with spaces
@@ -489,9 +496,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             # Fix "ag a in" -> "again" (multiple splits)
             (r'\bag\s+a\s+in\b', 'again'),
             (r'\bAg\s+a\s+in\b', 'Again'),
-            # Fix "ag a in" -> "again" (two-part split)
-            (r'\bag\s+a\s+in\b', 'again'),
-            (r'\bAg\s+a\s+in\b', 'Again'),
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
@@ -588,6 +609,61 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         generated_text = '\n'.join(normalized_lines)
         # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
         # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
         lines = generated_text.split('\n')

                                 # Just remove the orphaned first line, don't add a speaker
                                 generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
+        # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
+        # Also fix single letter + capital word (e.g., "AOr" -> "A Or")
+        generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
         # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
         # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
             'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
             'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
             'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
+            'again', 'government', 'honour', 'light', 'stands', 'fly', 'mighty', 'forth',
+            'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'there',
+            'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
+            'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
+            'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
+            'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
         # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
         # Handle cases where a word got split into multiple parts
+        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little']
         for word in multi_split_words:
             word_lower = word.lower()
             # Create pattern for word split into individual letters with spaces
             # Fix "ag a in" -> "again" (multiple splits)
             (r'\bag\s+a\s+in\b', 'again'),
             (r'\bAg\s+a\s+in\b', 'Again'),
+            # Fix "UN TO" -> "UNTO" (before Fix 3c processes it)
+            (r'\bUN\s+TO\b', 'UNTO'),
+            (r'\bun\s+to\b', 'unto'),
+            # Fix potential word issues
+            (r'\bcoronured\b', 'crowned'),  # "coronured" -> "crowned"
+            (r'\beyuls\b', 'evils'),  # "eyuls" -> "evils"
+            # Fix "AOr" -> "A Or" or "Or" (if it's at start of sentence)
+            (r'\bAOr\b', 'A Or'),
+            (r'^A Or\s+', 'Or '),  # If "A Or" is at start, might just be "Or"
+            # Fix "fe at" -> "feat"
+            (r'\bfe\s+at\b', 'feat'),
+            (r'\bFe\s+at\b', 'Feat'),
+            # Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
+            (r'\bTH\s+AN\b', 'THAN'),
+            (r'\bth\s+an\b', 'than'),
+            # NOTE: "stuff'd" is valid Shakespearean spelling, so it is deliberately
+            # left unchanged (no "stuff'd" -> "stuffed" rewrite is applied).
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
         generated_text = '\n'.join(normalized_lines)
+        # Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
+        # Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
+        # Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."
+        #           "THERE SHOULD RUE:" -> "THERE SHOULD RUE."
+        #           "UN TO THE LADY GREY:" -> "UNTO THE LADY GREY."
+        # These are usually long phrases (3+ words) that don't look like character names
+        lines = generated_text.split('\n')
+        fixed_dialogue_lines = []
+        # Known speaker names (keep these as speakers)
+        known_speakers = ['BAPTISTA', 'GLOUCESTER', 'CLARENCE', 'ROMEO', 'JULIET', 'HAMLET', 'MACBETH',
+                         'KING', 'QUEEN', 'DUKE', 'PRINCE', 'LADY', 'FIRST', 'SECOND', 'THIRD',
+                         'CITIZEN', 'GENTLEMAN', 'SERVANT', 'MENENIUS', 'COMINIUS', 'CORIOLANUS',
+                         'VINCENTIO', 'ANGELO', 'ISABELLA', 'OTHELLO', 'DESDEMONA', 'IAGO']
+        for i, line in enumerate(lines):
+            line_stripped = line.strip()
+            # Check if line looks like all-caps speaker but is actually dialogue
+            # Pattern: All caps, ends with colon
+            if re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped):
+                words = line_stripped.split()
+                speaker_name = words[0] if words else ''
+                # Check if it's a known speaker name (1-2 words, known name)
+                is_known_speaker = (len(words) <= 2 and speaker_name in known_speakers) or \
+                                  (len(words) == 2 and words[0] in ['FIRST', 'SECOND', 'THIRD'] and words[1] in ['CITIZEN', 'GENTLEMAN', 'SERVANT'])
+                if is_known_speaker:
+                    # Keep as speaker name
+                    fixed_dialogue_lines.append(line)
+                # If it has 3+ words, it's likely dialogue, not a speaker name
+                elif len(words) >= 3:
+                    # Convert colon to period (dialogue ending)
+                    dialogue = line_stripped[:-1] + '.'  # Remove colon, add period
+                    fixed_dialogue_lines.append(dialogue)
+                # Also check if it contains common dialogue words (not speaker names)
+                elif any(word in ['ARE', 'YOUR', 'HONOUR', 'TO', 'ENTER', 'SHOULD', 'RUE', 'THE', 'GREY', 'HENCE', 'THERE', 'UN', 'UNTIL', 'UNTO', 'MORE', 'THAN', 'HALF', 'TH', 'AN'] for word in words):
+                    # Likely dialogue, not speaker
+                    dialogue = line_stripped[:-1] + '.'  # Remove colon, add period
+                    fixed_dialogue_lines.append(dialogue)
+                # Special case: Single letter "A:" is likely dialogue or incomplete, not a speaker
+                elif len(words) == 1 and words[0] == 'A':
+                    # Convert to dialogue
+                    fixed_dialogue_lines.append('A.')
+                # Special case: "MORE THAN HALF:" is dialogue, not speaker
+                elif 'MORE' in words and 'THAN' in words:
+                    dialogue = line_stripped[:-1] + '.'  # Remove colon, add period
+                    fixed_dialogue_lines.append(dialogue)
+                else:
+                    # Keep as speaker name (might be a short unknown character name)
+                    fixed_dialogue_lines.append(line)
+            else:
+                fixed_dialogue_lines.append(line)
+        generated_text = '\n'.join(fixed_dialogue_lines)
         # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
         # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
         lines = generated_text.split('\n')