Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

c673f73

verified ·

1 Parent(s): 5becd16

Upload app.py

Browse files

Files changed (1) hide show

app.py +50 -83

app.py CHANGED Viewed

@@ -369,13 +369,19 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
         # Keep removing orphaned dialogue at the start until we find a speaker or valid content
-        while generated_text.strip():
-            lines = generated_text.split('\n')
-            first_line = lines[0].strip() if lines else ''
             if not first_line:
                 # Remove empty first line
-                generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
                 continue
             # Check if first line is a speaker name
@@ -389,76 +395,51 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             # Check if it's orphaned dialogue (starts with capital, has punctuation, but no speaker)
             if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
                 # Remove the orphaned first line
-                generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
             else:
                 # Not clearly orphaned dialogue, stop removing
                 break
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
         # Also fix single letter + capital word (e.g., "AOr" -> "A Or")
         generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
         # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
-        # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
         common_words_fix = [
             'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
-            'man', 'men', 'woman', 'women', 'padua', 'content', 'gentle', 'gently',
-            'house', 'neck', 'car', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
-            'well', 'newly', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
             'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
             'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
-            'you', 'me', 'my', 'his', 'hers', 'its', 'our', 'ours', 'yours', 'theirs',
-            'into', 'onto', 'upon', 'within', 'without', 'through', 'though', 'although',
-            'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
-            'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
-            'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
-            'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
-            'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
-            'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
-            'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
-            'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
-            'again', 'government', 'honour', 'light', 'stands', 'fly', 'mighty', 'forth',
-            'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'there',
-            'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
-            'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
-            'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
-            'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great',
-            'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'worthy', 'call', 'rod',
-            'respect', 'drunk', 'there', 'signior', 'gremio', 'compound', 'soft', 'unvish',
-            'know', 'edward'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
-            # Try all possible split positions
-            for i in range(1, len(word_lower)):
                 first_part = word_lower[:i]
                 second_part = word_lower[i:]
-                # Pattern 1: lowercase split (e.g., "furt her" -> "further", "th at" -> "that")
-                # Use word boundaries but also allow punctuation/whitespace around
-                pattern1 = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
-                generated_text = re.sub(pattern1, word, generated_text, flags=re.IGNORECASE)
-                # Pattern 2: Capital first letter (e.g., "Th at" -> "That")
-                pattern2 = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
-                generated_text = re.sub(pattern2, word.capitalize(), generated_text)
-                # Pattern 3: All caps (e.g., "TH AT" -> "THAT")
-                pattern3 = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
-                generated_text = re.sub(pattern3, word.upper(), generated_text)
-                # Pattern 4: Mixed case - first letter capitalized (e.g., "Th at" -> "That")
-                if len(first_part) > 0:
-                    pattern4 = r'\b' + re.escape(first_part[0].upper() + first_part[1:]) + r'\s+' + re.escape(second_part) + r'\b'
-                    generated_text = re.sub(pattern4, word.capitalize(), generated_text, flags=re.IGNORECASE)
-                # Pattern 5: Handle multiple splits in one word (e.g., "c o u n t" -> "count")
-                # This is a special case for words that got split multiple times
-                if len(word_lower) > 4:  # Only for longer words
-                    # Try to find pattern like "c o u n t" or "y o u r"
-                    # This is more complex, so we'll handle it separately
-                    pass
         # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
         # Add space before common words that might have been merged
@@ -474,43 +455,29 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
         generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
-        # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
-        # Handle cases where a word got split into multiple parts
-        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great', 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'call', 'rod', 'respect', 'drunk', 'signior', 'gremio', 'compound', 'soft', 'unvish', 'know', 'edward', 'man', 'men']
         for word in multi_split_words:
             word_lower = word.lower()
-            # Create pattern for word split into individual letters with spaces
-            # e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at" or "y our"
             if len(word_lower) > 2:
-                # Pattern 1: letter space letter space ... (all letters of the word split individually)
-                letters = list(word_lower)
-                pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
-                pattern_parts.append(re.escape(letters[-1]))
-                pattern = r'\b' + ''.join(pattern_parts) + r'\b'
-                generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
-                # Also handle with some capitalization (e.g., "T h is" -> "This", "Wh at" -> "What")
-                pattern_cap = r'\b' + re.escape(letters[0].upper()) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[1:-1]]) + re.escape(letters[-1]) + r'\b'
-                generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
-                # Handle mixed case like "Wh at" -> "What"
-                if len(letters) > 2:
-                    # Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
-                    pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
-                    generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
-                # Pattern 2: Handle two-part splits (e.g., "y our" -> "your", "h onour" -> "honour")
-                # Try all possible two-part splits
-                for split_pos in range(1, len(word_lower)):
                     first_part = word_lower[:split_pos]
                     second_part = word_lower[split_pos:]
-                    # Pattern: "y our" -> "your"
                     pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
                     generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
-                    # Capitalized version: "Y our" -> "Your"
-                    pattern_2part_cap = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
-                    generated_text = re.sub(pattern_2part_cap, word.capitalize(), generated_text)
-                    # All caps: "Y OUR" -> "YOUR"
-                    pattern_2part_allcap = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
-                    generated_text = re.sub(pattern_2part_allcap, word.upper(), generated_text)
         # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
         # Common patterns where words got merged incorrectly

         # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
         # Keep removing orphaned dialogue at the start until we find a speaker or valid content
+        # Limit to max 10 iterations to avoid infinite loops
+        lines = generated_text.split('\n')
+        start_idx = 0
+        max_iterations = 10
+        iteration = 0
+        while start_idx < len(lines) and iteration < max_iterations:
+            iteration += 1
+            first_line = lines[start_idx].strip() if start_idx < len(lines) else ''
             if not first_line:
                 # Remove empty first line
+                start_idx += 1
                 continue
             # Check if first line is a speaker name
             # Check if it's orphaned dialogue (starts with capital, has punctuation, but no speaker)
             if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
                 # Remove the orphaned first line
+                start_idx += 1
             else:
                 # Not clearly orphaned dialogue, stop removing
                 break
+        generated_text = '\n'.join(lines[start_idx:])
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
         # Also fix single letter + capital word (e.g., "AOr" -> "A Or")
         generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
         # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
+        # OPTIMIZED: Only process most common split words to reduce computation
+        # Focus on words that are most likely to be split incorrectly
         common_words_fix = [
             'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
+            'man', 'men', 'woman', 'women', 'content', 'gentle', 'gently',
+            'house', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
+            'well', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
             'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
             'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
+            'you', 'me', 'my', 'his', 'into', 'onto', 'upon', 'within', 'without',
+            'together', 'honour', 'honor', 'common', 'complain', 'again', 'apparent'
         ]
+        # Build and apply one regex per candidate split (patterns are built inline, not pre-compiled; at most 3 splits per word)
         for word in common_words_fix:
             word_lower = word.lower()
+            # Try the middle split for words longer than 2 letters, plus the quarter and three-quarter splits for words longer than 4; shorter words are skipped
+            split_positions = []
+            if len(word_lower) > 2:
+                split_positions = [len(word_lower) // 2]  # Most common: middle split
+                if len(word_lower) > 4:
+                    split_positions.append(len(word_lower) // 4)
+                    split_positions.append(3 * len(word_lower) // 4)
+            for i in split_positions:
+                if i < 1 or i >= len(word_lower):
+                    continue
                 first_part = word_lower[:i]
                 second_part = word_lower[i:]
+                # Combined pattern with case-insensitive flag (more efficient)
+                pattern = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
+                generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
         # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
         # Add space before common words that might have been merged
         # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
         generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
+        # Fix 1c: Fix multiple splits in one word - OPTIMIZED: Only handle most common cases
+        # Focus on very common words that are most likely to be split
+        multi_split_words = ['count', 'your', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were',
+                            'been', 'have', 'has', 'had', 'will', 'this', 'that', 'there', 'where', 'here',
+                            'their', 'what', 'common', 'complain', 'honour', 'honor', 'again', 'apparent']
         for word in multi_split_words:
             word_lower = word.lower()
             if len(word_lower) > 2:
+                # Pattern 1: letter space letter space ... (all letters split individually) - only for short words
+                if len(word_lower) <= 5:
+                    letters = list(word_lower)
+                    pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
+                    pattern_parts.append(re.escape(letters[-1]))
+                    pattern = r'\b' + ''.join(pattern_parts) + r'\b'
+                    generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
+                # Pattern 2: Handle two-part splits - only try most common split (middle)
+                split_pos = len(word_lower) // 2
+                if split_pos > 0 and split_pos < len(word_lower):
                     first_part = word_lower[:split_pos]
                     second_part = word_lower[split_pos:]
                     pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
                     generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
         # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
         # Common patterns where words got merged incorrectly