Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

82f907e

verified ·

1 Parent(s): c037b52

Upload app.py

Browse files

Files changed (1) hide show

app.py +49 -9

app.py CHANGED Viewed

@@ -361,17 +361,36 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
-        # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This"
         # Remove spaces in the middle of common words
-        common_words_fix = ['further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our', 'your']
         for word in common_words_fix:
-            # Pattern: word split incorrectly (e.g., "furt her", "T his")
-            pattern = r'\b' + word[0] + r'\s+' + word[1:] + r'\b'
-            generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
-            # Also handle reversed (less common)
-            if len(word) > 3:
-                pattern2 = r'\b' + word[:-1] + r'\s+' + word[-1] + r'\b'
-                generated_text = re.sub(pattern2, word, generated_text, flags=re.IGNORECASE)
         # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
         # Add space before common words that might have been merged
@@ -387,6 +406,27 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
         generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
         # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
         # Add space after contractions before lowercase words
         contractions = ["'ll", "'ve", "'re", "'d", "'t", "'s", "'m"]

         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
+        # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your"
         # Remove spaces in the middle of common words
+        common_words_fix = [
+            'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
+            'man', 'men', 'woman', 'women', 'padua', 'padua', 'content', 'gentle', 'gently',
+            'house', 'neck', 'car', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
+            'well', 'newly', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
+            'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
+            'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
+            'you', 'me', 'my', 'his', 'hers', 'its', 'our', 'ours', 'yours', 'theirs',
+            'into', 'onto', 'upon', 'within', 'without', 'through', 'though', 'although',
+            'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
+            'after', 'while', 'until', 'since', 'because', 'though', 'although'
+        ]
         for word in common_words_fix:
+            # Pattern: word split incorrectly (e.g., "furt her", "T his", "y our", "a m an", "Padu a")
+            # Handle split at any position
+            word_lower = word.lower()
+            for i in range(1, len(word_lower)):
+                # Split at position i: first part + space + second part
+                first_part = word_lower[:i]
+                second_part = word_lower[i:]
+                # Pattern: word split at this position (case insensitive)
+                pattern = r'\b' + first_part + r'\s+' + second_part + r'\b'
+                generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
+                # Also handle with capital letters (e.g., "Padu a" -> "Padua")
+                pattern_cap = r'\b' + first_part.capitalize() + r'\s+' + second_part + r'\b'
+                generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
+                pattern_all_cap = r'\b' + first_part.upper() + r'\s+' + second_part.upper() + r'\b'
+                generated_text = re.sub(pattern_all_cap, word.upper(), generated_text)
         # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
         # Add space before common words that might have been merged
         # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
         generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
+        # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to")
+        # Common patterns where words got merged incorrectly
+        # Pattern: pronoun + "t" (likely "to" got merged)
+        merged_fixes = [
+            (r'\bhimt\s+', 'him to '),  # "himt me" -> "him to me"
+            (r'\bhert\s+', 'her to '),  # "hert him" -> "her to him"
+            (r'\bthemt\s+', 'them to '),  # "themt us" -> "them to us"
+            (r'\byout\s+', 'you to '),  # "yout me" -> "you to me"
+            (r'\bhimt([,.;:!?])', r'him to\1'),  # "himt," -> "him to,"
+            (r'\bhert([,.;:!?])', r'her to\1'),
+            (r'\bthemt([,.;:!?])', r'them to\1'),
+            (r'\byout([,.;:!?])', r'you to\1'),
+        ]
+        for pattern, replacement in merged_fixes:
+            generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
+        # Fix 2f: Split the merged form "contenton" into "content on"
+        # Text that already reads "content on" (with a space) does not match and is left unchanged
+        # Matching is case-insensitive; the replacement is always lowercase "content on"
+        generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
         # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
         # Add space after contractions before lowercase words
         contractions = ["'ll", "'ve", "'re", "'d", "'t", "'s", "'m"]