Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

3bc9884

verified ·

1 Parent(s): 1e393db

Upload app.py

Browse files

Files changed (1) hide show

app.py +128 -38

app.py CHANGED Viewed

@@ -341,28 +341,19 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                     lines = generated_text.split('\n')
                     if lines and lines[0].strip():
                         first_line = lines[0].strip()
-                        # If first line is not a speaker name and looks like dialogue, add a speaker
                         if not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
                             # Check if it's dialogue-like (starts with capital, has punctuation)
                             if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
-                                # Add a generic speaker name based on the prompt context
-                                # For story prompts like "Romeo and Juliet", use a character from the prompt
-                                prompt_words = [w.capitalize() for w in prompt_lower.split() if len(w) > 2]
-                                if len(prompt_words) >= 2:
-                                    # Use first significant word as speaker (e.g., "Romeo" from "Romeo and Juliet")
-                                    speaker_name = prompt_words[0].upper()
-                                else:
-                                    # Generic speaker
-                                    speaker_name = "NARRATOR"
-                                # Add speaker before the dialogue
-                                generated_text = f"{speaker_name}:\n{first_line}\n" + '\n'.join(lines[1:]) if len(lines) > 1 else f"{speaker_name}:\n{first_line}"
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
-        # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "Th at" -> "That"
-        # Remove spaces in the middle of common words
         common_words_fix = [
             'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
             'man', 'men', 'woman', 'women', 'padua', 'content', 'gentle', 'gently',
@@ -375,33 +366,40 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
             'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
             'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
-            'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold'
         ]
         for word in common_words_fix:
-            # Pattern: word split incorrectly (e.g., "furt her", "T his", "y our", "a m an", "Padu a", "Th at")
-            # Handle split at any position, including with capital letters
             word_lower = word.lower()
             for i in range(1, len(word_lower)):
-                # Split at position i: first part + space + second part
                 first_part = word_lower[:i]
                 second_part = word_lower[i:]
-                # Pattern 1: lowercase split (e.g., "furt her" -> "further")
-                pattern = r'\b' + first_part + r'\s+' + second_part + r'\b'
-                generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
-                # Pattern 2: Capital letter split (e.g., "Th at" -> "That", "T his" -> "This")
-                pattern_cap = r'\b' + first_part.capitalize() + r'\s+' + second_part + r'\b'
-                generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
-                # Pattern 3: All caps split (e.g., "TH AT" -> "THAT")
-                pattern_all_cap = r'\b' + first_part.upper() + r'\s+' + second_part.upper() + r'\b'
-                generated_text = re.sub(pattern_all_cap, word.upper(), generated_text)
-                # Pattern 4: Mixed case with capital in first part (e.g., "Th at" -> "That")
                 if len(first_part) > 0:
-                    pattern_mixed = r'\b' + first_part[0].upper() + first_part[1:] + r'\s+' + second_part + r'\b'
-                    generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
         # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
         # Add space before common words that might have been merged
@@ -417,6 +415,29 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
         generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
         # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
         # Common patterns where words got merged incorrectly
         merged_fixes = [
@@ -432,20 +453,28 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             # Other merged patterns
             (r'\bincwold\b', 'in cold'),  # "incwold" -> "in cold"
             (r'\bincold\b', 'in cold'),  # "incold" -> "in cold"
-            (r'\blikeled\b', 'liked'),  # "likeled" -> "liked" (or could be "like led" but "liked" is more common)
             (r'\bh\s+on\s+our\b', 'honour'),  # "h on our" -> "honour"
-            (r'\bh\s+on\s+or\b', 'honor'),  # "h on or" -> "honor" (American spelling)
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
         # Fix 2f: Fix "content on" - this is likely two separate words, but ensure proper spacing
-        # "content on" should stay as "content on" (already correct)
-        # But if it's "contenton" -> "content on"
         generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
-        # Fix 2g: Fix "toget her" -> "together" (but be careful - "get her" is also valid)
-        # Only fix if it's clearly "together" (context-dependent, but "toget her" is likely "together")
         generated_text = re.sub(r'\btoget\s+her\b', 'together', generated_text, flags=re.IGNORECASE)
         # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
@@ -456,7 +485,11 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             pattern = r"(" + re.escape(contraction) + r")([a-z])"
             generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
-        # Fix 3: Add space before character names (all caps words)
         generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
         # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
@@ -577,6 +610,63 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             flags=re.MULTILINE
         )
         return generated_text
     except Exception as e:
         import traceback

                     lines = generated_text.split('\n')
                     if lines and lines[0].strip():
                         first_line = lines[0].strip()
+                        # If first line is not a speaker name and looks like dialogue, just remove it
+                        # Don't add NARRATOR - let the model's natural flow continue
                         if not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
                             # Check if it's dialogue-like (starts with capital, has punctuation)
                             if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
+                                # Just remove the orphaned first line, don't add a speaker
+                                generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
+        # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
+        # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
         common_words_fix = [
             'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
             'man', 'men', 'woman', 'women', 'padua', 'content', 'gentle', 'gently',
             'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
             'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
             'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
+            'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
+            'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she'
         ]
         for word in common_words_fix:
             word_lower = word.lower()
+            # Try all possible split positions
             for i in range(1, len(word_lower)):
                 first_part = word_lower[:i]
                 second_part = word_lower[i:]
+                # Pattern 1: lowercase split (e.g., "furt her" -> "further", "th at" -> "that")
+                # Use word boundaries but also allow punctuation/whitespace around
+                pattern1 = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
+                generated_text = re.sub(pattern1, word, generated_text, flags=re.IGNORECASE)
+                # Pattern 2: Capital first letter (e.g., "Th at" -> "That")
+                pattern2 = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
+                generated_text = re.sub(pattern2, word.capitalize(), generated_text)
+                # Pattern 3: All caps (e.g., "TH AT" -> "THAT")
+                pattern3 = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
+                generated_text = re.sub(pattern3, word.upper(), generated_text)
+                # Pattern 4: Mixed case - first letter capitalized (e.g., "Th at" -> "That")
                 if len(first_part) > 0:
+                    pattern4 = r'\b' + re.escape(first_part[0].upper() + first_part[1:]) + r'\s+' + re.escape(second_part) + r'\b'
+                    generated_text = re.sub(pattern4, word.capitalize(), generated_text, flags=re.IGNORECASE)
+                # Pattern 5: Handle multiple splits in one word (e.g., "c o u n t" -> "count")
+                # This is a special case for words that got split multiple times
+                if len(word_lower) > 4:  # Only for longer words
+                    # Try to find pattern like "c o u n t" or "y o u r"
+                    # This is more complex, so we'll handle it separately
+                    pass
         # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
         # Add space before common words that might have been merged
         # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
         generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
+        # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "T h is" -> "This")
+        # Handle cases where a word got split into multiple parts
+        multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon']
+        for word in multi_split_words:
+            word_lower = word.lower()
+            # Create pattern for word split into individual letters with spaces
+            # e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at"
+            if len(word_lower) > 2:
+                # Pattern: letter space letter space ... (all letters of the word)
+                letters = list(word_lower)
+                pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
+                pattern_parts.append(re.escape(letters[-1]))
+                pattern = r'\b' + ''.join(pattern_parts) + r'\b'
+                generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
+                # Also handle with some capitalization (e.g., "T h is" -> "This", "Wh at" -> "What")
+                pattern_cap = r'\b' + re.escape(letters[0].upper()) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[1:-1]]) + re.escape(letters[-1]) + r'\b'
+                generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
+                # Handle mixed case like "Wh at" -> "What"
+                if len(letters) > 2:
+                    # Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
+                    pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
+                    generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
         # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
         # Common patterns where words got merged incorrectly
         merged_fixes = [
             # Other merged patterns
             (r'\bincwold\b', 'in cold'),  # "incwold" -> "in cold"
             (r'\bincold\b', 'in cold'),  # "incold" -> "in cold"
+            (r'\blikeled\b', 'liked'),  # "likeled" -> "liked"
             (r'\bh\s+on\s+our\b', 'honour'),  # "h on our" -> "honour"
+            (r'\bh\s+on\s+or\b', 'honor'),  # "h on or" -> "honor"
+            (r'\bHapp\s+up\s+on\'t\b', "Happen upon't"),  # "Happ up on't" -> "Happen upon't"
+            (r'\bhapp\s+up\s+on\'t\b', "happen upon't"),
+            # Fix "comm on" -> "common" (if not already fixed)
+            (r'\bcomm\s+on\b', 'common'),
+            (r'\bComm\s+on\b', 'Common'),
+            # Fix "compl a in" -> "complain" (multiple splits)
+            (r'\bcompl\s+a\s+in\b', 'complain'),
+            (r'\bCompl\s+a\s+in\b', 'Complain'),
+            # Fix "As s he" -> "As she"
+            (r'\bAs\s+s\s+he\b', 'As she'),
+            (r'\bas\s+s\s+he\b', 'as she'),
         ]
         for pattern, replacement in merged_fixes:
             generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
         # Fix 2f: Fix "content on" - this is likely two separate words, but ensure proper spacing
         generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
+        # Fix 2g: Fix "toget her" -> "together"
         generated_text = re.sub(r'\btoget\s+her\b', 'together', generated_text, flags=re.IGNORECASE)
         # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
             pattern = r"(" + re.escape(contraction) + r")([a-z])"
             generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
+        # Fix 3: Add space before character names (all caps words) and fix missing punctuation
+        # First, fix cases like "BarnMENENIUS:" -> "Barn. MENENIUS:" (a period is always inserted, never a comma)
+        # Pattern: lowercase word followed immediately by all-caps speaker name
+        generated_text = re.sub(r'([a-z]+)([A-Z]{2,}):', r'\1. \2:', generated_text)
+        # Then add space before character names
         generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
         # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
             flags=re.MULTILINE
         )
+        # Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
+        # This happens when the model hits the token limit mid-generation
+        if generated_text.strip():
+            # Remove incomplete word at the end (word that doesn't end with punctuation or space)
+            # Pattern: ends with a word that has no trailing punctuation/space
+            # But keep if it ends with proper punctuation (. ! ? , ; :)
+            lines = generated_text.split('\n')
+            if lines:
+                last_line = lines[-1].strip()
+                # If last line doesn't end with punctuation and is not a speaker name
+                if last_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', last_line):
+                    # Check if it ends with incomplete word (no punctuation, not a complete sentence)
+                    # Remove if it ends with a word that looks incomplete
+                    # Pattern: ends with word that has no punctuation
+                    if not re.search(r'[.!?,;:]$', last_line):
+                        # Check if the last "word" is very short (likely incomplete)
+                        # Or if it's a single character/letter (likely cut off)
+                        words = last_line.split()
+                        if words:
+                            last_word = words[-1]
+                            # If last word is very short (1-2 chars) and not punctuation, likely incomplete
+                            if len(last_word) <= 2 and last_word.isalpha():
+                                # Remove the incomplete last word
+                                lines[-1] = ' '.join(words[:-1]) if len(words) > 1 else ''
+                            # If last word doesn't end with punctuation and line is short, might be incomplete
+                            elif len(last_line) < 20 and not last_word.endswith(('.', '!', '?', ',', ';', ':')):
+                                # Check if removing last word makes sense
+                                # Only remove if it's clearly incomplete (very short word)
+                                if len(last_word) < 4:
+                                    lines[-1] = ' '.join(words[:-1]) if len(words) > 1 else ''
+                    # If after processing, last line is empty or just whitespace, remove it
+                    if not lines[-1].strip():
+                        lines = lines[:-1]
+                # Reconstruct text
+                generated_text = '\n'.join(lines)
+                # Final pass: whenever any text remains (regardless of how it ends),
+                # try to keep only the complete sentences and drop whatever trails the last one
+                if generated_text.strip():
+                    # Find the last complete sentence (ends with . ! ?)
+                    # Split by sentences
+                    sentences = re.split(r'([.!?]+)', generated_text)
+                    if len(sentences) > 1:
+                        # Reconstruct, keeping only complete sentences
+                        complete_text = ''
+                        for i in range(0, len(sentences) - 1, 2):
+                            if i + 1 < len(sentences):
+                                complete_text += sentences[i] + sentences[i + 1]
+                        # If we have complete sentences, use them; otherwise keep original
+                        if complete_text.strip():
+                            # Only accept the trimmed text if it keeps more than 30% of the original length
+                            if len(complete_text.strip()) > len(generated_text.strip()) * 0.3:
+                                generated_text = complete_text.strip()
         return generated_text
     except Exception as e:
         import traceback