Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

2485390

verified ·

1 Parent(s): 6b1dee4

Upload app.py

Browse files

Files changed (1) hide show

app.py +51 -28

app.py CHANGED Viewed

@@ -320,34 +320,41 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
         import re
         # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
-        # This handles cases where user enters "Romeo and Juliet" and model treats it as speaker
-        prompt_lower = prompt.lower().strip()
-        generated_lower = generated_text.lower()
-        # If prompt appears at the very start and looks like it was treated as a speaker
-        if generated_lower.startswith(prompt_lower):
-            # Check if it's followed by a newline (speaker format) or dialogue
-            prompt_len = len(prompt)
-            if len(generated_text) > prompt_len:
-                next_chars = generated_text[prompt_len:prompt_len+5].strip()
-                # If prompt is followed by newline or colon-like pattern, it was treated as speaker
-                if not next_chars or ':' in next_chars or '\n' in generated_text[prompt_len:prompt_len+5]:
-                    # Remove the prompt from output (it's the input, not part of generated story)
-                    generated_text = generated_text[len(prompt):].strip()
-                    # Remove leading newlines/colons
-                    generated_text = re.sub(r'^[\s:]+', '', generated_text)
-                    # Check if the first line after removal is orphaned dialogue (no speaker)
                     lines = generated_text.split('\n')
-                    if lines and lines[0].strip():
-                        first_line = lines[0].strip()
-                        # If first line is not a speaker name and looks like dialogue, just remove it
-                        # Don't add NARRATOR - let the model's natural flow continue
-                        if not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
-                            # Check if it's dialogue-like (starts with capital, has punctuation)
-                            if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
-                                # Just remove the orphaned first line, don't add a speaker
-                                generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
@@ -511,6 +518,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
             # Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
             (r'\bTH\s+AN\b', 'THAN'),
             (r'\bth\s+an\b', 'than'),
             # Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
             # Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
             # Fix duplicate words: "if it be it possible" -> "if it be possible"
@@ -560,9 +576,16 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
                         break
                 if not is_known:
-                    # Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
                     # Remove spaces between all-caps words before colon
                     merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
                     # Only use merged if it makes sense (not too long, looks like a word)
                     if len(merged) < 30:  # Reasonable speaker name length
                         fixed_lines.append(merged)

         import re
         # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
+        # This handles cases where user enters "First Citizen:" and model repeats it
+        prompt_stripped = prompt.strip().replace(':', '').strip()
+        lines = generated_text.split('\n')
+        if lines:
+            first_line = lines[0].strip()
+            # Normalize both prompt and first line for comparison (remove colons, case-insensitive)
+            first_line_normalized = first_line.replace(':', '').strip().upper()
+            prompt_normalized = prompt_stripped.upper()
+            # If first line matches the prompt (case-insensitive, allowing for colon)
+            if first_line_normalized == prompt_normalized:
+                # Remove the first line (it's the prompt, not generated content)
+                generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
+                # Also check if the next line is also the same speaker (duplicate)
+                if generated_text.strip():
                     lines = generated_text.split('\n')
+                    next_line = lines[0].strip() if lines else ''
+                    if next_line:
+                        next_line_normalized = next_line.replace(':', '').strip().upper()
+                        # If next line is also the same speaker, remove it too
+                        if next_line_normalized == prompt_normalized and re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
+                            generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
+                # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
+                if generated_text.strip():
+                    lines = generated_text.split('\n')
+                    first_line = lines[0].strip() if lines else ''
+                    # Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
+                    if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
+                        # Check if it's dialogue-like (starts with capital, has punctuation)
+                        if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
+                            # Just remove the orphaned first line, don't add a speaker
+                            generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
             # Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
             (r'\bTH\s+AN\b', 'THAN'),
             (r'\bth\s+an\b', 'than'),
+            # Fix "F IT" -> "FIT" (in all caps dialogue)
+            (r'\bF\s+IT\b', 'FIT'),
+            (r'\bf\s+it\b', 'fit'),
+            (r'\bF\s+it\b', 'Fit'),
+            # Fix "C A" -> "CA" (but be careful - might be part of "C A:" speaker name)
+            # Actually, "C A:" should be merged to "CA:" or might be "CLARENCE:" - handle in speaker fix
+            # Fix "OUCESTER" -> "GLOUCESTER" (missing "GL" prefix)
+            (r'\bOUCESTER\b', 'GLOUCESTER'),
+            (r'\bOucester\b', 'Gloucester'),
             # Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
             # Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
             # Fix duplicate words: "if it be it possible" -> "if it be possible"
                         break
                 if not is_known:
+                    # Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:", "C A:" -> "CA:" or "CLARENCE:"
                     # Remove spaces between all-caps words before colon
                     merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
+                    # Special case: "C A:" might be "CLARENCE:" - check if it's a known pattern
+                    if re.match(r'^C\s+A:\s*$', line_stripped):
+                        # NOTE: the surrounding context (e.g. a nearby "Clarence" or "Sir Clarence") is not inspected here
+                        # For now, "C A:" is always rewritten to "CLARENCE:", overriding the generic "CA:" merge above
+                        merged = 'CLARENCE:'
                     # Only use merged if it makes sense (not too long, looks like a word)
                     if len(merged) < 30:  # Reasonable speaker name length
                         fixed_lines.append(merged)