Spaces:

shwethd
/

DecoderModel124M

Sleeping

App Files Files Community

shwethd committed on Nov 14, 2025

Commit

7360b49

verified ·

1 Parent(s): 4e5f1e6

Upload app.py

Browse files

Files changed (1) hide show

app.py +39 -1

app.py CHANGED Viewed

@@ -277,6 +277,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
         # Post-process to fix spacing issues (common with BPE tokenizers)
         import re
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
@@ -291,6 +292,42 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
         # Fix 3: Add space before character names (all caps words)
         generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
         return generated_text
     except Exception as e:
         import traceback
@@ -355,7 +392,8 @@ with gr.Blocks(title="GPT-2 124M Shakespeare Model") as demo:
             output = gr.Textbox(
                 label="Generated Text",
                 lines=10,
-                interactive=False
             )
     # Example prompts

         # Post-process to fix spacing issues (common with BPE tokenizers)
         import re
         # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
         generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
         # Fix 3: Add space before character names (all caps words)
         generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
+        # Fix 4: Remove duplicate all-caps speaker names (e.g., "SHEPHERD:\n\nSHEPHERD:" -> "SHEPHERD:")
+        # Pattern: Character name followed by colon, then newline(s), then same character name and colon
+        lines = generated_text.split('\n')
+        cleaned_lines = []
+        prev_speaker = None
+        prev_was_speaker = False
+        for line in lines:
+            line_stripped = line.strip()
+            # Check if this line is an all-caps speaker name (e.g., "SHEPHERD:", "LADY MACBETH:"); the pattern below does not match mixed-case names such as "First Citizen:"
+            # Pattern: Starts with capital letter(s), may have spaces, ends with colon, optionally followed by whitespace
+            speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
+            if speaker_match:
+                speaker = speaker_match.group(1).strip()
+                # If it's the same speaker as previous AND previous line was also a speaker, skip this duplicate
+                if speaker == prev_speaker and prev_was_speaker:
+                    continue  # Skip duplicate
+                prev_speaker = speaker
+                prev_was_speaker = True
+                cleaned_lines.append(line)
+            else:
+                # Reset speaker tracking when we see actual dialogue (non-empty line that's not a speaker)
+                if line_stripped:  # Non-empty line that's not a speaker name
+                    prev_speaker = None
+                    prev_was_speaker = False
+                cleaned_lines.append(line)
+        generated_text = '\n'.join(cleaned_lines)
+        # Fix 5: Remove multiple empty lines between speaker and dialogue
+        generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
+        # Fix 6: Remove triple+ consecutive speaker names (edge case)
+        generated_text = re.sub(r'^([A-Z][A-Z\s]+?):\s*\n\1:\s*\n\1:\s*\n', r'\1:\n', generated_text, flags=re.MULTILINE)
         return generated_text
     except Exception as e:
         import traceback
             output = gr.Textbox(
                 label="Generated Text",
                 lines=10,
+                interactive=True,  # Make it interactive so users can select and copy
+                show_copy_button=True  # Add copy button
             )
     # Example prompts