shwethd committed on
Commit
7360b49
·
verified ·
1 Parent(s): 4e5f1e6

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -1
app.py CHANGED
@@ -277,6 +277,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
277
 
278
  # Post-process to fix spacing issues (common with BPE tokenizers)
279
  import re
 
280
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
281
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
282
 
@@ -291,6 +292,42 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
291
  # Fix 3: Add space before character names (all caps words)
292
  generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
293
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  return generated_text
295
  except Exception as e:
296
  import traceback
@@ -355,7 +392,8 @@ with gr.Blocks(title="GPT-2 124M Shakespeare Model") as demo:
355
  output = gr.Textbox(
356
  label="Generated Text",
357
  lines=10,
358
- interactive=False
 
359
  )
360
 
361
  # Example prompts
 
277
 
278
  # Post-process to fix spacing issues (common with BPE tokenizers)
279
  import re
280
+
281
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
282
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
283
 
 
292
  # Fix 3: Add space before character names (all caps words)
293
  generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
294
 
295
+ # Fix 4: Remove duplicate all-caps speaker names (e.g., "SHEPHERD:\n\nSHEPHERD:" -> "SHEPHERD:")
296
+ # Pattern: Character name followed by colon, then newline(s), then same character name and colon
297
+ lines = generated_text.split('\n')
298
+ cleaned_lines = []
299
+ prev_speaker = None
300
+ prev_was_speaker = False
301
+
302
+ for line in lines:
303
+ line_stripped = line.strip()
304
+ # Check if this line is an all-caps speaker name (e.g. "SHEPHERD:", "LADY MACBETH:"); mixed-case names such as "First Citizen:" are NOT matched by the pattern below
305
+ # Pattern: Starts with capital letter(s), may have spaces, ends with colon, optionally followed by whitespace
306
+ speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
307
+
308
+ if speaker_match:
309
+ speaker = speaker_match.group(1).strip()
310
+ # If it's the same speaker as previous AND previous line was also a speaker, skip this duplicate
311
+ if speaker == prev_speaker and prev_was_speaker:
312
+ continue # Skip duplicate
313
+ prev_speaker = speaker
314
+ prev_was_speaker = True
315
+ cleaned_lines.append(line)
316
+ else:
317
+ # Reset speaker tracking when we see actual dialogue (non-empty line that's not a speaker)
318
+ if line_stripped: # Non-empty line that's not a speaker name
319
+ prev_speaker = None
320
+ prev_was_speaker = False
321
+ cleaned_lines.append(line)
322
+
323
+ generated_text = '\n'.join(cleaned_lines)
324
+
325
+ # Fix 5: Remove multiple empty lines between speaker and dialogue
326
+ generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
327
+
328
+ # Fix 6: Remove triple+ consecutive speaker names (edge case)
329
+ generated_text = re.sub(r'^([A-Z][A-Z\s]+?):\s*\n\1:\s*\n\1:\s*\n', r'\1:\n', generated_text, flags=re.MULTILINE)
330
+
331
  return generated_text
332
  except Exception as e:
333
  import traceback
 
392
  output = gr.Textbox(
393
  label="Generated Text",
394
  lines=10,
395
+ interactive=True, # Make it interactive so users can select and copy
396
+ show_copy_button=True # Add copy button
397
  )
398
 
399
  # Example prompts