shwethd committed on
Commit
2485390
·
verified ·
1 Parent(s): 6b1dee4

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -28
app.py CHANGED
@@ -320,34 +320,41 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
320
  import re
321
 
322
  # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
323
- # This handles cases where user enters "Romeo and Juliet" and model treats it as speaker
324
- prompt_lower = prompt.lower().strip()
325
- generated_lower = generated_text.lower()
326
-
327
- # If prompt appears at the very start and looks like it was treated as a speaker
328
- if generated_lower.startswith(prompt_lower):
329
- # Check if it's followed by a newline (speaker format) or dialogue
330
- prompt_len = len(prompt)
331
- if len(generated_text) > prompt_len:
332
- next_chars = generated_text[prompt_len:prompt_len+5].strip()
333
- # If prompt is followed by newline or colon-like pattern, it was treated as speaker
334
- if not next_chars or ':' in next_chars or '\n' in generated_text[prompt_len:prompt_len+5]:
335
- # Remove the prompt from output (it's the input, not part of generated story)
336
- generated_text = generated_text[len(prompt):].strip()
337
- # Remove leading newlines/colons
338
- generated_text = re.sub(r'^[\s:]+', '', generated_text)
339
-
340
- # Check if the first line after removal is orphaned dialogue (no speaker)
341
  lines = generated_text.split('\n')
342
- if lines and lines[0].strip():
343
- first_line = lines[0].strip()
344
- # If first line is not a speaker name and looks like dialogue, just remove it
345
- # Don't add NARRATOR - let the model's natural flow continue
346
- if not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
347
- # Check if it's dialogue-like (starts with capital, has punctuation)
348
- if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
349
- # Just remove the orphaned first line, don't add a speaker
350
- generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
 
 
 
 
 
 
 
 
351
 
352
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
353
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
@@ -511,6 +518,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
511
  # Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
512
  (r'\bTH\s+AN\b', 'THAN'),
513
  (r'\bth\s+an\b', 'than'),
 
 
 
 
 
 
 
 
 
514
  # Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
515
  # Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
516
  # Fix duplicate words: "if it be it possible" -> "if it be possible"
@@ -560,9 +576,16 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
560
  break
561
 
562
  if not is_known:
563
- # Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
564
  # Remove spaces between all-caps words before colon
565
  merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
 
 
 
 
 
 
 
566
  # Only use merged if it makes sense (not too long, looks like a word)
567
  if len(merged) < 30: # Reasonable speaker name length
568
  fixed_lines.append(merged)
 
320
  import re
321
 
322
  # Fix 0: Remove the prompt from the beginning if it appears as a speaker name
323
+ # This handles cases where user enters "First Citizen:" and model repeats it
324
+ prompt_stripped = prompt.strip().replace(':', '').strip()
325
+ lines = generated_text.split('\n')
326
+
327
+ if lines:
328
+ first_line = lines[0].strip()
329
+ # Normalize both prompt and first line for comparison (remove colons, case-insensitive)
330
+ first_line_normalized = first_line.replace(':', '').strip().upper()
331
+ prompt_normalized = prompt_stripped.upper()
332
+
333
+ # If first line matches the prompt (case-insensitive, allowing for colon)
334
+ if first_line_normalized == prompt_normalized:
335
+ # Remove the first line (it's the prompt, not generated content)
336
+ generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
337
+
338
+ # Also check if the next line is also the same speaker (duplicate)
339
+ if generated_text.strip():
 
340
  lines = generated_text.split('\n')
341
+ next_line = lines[0].strip() if lines else ''
342
+ if next_line:
343
+ next_line_normalized = next_line.replace(':', '').strip().upper()
344
+ # If next line is also the same speaker, remove it too
345
+ if next_line_normalized == prompt_normalized and re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
346
+ generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
347
+
348
+ # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
349
+ if generated_text.strip():
350
+ lines = generated_text.split('\n')
351
+ first_line = lines[0].strip() if lines else ''
352
+ # Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
353
+ if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
354
+ # Check if it's dialogue-like (starts with capital, has punctuation)
355
+ if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
356
+ # Just remove the orphaned first line, don't add a speaker
357
+ generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
358
 
359
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
360
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
 
518
  # Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
519
  (r'\bTH\s+AN\b', 'THAN'),
520
  (r'\bth\s+an\b', 'than'),
521
+ # Fix "F IT" -> "FIT" (in all caps dialogue)
522
+ (r'\bF\s+IT\b', 'FIT'),
523
+ (r'\bf\s+it\b', 'fit'),
524
+ (r'\bF\s+it\b', 'Fit'),
525
+ # Fix "C A" -> "CA" (but be careful - might be part of "C A:" speaker name)
526
+ # Actually, "C A:" should be merged to "CA:" or might be "CLARENCE:" - handle in speaker fix
527
+ # Fix "OUCESTER" -> "GLOUCESTER" (missing "GL" prefix)
528
+ (r'\bOUCESTER\b', 'GLOUCESTER'),
529
+ (r'\bOucester\b', 'Gloucester'),
530
  # Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
531
  # Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
532
  # Fix duplicate words: "if it be it possible" -> "if it be possible"
 
576
  break
577
 
578
  if not is_known:
579
+ # Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"; a line that is exactly "C A:" is special-cased below to "CLARENCE:"
580
  # Remove spaces between all-caps words before colon
581
  merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
582
+
583
+ # Special case: "C A:" might be "CLARENCE:" - check if it's a known pattern
584
+ if re.match(r'^C\s+A:\s*$', line_stripped):
585
+ # Check context - if it's near "Clarence" or "Sir Clarence", it's likely "CLARENCE:"
586
+ # For now, no context check is performed: a bare "C A:" line is always replaced with "CLARENCE:"
587
+ merged = 'CLARENCE:'
588
+
589
  # Only use merged if it makes sense (not too long, looks like a word)
590
  if len(merged) < 30: # Reasonable speaker name length
591
  fixed_lines.append(merged)