shwethd committed on
Commit
9908f05
·
verified ·
1 Parent(s): 78c94b5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -6
app.py CHANGED
@@ -349,8 +349,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
349
  # Just remove the orphaned first line, don't add a speaker
350
  generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
351
 
352
- # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
353
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
 
 
354
 
355
  # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
356
  # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
@@ -371,7 +373,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
371
  'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
372
  'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
373
  'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
374
- 'again', 'government', 'honour', 'light', 'stands', 'fly'
 
 
 
 
 
375
  ]
376
  for word in common_words_fix:
377
  word_lower = word.lower()
@@ -421,7 +428,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
421
 
422
  # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
423
  # Handle cases where a word got split into multiple parts
424
- multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly']
425
  for word in multi_split_words:
426
  word_lower = word.lower()
427
  # Create pattern for word split into individual letters with spaces
@@ -489,9 +496,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
489
  # Fix "ag a in" -> "again" (multiple splits)
490
  (r'\bag\s+a\s+in\b', 'again'),
491
  (r'\bAg\s+a\s+in\b', 'Again'),
492
- # Fix "ag a in" -> "again" (two-part split)
493
- (r'\bag\s+a\s+in\b', 'again'),
494
- (r'\bAg\s+a\s+in\b', 'Again'),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  ]
496
  for pattern, replacement in merged_fixes:
497
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
@@ -588,6 +609,61 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
588
 
589
  generated_text = '\n'.join(normalized_lines)
590
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
  # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
592
  # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
593
  lines = generated_text.split('\n')
 
349
  # Just remove the orphaned first line, don't add a speaker
350
  generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
351
 
352
+ # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With"); "AOr" -> "A Or" is handled by the follow-up rule below
353
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
354
+ # Also fix single letter + capital word (e.g., "AOr" -> "A Or")
355
+ generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
356
 
357
  # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
358
  # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
 
373
  'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
374
  'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
375
  'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
376
+ 'again', 'government', 'honour', 'light', 'stands', 'fly', 'mighty', 'forth',
377
+ 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'there',
378
+ 'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
379
+ 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
380
+ 'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
381
+ 'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little'
382
  ]
383
  for word in common_words_fix:
384
  word_lower = word.lower()
 
428
 
429
  # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
430
  # Handle cases where a word got split into multiple parts
431
+ multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little']
432
  for word in multi_split_words:
433
  word_lower = word.lower()
434
  # Create pattern for word split into individual letters with spaces
 
496
  # Fix "ag a in" -> "again" (multiple splits)
497
  (r'\bag\s+a\s+in\b', 'again'),
498
  (r'\bAg\s+a\s+in\b', 'Again'),
499
+ # Fix "UN TO" -> "UNTO" (before Fix 3c processes it)
500
+ (r'\bUN\s+TO\b', 'UNTO'),
501
+ (r'\bun\s+to\b', 'unto'),
502
+ # Fix potential word issues
503
+ (r'\bcoronured\b', 'crowned'), # "coronured" -> "crowned"
504
+ (r'\beyuls\b', 'evils'), # "eyuls" -> "evils"
505
+ # Fix "AOr" -> "A Or"; a leading "A Or" at the very start of the text is then reduced to "Or"
506
+ (r'\bAOr\b', 'A Or'),
507
+ (r'^A Or\s+', 'Or '), # If "A Or" is at start, might just be "Or"
508
+ # Fix "fe at" -> "feat"
509
+ (r'\bfe\s+at\b', 'feat'),
510
+ (r'\bFe\s+at\b', 'Feat'),
511
+ # Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
512
+ (r'\bTH\s+AN\b', 'THAN'),
513
+ (r'\bth\s+an\b', 'than'),
514
+ # Note: "stuff'd" is valid Shakespearean spelling, so it is deliberately
515
+ # left unchanged (no "stuff'd" -> "stuffed" rewrite is applied).
516
  ]
517
  for pattern, replacement in merged_fixes:
518
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
 
609
 
610
  generated_text = '\n'.join(normalized_lines)
611
 
612
+ # Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
613
+ # Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
614
+ # Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."
615
+ # "THERE SHOULD RUE:" -> "THERE SHOULD RUE."
616
+ # "UN TO THE LADY GREY:" -> "UNTO THE LADY GREY."
617
+ # These are usually long phrases (3+ words) that don't look like character names
618
+ lines = generated_text.split('\n')
619
+ fixed_dialogue_lines = []
620
+ # Known speaker names (keep these as speakers)
621
+ known_speakers = ['BAPTISTA', 'GLOUCESTER', 'CLARENCE', 'ROMEO', 'JULIET', 'HAMLET', 'MACBETH',
622
+ 'KING', 'QUEEN', 'DUKE', 'PRINCE', 'LADY', 'FIRST', 'SECOND', 'THIRD',
623
+ 'CITIZEN', 'GENTLEMAN', 'SERVANT', 'MENENIUS', 'COMINIUS', 'CORIOLANUS',
624
+ 'VINCENTIO', 'ANGELO', 'ISABELLA', 'OTHELLO', 'DESDEMONA', 'IAGO']
625
+
626
+ for i, line in enumerate(lines):
627
+ line_stripped = line.strip()
628
+ # Check if line looks like all-caps speaker but is actually dialogue
629
+ # Pattern: All caps, ends with colon
630
+ if re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped):
631
+ words = line_stripped.split()
632
+ speaker_name = words[0] if words else ''
633
+
634
+ # Check if it's a known speaker name (1-2 words, known name)
635
+ is_known_speaker = (len(words) <= 2 and speaker_name in known_speakers) or \
636
+ (len(words) == 2 and words[0] in ['FIRST', 'SECOND', 'THIRD'] and words[1] in ['CITIZEN', 'GENTLEMAN', 'SERVANT'])
637
+
638
+ if is_known_speaker:
639
+ # Keep as speaker name
640
+ fixed_dialogue_lines.append(line)
641
+ # If it has 3+ words, it's likely dialogue, not a speaker name
642
+ elif len(words) >= 3:
643
+ # Convert colon to period (dialogue ending)
644
+ dialogue = line_stripped[:-1] + '.' # Remove colon, add period
645
+ fixed_dialogue_lines.append(dialogue)
646
+ # Also check if it contains common dialogue words (not speaker names)
647
+ elif any(word in ['ARE', 'YOUR', 'HONOUR', 'TO', 'ENTER', 'SHOULD', 'RUE', 'THE', 'GREY', 'HENCE', 'THERE', 'UN', 'UNTIL', 'UNTO', 'MORE', 'THAN', 'HALF', 'TH', 'AN'] for word in words):
648
+ # Likely dialogue, not speaker
649
+ dialogue = line_stripped[:-1] + '.' # Remove colon, add period
650
+ fixed_dialogue_lines.append(dialogue)
651
+ # Special case: Single letter "A:" is likely dialogue or incomplete, not a speaker
652
+ elif len(words) == 1 and words[0] == 'A':
653
+ # Convert to dialogue
654
+ fixed_dialogue_lines.append('A.')
655
+ # Special case: "MORE THAN HALF:" is dialogue, not speaker
656
+ elif 'MORE' in words and 'THAN' in words:
657
+ dialogue = line_stripped[:-1] + '.' # Remove colon, add period
658
+ fixed_dialogue_lines.append(dialogue)
659
+ else:
660
+ # Keep as speaker name (might be a short unknown character name)
661
+ fixed_dialogue_lines.append(line)
662
+ else:
663
+ fixed_dialogue_lines.append(line)
664
+
665
+ generated_text = '\n'.join(fixed_dialogue_lines)
666
+
667
  # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
668
  # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
669
  lines = generated_text.split('\n')