shwethd committed on
Commit
dfe500d
·
verified ·
1 Parent(s): 9908f05

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -1
app.py CHANGED
@@ -513,6 +513,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
513
  (r'\bth\s+an\b', 'than'),
514
  # Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
515
  # Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
 
 
 
 
 
 
516
  ]
517
  for pattern, replacement in merged_fixes:
518
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
@@ -666,6 +672,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
666
 
667
  # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
668
  # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
 
669
  lines = generated_text.split('\n')
670
  cleaned_lines = []
671
  speaker_history = [] # Track recent speakers with their line numbers
@@ -677,11 +684,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
677
 
678
  if speaker_match:
679
  speaker = speaker_match.group(1).strip()
 
680
 
681
  # Check if this speaker appeared recently (within last 3 lines - more aggressive)
 
682
  recent_speaker = False
683
  for hist_speaker, hist_line_num in speaker_history[-3:]:
684
- if speaker == hist_speaker:
 
685
  recent_speaker = True
686
  break
687
 
@@ -744,6 +754,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
744
  generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
745
 
746
  # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
 
747
  generated_text = re.sub(
748
  r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
749
  r'\1:\n',
@@ -751,6 +762,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
751
  flags=re.MULTILINE
752
  )
753
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  # Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
755
  # This happens when the model hits the token limit mid-generation
756
  if generated_text.strip():
 
513
  (r'\bth\s+an\b', 'than'),
514
  # Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
515
  # Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
516
+ # Fix duplicate words: "if it be it possible" -> "if it be possible"
517
+ (r'\bif it be it\b', 'if it be'),
518
+ (r'\bIf it be it\b', 'If it be'),
519
+ # Fix duplicate "belike" -> remove one
520
+ (r'\bbelike\s+that\s+you\s+were\s+right\s+gentle\s+exercise,\s+belike\b', 'belike that you were right gentle exercise'),
521
+ (r'\bBelike\s+that\s+you\s+were\s+right\s+gentle\s+exercise,\s+belike\b', 'Belike that you were right gentle exercise'),
522
  ]
523
  for pattern, replacement in merged_fixes:
524
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
 
672
 
673
  # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
674
  # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
675
+ # Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
676
  lines = generated_text.split('\n')
677
  cleaned_lines = []
678
  speaker_history = [] # Track recent speakers with their line numbers
 
684
 
685
  if speaker_match:
686
  speaker = speaker_match.group(1).strip()
687
+ speaker_upper = speaker.upper() # For case-insensitive comparison
688
 
689
  # Check if this speaker appeared recently (within last 3 lines - more aggressive)
690
+ # Check both exact match and case-insensitive match
691
  recent_speaker = False
692
  for hist_speaker, hist_line_num in speaker_history[-3:]:
693
+ hist_speaker_upper = hist_speaker.upper()
694
+ if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
695
  recent_speaker = True
696
  break
697
 
 
754
  generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
755
 
756
  # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
757
+ # Handle both exact duplicates and case-insensitive duplicates
758
  generated_text = re.sub(
759
  r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
760
  r'\1:\n',
 
762
  flags=re.MULTILINE
763
  )
764
 
765
+ # Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
766
+ # Pattern: word followed by same word (case-insensitive)
767
+ # But be careful not to remove valid repetitions like "very very" or "more more"
768
+ # Only remove common function words that shouldn't repeat
769
+ duplicate_word_patterns = [
770
+ (r'\b(it)\s+\1\b', r'\1'), # "it it" -> "it"
771
+ (r'\b(the)\s+\1\b', r'\1'), # "the the" -> "the"
772
+ (r'\b(a)\s+\1\b', r'\1'), # "a a" -> "a"
773
+ (r'\b(an)\s+\1\b', r'\1'), # "an an" -> "an"
774
+ (r'\b(is)\s+\1\b', r'\1'), # "is is" -> "is"
775
+ (r'\b(was)\s+\1\b', r'\1'), # "was was" -> "was"
776
+ (r'\b(are)\s+\1\b', r'\1'), # "are are" -> "are"
777
+ (r'\b(be)\s+\1\b', r'\1'), # "be be" -> "be"
778
+ ]
779
+ for pattern, replacement in duplicate_word_patterns:
780
+ generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
781
+
782
  # Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
783
  # This happens when the model hits the token limit mid-generation
784
  if generated_text.strip():