shwethd committed on
Commit
6b1dee4
·
verified ·
1 Parent(s): dfe500d

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -16
app.py CHANGED
@@ -584,16 +584,20 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
584
 
585
  # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
586
  # Handle mixed case speaker names that should be all caps
 
587
  lines = generated_text.split('\n')
588
  normalized_lines = []
589
  for i, line in enumerate(lines):
590
  line_stripped = line.strip()
591
 
592
  # Check if line is a potential speaker name (title case or mixed case, 2+ words)
593
- # Pattern: "Romeo and juliet", "Romeo And Juliet", etc.
594
  speaker_pattern = r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+)\s*:?\s*$'
595
  match = re.match(speaker_pattern, line_stripped)
596
 
 
 
 
597
  if match:
598
  # Check if next line is dialogue (not another speaker)
599
  is_speaker = False
@@ -610,6 +614,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
610
  speaker_name = match.group(1).upper()
611
  normalized_lines.append(speaker_name + ':')
612
  continue
 
 
 
 
 
 
 
 
613
 
614
  normalized_lines.append(line)
615
 
@@ -671,25 +683,25 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
671
  generated_text = '\n'.join(fixed_dialogue_lines)
672
 
673
  # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
674
- # More aggressive: remove same speaker if it appears within 3 lines (tighter window)
675
  # Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
676
  lines = generated_text.split('\n')
677
  cleaned_lines = []
678
- speaker_history = [] # Track recent speakers with their line numbers
679
 
680
  for i, line in enumerate(lines):
681
  line_stripped = line.strip()
682
- # Check if this line is a speaker name
683
  speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
684
 
685
  if speaker_match:
686
  speaker = speaker_match.group(1).strip()
687
  speaker_upper = speaker.upper() # For case-insensitive comparison
688
 
689
- # Check if this speaker appeared recently (within last 3 lines - more aggressive)
690
  # Check both exact match and case-insensitive match
691
  recent_speaker = False
692
- for hist_speaker, hist_line_num in speaker_history[-3:]:
693
  hist_speaker_upper = hist_speaker.upper()
694
  if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
695
  recent_speaker = True
@@ -699,10 +711,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
699
  # Skip this duplicate speaker
700
  continue
701
 
702
- # Add to history
703
- speaker_history.append((speaker, i))
704
- # Keep only last 10 speakers in history
705
- if len(speaker_history) > 10:
706
  speaker_history.pop(0)
707
 
708
  cleaned_lines.append(line)
@@ -755,12 +767,34 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
755
 
756
  # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
757
  # Handle both exact duplicates and case-insensitive duplicates
758
- generated_text = re.sub(
759
- r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
760
- r'\1:\n',
761
- generated_text,
762
- flags=re.MULTILINE
763
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
 
765
  # Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
766
  # Pattern: word followed by same word (case-insensitive)
 
584
 
585
  # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
586
  # Handle mixed case speaker names that should be all caps
587
+ # Also handle "First Citizen:" -> "FIRST CITIZEN:"
588
  lines = generated_text.split('\n')
589
  normalized_lines = []
590
  for i, line in enumerate(lines):
591
  line_stripped = line.strip()
592
 
593
  # Check if line is a potential speaker name (title case or mixed case, 2+ words)
594
+ # Pattern: "Romeo and juliet", "Romeo And Juliet", "First Citizen", etc.
595
  speaker_pattern = r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+)\s*:?\s*$'
596
  match = re.match(speaker_pattern, line_stripped)
597
 
598
+ # Also check for all-caps speaker names (already normalized)
599
+ all_caps_speaker = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
600
+
601
  if match:
602
  # Check if next line is dialogue (not another speaker)
603
  is_speaker = False
 
614
  speaker_name = match.group(1).upper()
615
  normalized_lines.append(speaker_name + ':')
616
  continue
617
+ elif all_caps_speaker:
618
+ # Already all caps, just ensure it has colon
619
+ speaker_name = all_caps_speaker.group(1).strip()
620
+ if not line_stripped.endswith(':'):
621
+ normalized_lines.append(speaker_name + ':')
622
+ else:
623
+ normalized_lines.append(line)
624
+ continue
625
 
626
  normalized_lines.append(line)
627
 
 
683
  generated_text = '\n'.join(fixed_dialogue_lines)
684
 
685
  # Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
686
+ # More aggressive: remove same speaker if it is among the last 5 recorded speakers (wider window to tolerate empty lines)
687
  # Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
688
  lines = generated_text.split('\n')
689
  cleaned_lines = []
690
+ speaker_history = [] # Track recent speakers with their line numbers (case-insensitive)
691
 
692
  for i, line in enumerate(lines):
693
  line_stripped = line.strip()
694
+ # Check if this line is a speaker name (all caps after normalization)
695
  speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
696
 
697
  if speaker_match:
698
  speaker = speaker_match.group(1).strip()
699
  speaker_upper = speaker.upper() # For case-insensitive comparison
700
 
701
+ # Check if this speaker is among the last 5 recorded speakers (window widened to tolerate empty lines)
702
  # Check both exact match and case-insensitive match
703
  recent_speaker = False
704
+ for hist_speaker, hist_line_num in speaker_history[-5:]: # Check last 5 speakers
705
  hist_speaker_upper = hist_speaker.upper()
706
  if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
707
  recent_speaker = True
 
711
  # Skip this duplicate speaker
712
  continue
713
 
714
+ # Add to history (store uppercase version for consistent comparison)
715
+ speaker_history.append((speaker_upper, i))
716
+ # Keep only last 15 speakers in history (expanded)
717
+ if len(speaker_history) > 15:
718
  speaker_history.pop(0)
719
 
720
  cleaned_lines.append(line)
 
767
 
768
  # Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
769
  # Handle both exact duplicates and case-insensitive duplicates
770
+ # This handles cases like "FIRST CITIZEN:\n\nFIRST CITIZEN:" -> "FIRST CITIZEN:"
771
+ lines = generated_text.split('\n')
772
+ final_cleaned_lines = []
773
+ last_speaker_upper = None
774
+
775
+ for i, line in enumerate(lines):
776
+ line_stripped = line.strip()
777
+ speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
778
+
779
+ if speaker_match:
780
+ speaker = speaker_match.group(1).strip()
781
+ speaker_upper = speaker.upper()
782
+
783
+ # If this is the same speaker as the last one (case-insensitive), skip it
784
+ if speaker_upper == last_speaker_upper:
785
+ continue
786
+
787
+ last_speaker_upper = speaker_upper
788
+ final_cleaned_lines.append(line)
789
+ else:
790
+ # NOTE: last_speaker_upper is not reset on non-speaker lines (the branch below is a no-op),
791
+ # so a repeated speaker label is dropped even when dialogue lines separate the two occurrences
792
+ if line_stripped: # Non-empty line
793
+ # No-op: last_speaker_upper is left unchanged here (it is never cleared, not only for a few lines)
794
+ pass
795
+ final_cleaned_lines.append(line)
796
+
797
+ generated_text = '\n'.join(final_cleaned_lines)
798
 
799
  # Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
800
  # Pattern: word followed by same word (case-insensitive)