Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -584,16 +584,20 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 584 |
|
| 585 |
# Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
|
| 586 |
# Handle mixed case speaker names that should be all caps
|
|
|
|
| 587 |
lines = generated_text.split('\n')
|
| 588 |
normalized_lines = []
|
| 589 |
for i, line in enumerate(lines):
|
| 590 |
line_stripped = line.strip()
|
| 591 |
|
| 592 |
# Check if line is a potential speaker name (title case or mixed case, 2+ words)
|
| 593 |
-
# Pattern: "Romeo and juliet", "Romeo And Juliet", etc.
|
| 594 |
speaker_pattern = r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+)\s*:?\s*$'
|
| 595 |
match = re.match(speaker_pattern, line_stripped)
|
| 596 |
|
|
|
|
|
|
|
|
|
|
| 597 |
if match:
|
| 598 |
# Check if next line is dialogue (not another speaker)
|
| 599 |
is_speaker = False
|
|
@@ -610,6 +614,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 610 |
speaker_name = match.group(1).upper()
|
| 611 |
normalized_lines.append(speaker_name + ':')
|
| 612 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
|
| 614 |
normalized_lines.append(line)
|
| 615 |
|
|
@@ -671,25 +683,25 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 671 |
generated_text = '\n'.join(fixed_dialogue_lines)
|
| 672 |
|
| 673 |
# Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
|
| 674 |
-
# More aggressive: remove same speaker if it appears within
|
| 675 |
# Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
|
| 676 |
lines = generated_text.split('\n')
|
| 677 |
cleaned_lines = []
|
| 678 |
-
speaker_history = [] # Track recent speakers with their line numbers
|
| 679 |
|
| 680 |
for i, line in enumerate(lines):
|
| 681 |
line_stripped = line.strip()
|
| 682 |
-
# Check if this line is a speaker name
|
| 683 |
speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 684 |
|
| 685 |
if speaker_match:
|
| 686 |
speaker = speaker_match.group(1).strip()
|
| 687 |
speaker_upper = speaker.upper() # For case-insensitive comparison
|
| 688 |
|
| 689 |
-
# Check if this speaker appeared recently (within last
|
| 690 |
# Check both exact match and case-insensitive match
|
| 691 |
recent_speaker = False
|
| 692 |
-
for hist_speaker, hist_line_num in speaker_history[-
|
| 693 |
hist_speaker_upper = hist_speaker.upper()
|
| 694 |
if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
|
| 695 |
recent_speaker = True
|
|
@@ -699,10 +711,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 699 |
# Skip this duplicate speaker
|
| 700 |
continue
|
| 701 |
|
| 702 |
-
# Add to history
|
| 703 |
-
speaker_history.append((
|
| 704 |
-
# Keep only last
|
| 705 |
-
if len(speaker_history) >
|
| 706 |
speaker_history.pop(0)
|
| 707 |
|
| 708 |
cleaned_lines.append(line)
|
|
@@ -755,12 +767,34 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 755 |
|
| 756 |
# Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
|
| 757 |
# Handle both exact duplicates and case-insensitive duplicates
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 764 |
|
| 765 |
# Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
|
| 766 |
# Pattern: word followed by same word (case-insensitive)
|
|
|
|
| 584 |
|
| 585 |
# Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
|
| 586 |
# Handle mixed case speaker names that should be all caps
|
| 587 |
+
# Also handle "First Citizen:" -> "FIRST CITIZEN:"
|
| 588 |
lines = generated_text.split('\n')
|
| 589 |
normalized_lines = []
|
| 590 |
for i, line in enumerate(lines):
|
| 591 |
line_stripped = line.strip()
|
| 592 |
|
| 593 |
# Check if line is a potential speaker name (title case or mixed case, 2+ words)
|
| 594 |
+
# Pattern: "Romeo and juliet", "Romeo And Juliet", "First Citizen", etc.
|
| 595 |
speaker_pattern = r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+)\s*:?\s*$'
|
| 596 |
match = re.match(speaker_pattern, line_stripped)
|
| 597 |
|
| 598 |
+
# Also check for all-caps speaker names (already normalized)
|
| 599 |
+
all_caps_speaker = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 600 |
+
|
| 601 |
if match:
|
| 602 |
# Check if next line is dialogue (not another speaker)
|
| 603 |
is_speaker = False
|
|
|
|
| 614 |
speaker_name = match.group(1).upper()
|
| 615 |
normalized_lines.append(speaker_name + ':')
|
| 616 |
continue
|
| 617 |
+
elif all_caps_speaker:
|
| 618 |
+
# Already all caps; the pattern above only matches when a trailing colon is present, so the line is normally kept as-is
|
| 619 |
+
speaker_name = all_caps_speaker.group(1).strip()
|
| 620 |
+
if not line_stripped.endswith(':'):
|
| 621 |
+
normalized_lines.append(speaker_name + ':')
|
| 622 |
+
else:
|
| 623 |
+
normalized_lines.append(line)
|
| 624 |
+
continue
|
| 625 |
|
| 626 |
normalized_lines.append(line)
|
| 627 |
|
|
|
|
| 683 |
generated_text = '\n'.join(fixed_dialogue_lines)
|
| 684 |
|
| 685 |
# Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
|
| 686 |
+
# More aggressive: remove same speaker if it appears within 5 lines (expanded window for empty lines)
|
| 687 |
# Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
|
| 688 |
lines = generated_text.split('\n')
|
| 689 |
cleaned_lines = []
|
| 690 |
+
speaker_history = [] # Track recent speakers with their line numbers (case-insensitive)
|
| 691 |
|
| 692 |
for i, line in enumerate(lines):
|
| 693 |
line_stripped = line.strip()
|
| 694 |
+
# Check if this line is a speaker name (all caps after normalization)
|
| 695 |
speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 696 |
|
| 697 |
if speaker_match:
|
| 698 |
speaker = speaker_match.group(1).strip()
|
| 699 |
speaker_upper = speaker.upper() # For case-insensitive comparison
|
| 700 |
|
| 701 |
+
# Check if this speaker appeared recently (within last 5 lines - expanded for empty lines)
|
| 702 |
# Check both exact match and case-insensitive match
|
| 703 |
recent_speaker = False
|
| 704 |
+
for hist_speaker, hist_line_num in speaker_history[-5:]: # Check last 5 speakers
|
| 705 |
hist_speaker_upper = hist_speaker.upper()
|
| 706 |
if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
|
| 707 |
recent_speaker = True
|
|
|
|
| 711 |
# Skip this duplicate speaker
|
| 712 |
continue
|
| 713 |
|
| 714 |
+
# Add to history (store uppercase version for consistent comparison)
|
| 715 |
+
speaker_history.append((speaker_upper, i))
|
| 716 |
+
# Keep only last 15 speakers in history (expanded)
|
| 717 |
+
if len(speaker_history) > 15:
|
| 718 |
speaker_history.pop(0)
|
| 719 |
|
| 720 |
cleaned_lines.append(line)
|
|
|
|
| 767 |
|
| 768 |
# Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
|
| 769 |
# Handle both exact duplicates and case-insensitive duplicates
|
| 770 |
+
# This handles cases like "FIRST CITIZEN:\n\nFIRST CITIZEN:" -> "FIRST CITIZEN:"
|
| 771 |
+
lines = generated_text.split('\n')
|
| 772 |
+
final_cleaned_lines = []
|
| 773 |
+
last_speaker_upper = None
|
| 774 |
+
|
| 775 |
+
for i, line in enumerate(lines):
|
| 776 |
+
line_stripped = line.strip()
|
| 777 |
+
speaker_match = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 778 |
+
|
| 779 |
+
if speaker_match:
|
| 780 |
+
speaker = speaker_match.group(1).strip()
|
| 781 |
+
speaker_upper = speaker.upper()
|
| 782 |
+
|
| 783 |
+
# If this is the same speaker as the last one (case-insensitive), skip it
|
| 784 |
+
if speaker_upper == last_speaker_upper:
|
| 785 |
+
continue
|
| 786 |
+
|
| 787 |
+
last_speaker_upper = speaker_upper
|
| 788 |
+
final_cleaned_lines.append(line)
|
| 789 |
+
else:
|
| 790 |
+
# Non-speaker line: speaker tracking is NOT reset here; last_speaker_upper is left unchanged
|
| 791 |
+
# No reset happens for empty or non-empty lines; last_speaker_upper changes only when a different speaker label is seen
|
| 792 |
+
if line_stripped: # Non-empty line
|
| 793 |
+
# Intentional no-op: last_speaker_upper persists (not just for a few lines) until a different speaker label appears
|
| 794 |
+
pass
|
| 795 |
+
final_cleaned_lines.append(line)
|
| 796 |
+
|
| 797 |
+
generated_text = '\n'.join(final_cleaned_lines)
|
| 798 |
|
| 799 |
# Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
|
| 800 |
# Pattern: word followed by same word (case-insensitive)
|