Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -513,6 +513,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 513 |
(r'\bth\s+an\b', 'than'),
|
| 514 |
# Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
|
| 515 |
# Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
]
|
| 517 |
for pattern, replacement in merged_fixes:
|
| 518 |
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
|
@@ -666,6 +672,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 666 |
|
| 667 |
# Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
|
| 668 |
# More aggressive: remove same speaker if it appears within 3 lines (tighter window)
|
|
|
|
| 669 |
lines = generated_text.split('\n')
|
| 670 |
cleaned_lines = []
|
| 671 |
speaker_history = [] # Track recent speakers with their line numbers
|
|
@@ -677,11 +684,14 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 677 |
|
| 678 |
if speaker_match:
|
| 679 |
speaker = speaker_match.group(1).strip()
|
|
|
|
| 680 |
|
| 681 |
# Check if this speaker appeared recently (within last 3 lines - more aggressive)
|
|
|
|
| 682 |
recent_speaker = False
|
| 683 |
for hist_speaker, hist_line_num in speaker_history[-3:]:
|
| 684 |
-
|
|
|
|
| 685 |
recent_speaker = True
|
| 686 |
break
|
| 687 |
|
|
@@ -744,6 +754,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 744 |
generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
|
| 745 |
|
| 746 |
# Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
|
|
|
|
| 747 |
generated_text = re.sub(
|
| 748 |
r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
|
| 749 |
r'\1:\n',
|
|
@@ -751,6 +762,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 751 |
flags=re.MULTILINE
|
| 752 |
)
|
| 753 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
# Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
|
| 755 |
# This happens when the model hits the token limit mid-generation
|
| 756 |
if generated_text.strip():
|
|
|
|
| 513 |
(r'\bth\s+an\b', 'than'),
|
| 514 |
# Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
|
| 515 |
# Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
|
| 516 |
+
# Fix duplicate words: "if it be it possible" -> "if it be possible"
|
| 517 |
+
(r'\bif it be it\b', 'if it be'),
|
| 518 |
+
(r'\bIf it be it\b', 'If it be'),
|
| 519 |
+
# Fix duplicate "belike" -> remove one
|
| 520 |
+
(r'\bbelike\s+that\s+you\s+were\s+right\s+gentle\s+exercise,\s+belike\b', 'belike that you were right gentle exercise'),
|
| 521 |
+
(r'\bBelike\s+that\s+you\s+were\s+right\s+gentle\s+exercise,\s+belike\b', 'Belike that you were right gentle exercise'),
|
| 522 |
]
|
| 523 |
for pattern, replacement in merged_fixes:
|
| 524 |
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
|
|
|
| 672 |
|
| 673 |
# Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
|
| 674 |
# More aggressive: remove same speaker if it appears within 3 lines (tighter window)
|
| 675 |
+
# Also handle case-insensitive duplicates (e.g., "First Citizen:" and "FIRST CITIZEN:")
|
| 676 |
lines = generated_text.split('\n')
|
| 677 |
cleaned_lines = []
|
| 678 |
speaker_history = [] # Track recent speakers with their line numbers
|
|
|
|
| 684 |
|
| 685 |
if speaker_match:
|
| 686 |
speaker = speaker_match.group(1).strip()
|
| 687 |
+
speaker_upper = speaker.upper() # For case-insensitive comparison
|
| 688 |
|
| 689 |
# Check if this speaker appeared recently (among the last 3 entries of speaker_history - more aggressive)
|
| 690 |
+
# Check both exact match and case-insensitive match
|
| 691 |
recent_speaker = False
|
| 692 |
for hist_speaker, hist_line_num in speaker_history[-3:]:
|
| 693 |
+
hist_speaker_upper = hist_speaker.upper()
|
| 694 |
+
if speaker == hist_speaker or speaker_upper == hist_speaker_upper:
|
| 695 |
recent_speaker = True
|
| 696 |
break
|
| 697 |
|
|
|
|
| 754 |
generated_text = re.sub(r'([A-Z][A-Z\s]+?):\s*\n\s*\n+', r'\1:\n', generated_text)
|
| 755 |
|
| 756 |
# Fix 7: Remove any remaining consecutive duplicate speakers (final cleanup)
|
| 757 |
+
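# NOTE(review): this pattern only matches exact, all-uppercase duplicates ([A-Z] class, exact \1 backreference, re.MULTILINE only); mixed-case duplicates are not caught here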
|
| 758 |
generated_text = re.sub(
|
| 759 |
r'^([A-Z][A-Z\s]+?):\s*\n\s*\n*\1:\s*\n',
|
| 760 |
r'\1:\n',
|
|
|
|
| 762 |
flags=re.MULTILINE
|
| 763 |
)
|
| 764 |
|
| 765 |
+
# Fix 7b: Remove duplicate words in sentences (e.g., "if it be it possible" -> "if it be possible")
|
| 766 |
+
# Pattern: word followed by same word (case-insensitive)
|
| 767 |
+
# But be careful not to remove valid repetitions like "very very" or "more more"
|
| 768 |
+
# Only remove common function words that shouldn't repeat
|
| 769 |
+
duplicate_word_patterns = [
|
| 770 |
+
(r'\b(it)\s+\1\b', r'\1'), # "it it" -> "it"
|
| 771 |
+
(r'\b(the)\s+\1\b', r'\1'), # "the the" -> "the"
|
| 772 |
+
(r'\b(a)\s+\1\b', r'\1'), # "a a" -> "a"
|
| 773 |
+
(r'\b(an)\s+\1\b', r'\1'), # "an an" -> "an"
|
| 774 |
+
(r'\b(is)\s+\1\b', r'\1'), # "is is" -> "is"
|
| 775 |
+
(r'\b(was)\s+\1\b', r'\1'), # "was was" -> "was"
|
| 776 |
+
(r'\b(are)\s+\1\b', r'\1'), # "are are" -> "are"
|
| 777 |
+
(r'\b(be)\s+\1\b', r'\1'), # "be be" -> "be"
|
| 778 |
+
]
|
| 779 |
+
for pattern, replacement in duplicate_word_patterns:
|
| 780 |
+
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
| 781 |
+
|
| 782 |
# Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
|
| 783 |
# This happens when the model hits the token limit mid-generation
|
| 784 |
if generated_text.strip():
|