Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -349,8 +349,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 349 |
# Just remove the orphaned first line, don't add a speaker
|
| 350 |
generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
|
| 351 |
|
| 352 |
-
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
|
| 353 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
|
|
|
|
|
|
| 354 |
|
| 355 |
# Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
|
| 356 |
# Remove spaces in the middle of common words - MORE AGGRESSIVE matching
|
|
@@ -371,7 +373,12 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 371 |
'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
|
| 372 |
'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
|
| 373 |
'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
|
| 374 |
-
'again', 'government', 'honour', 'light', 'stands', 'fly'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
]
|
| 376 |
for word in common_words_fix:
|
| 377 |
word_lower = word.lower()
|
|
@@ -421,7 +428,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 421 |
|
| 422 |
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 423 |
# Handle cases where a word got split into multiple parts
|
| 424 |
-
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly']
|
| 425 |
for word in multi_split_words:
|
| 426 |
word_lower = word.lower()
|
| 427 |
# Create pattern for word split into individual letters with spaces
|
|
@@ -489,9 +496,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 489 |
# Fix "ag a in" -> "again" (multiple splits)
|
| 490 |
(r'\bag\s+a\s+in\b', 'again'),
|
| 491 |
(r'\bAg\s+a\s+in\b', 'Again'),
|
| 492 |
-
# Fix "
|
| 493 |
-
(r'\
|
| 494 |
-
(r'\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
]
|
| 496 |
for pattern, replacement in merged_fixes:
|
| 497 |
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
|
@@ -588,6 +609,61 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 588 |
|
| 589 |
generated_text = '\n'.join(normalized_lines)
|
| 590 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
# Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
|
| 592 |
# More aggressive: remove same speaker if it appears within 3 lines (tighter window)
|
| 593 |
lines = generated_text.split('\n')
|
|
|
|
| 349 |
# Just remove the orphaned first line, don't add a speaker
|
| 350 |
generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
|
| 351 |
|
| 352 |
+
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 353 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
| 354 |
+
# Also fix single letter + capital word (e.g., "AOr" -> "A Or")
|
| 355 |
+
generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
|
| 356 |
|
| 357 |
# Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
|
| 358 |
# Remove spaces in the middle of common words - MORE AGGRESSIVE matching
|
|
|
|
| 373 |
'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
|
| 374 |
'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
|
| 375 |
'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
|
| 376 |
+
'again', 'government', 'honour', 'light', 'stands', 'fly', 'mighty', 'forth',
|
| 377 |
+
'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'there',
|
| 378 |
+
'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
|
| 379 |
+
'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
|
| 380 |
+
'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
|
| 381 |
+
'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little'
|
| 382 |
]
|
| 383 |
for word in common_words_fix:
|
| 384 |
word_lower = word.lower()
|
|
|
|
| 428 |
|
| 429 |
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 430 |
# Handle cases where a word got split into multiple parts
|
| 431 |
+
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little']
|
| 432 |
for word in multi_split_words:
|
| 433 |
word_lower = word.lower()
|
| 434 |
# Create pattern for word split into individual letters with spaces
|
|
|
|
| 496 |
# Fix "ag a in" -> "again" (multiple splits)
|
| 497 |
(r'\bag\s+a\s+in\b', 'again'),
|
| 498 |
(r'\bAg\s+a\s+in\b', 'Again'),
|
| 499 |
+
# Fix "UN TO" -> "UNTO" (before Fix 3c processes it)
|
| 500 |
+
(r'\bUN\s+TO\b', 'UNTO'),
|
| 501 |
+
(r'\bun\s+to\b', 'unto'),
|
| 502 |
+
# Fix potential word issues
|
| 503 |
+
(r'\bcoronured\b', 'crowned'), # "coronured" -> "crowned"
|
| 504 |
+
(r'\beyuls\b', 'evils'), # "eyuls" -> "evils"
|
| 505 |
+
# Fix "AOr" -> "A Or" or "Or" (if it's at start of sentence)
|
| 506 |
+
(r'\bAOr\b', 'A Or'),
|
| 507 |
+
(r'^A Or\s+', 'Or '), # If "A Or" is at start, might just be "Or"
|
| 508 |
+
# Fix "fe at" -> "feat"
|
| 509 |
+
(r'\bfe\s+at\b', 'feat'),
|
| 510 |
+
(r'\bFe\s+at\b', 'Feat'),
|
| 511 |
+
# Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
|
| 512 |
+
(r'\bTH\s+AN\b', 'THAN'),
|
| 513 |
+
(r'\bth\s+an\b', 'than'),
|
| 514 |
+
# Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
|
| 515 |
+
# Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
|
| 516 |
]
|
| 517 |
for pattern, replacement in merged_fixes:
|
| 518 |
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
|
|
|
| 609 |
|
| 610 |
generated_text = '\n'.join(normalized_lines)
|
| 611 |
|
| 612 |
+
# Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
|
| 613 |
+
# Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
|
| 614 |
+
# Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."
|
| 615 |
+
# "THERE SHOULD RUE:" -> "THERE SHOULD RUE."
|
| 616 |
+
# "UN TO THE LADY GREY:" -> "UNTO THE LADY GREY."
|
| 617 |
+
# These are usually long phrases (3+ words) that don't look like character names
|
| 618 |
+
lines = generated_text.split('\n')
|
| 619 |
+
fixed_dialogue_lines = []
|
| 620 |
+
# Known speaker names (keep these as speakers)
|
| 621 |
+
known_speakers = ['BAPTISTA', 'GLOUCESTER', 'CLARENCE', 'ROMEO', 'JULIET', 'HAMLET', 'MACBETH',
|
| 622 |
+
'KING', 'QUEEN', 'DUKE', 'PRINCE', 'LADY', 'FIRST', 'SECOND', 'THIRD',
|
| 623 |
+
'CITIZEN', 'GENTLEMAN', 'SERVANT', 'MENENIUS', 'COMINIUS', 'CORIOLANUS',
|
| 624 |
+
'VINCENTIO', 'ANGELO', 'ISABELLA', 'OTHELLO', 'DESDEMONA', 'IAGO']
|
| 625 |
+
|
| 626 |
+
for i, line in enumerate(lines):
|
| 627 |
+
line_stripped = line.strip()
|
| 628 |
+
# Check if line looks like all-caps speaker but is actually dialogue
|
| 629 |
+
# Pattern: All caps, ends with colon
|
| 630 |
+
if re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped):
|
| 631 |
+
words = line_stripped.split()
|
| 632 |
+
speaker_name = words[0] if words else ''
|
| 633 |
+
|
| 634 |
+
# Check if it's a known speaker name (1-2 words, known name)
|
| 635 |
+
is_known_speaker = (len(words) <= 2 and speaker_name in known_speakers) or \
|
| 636 |
+
(len(words) == 2 and words[0] in ['FIRST', 'SECOND', 'THIRD'] and words[1] in ['CITIZEN', 'GENTLEMAN', 'SERVANT'])
|
| 637 |
+
|
| 638 |
+
if is_known_speaker:
|
| 639 |
+
# Keep as speaker name
|
| 640 |
+
fixed_dialogue_lines.append(line)
|
| 641 |
+
# If it has 3+ words, it's likely dialogue, not a speaker name
|
| 642 |
+
elif len(words) >= 3:
|
| 643 |
+
# Convert colon to period (dialogue ending)
|
| 644 |
+
dialogue = line_stripped[:-1] + '.' # Remove colon, add period
|
| 645 |
+
fixed_dialogue_lines.append(dialogue)
|
| 646 |
+
# Also check if it contains common dialogue words (not speaker names)
|
| 647 |
+
elif any(word in ['ARE', 'YOUR', 'HONOUR', 'TO', 'ENTER', 'SHOULD', 'RUE', 'THE', 'GREY', 'HENCE', 'THERE', 'UN', 'UNTIL', 'UNTO', 'MORE', 'THAN', 'HALF', 'TH', 'AN'] for word in words):
|
| 648 |
+
# Likely dialogue, not speaker
|
| 649 |
+
dialogue = line_stripped[:-1] + '.' # Remove colon, add period
|
| 650 |
+
fixed_dialogue_lines.append(dialogue)
|
| 651 |
+
# Special case: Single letter "A:" is likely dialogue or incomplete, not a speaker
|
| 652 |
+
elif len(words) == 1 and words[0] == 'A':
|
| 653 |
+
# Convert to dialogue
|
| 654 |
+
fixed_dialogue_lines.append('A.')
|
| 655 |
+
# Special case: "MORE THAN HALF:" is dialogue, not speaker
|
| 656 |
+
elif 'MORE' in words and 'THAN' in words:
|
| 657 |
+
dialogue = line_stripped[:-1] + '.' # Remove colon, add period
|
| 658 |
+
fixed_dialogue_lines.append(dialogue)
|
| 659 |
+
else:
|
| 660 |
+
# Keep as speaker name (might be a short unknown character name)
|
| 661 |
+
fixed_dialogue_lines.append(line)
|
| 662 |
+
else:
|
| 663 |
+
fixed_dialogue_lines.append(line)
|
| 664 |
+
|
| 665 |
+
generated_text = '\n'.join(fixed_dialogue_lines)
|
| 666 |
+
|
| 667 |
# Fix 4: Remove duplicate speaker names (e.g., "EDWARD IV:\n...\nEDWARD IV:" -> keep only first)
|
| 668 |
# More aggressive: remove same speaker if it appears within 3 lines (tighter window)
|
| 669 |
lines = generated_text.split('\n')
|