Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -367,7 +367,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 367 |
'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
|
| 368 |
'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
|
| 369 |
'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
|
| 370 |
-
'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she'
|
|
|
|
|
|
|
|
|
|
| 371 |
]
|
| 372 |
for word in common_words_fix:
|
| 373 |
word_lower = word.lower()
|
|
@@ -415,15 +418,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 415 |
# Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
|
| 416 |
generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
|
| 417 |
|
| 418 |
-
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "T h is" -> "This")
|
| 419 |
# Handle cases where a word got split into multiple parts
|
| 420 |
-
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon']
|
| 421 |
for word in multi_split_words:
|
| 422 |
word_lower = word.lower()
|
| 423 |
# Create pattern for word split into individual letters with spaces
|
| 424 |
-
# e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at"
|
| 425 |
if len(word_lower) > 2:
|
| 426 |
-
# Pattern: letter space letter space ... (all letters of the word)
|
| 427 |
letters = list(word_lower)
|
| 428 |
pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
|
| 429 |
pattern_parts.append(re.escape(letters[-1]))
|
|
@@ -437,6 +440,21 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 437 |
# Pattern for "Wh a t" style (first two letters joined, each remaining letter split off; matched case-insensitively)
|
| 438 |
pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
|
| 439 |
generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
# Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
|
| 442 |
# Common patterns where words got merged incorrectly
|
|
@@ -485,7 +503,45 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 485 |
pattern = r"(" + re.escape(contraction) + r")([a-z])"
|
| 486 |
generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
|
| 487 |
|
| 488 |
-
# Fix 3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
# First, fix cases like "BarnMENENIUS:" -> "Barn. MENENIUS:" (a period and a space are inserted)
|
| 490 |
# Pattern: lowercase word followed immediately by all-caps speaker name
|
| 491 |
generated_text = re.sub(r'([a-z]+)([A-Z]{2,}):', r'\1. \2:', generated_text)
|
|
|
|
| 367 |
'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
|
| 368 |
'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
|
| 369 |
'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
|
| 370 |
+
'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
|
| 371 |
+
'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
|
| 372 |
+
'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
|
| 373 |
+
'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art'
|
| 374 |
]
|
| 375 |
for word in common_words_fix:
|
| 376 |
word_lower = word.lower()
|
|
|
|
| 418 |
# Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
|
| 419 |
generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
|
| 420 |
|
| 421 |
+
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 422 |
# Handle cases where a word got split into multiple parts
|
| 423 |
+
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art']
|
| 424 |
for word in multi_split_words:
|
| 425 |
word_lower = word.lower()
|
| 426 |
# Create pattern for word split into individual letters with spaces
|
| 427 |
+
# e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at" or "y our"
|
| 428 |
if len(word_lower) > 2:
|
| 429 |
+
# Pattern 1: letter space letter space ... (all letters of the word split individually)
|
| 430 |
letters = list(word_lower)
|
| 431 |
pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
|
| 432 |
pattern_parts.append(re.escape(letters[-1]))
|
|
|
|
| 440 |
# Pattern for "Wh a t" style (first two letters joined, each remaining letter split off; matched case-insensitively)
|
| 441 |
pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
|
| 442 |
generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
|
| 443 |
+
|
| 444 |
+
# Pattern 2: Handle two-part splits (e.g., "y our" -> "your", "h onour" -> "honour")
|
| 445 |
+
# Try all possible two-part splits
|
| 446 |
+
for split_pos in range(1, len(word_lower)):
|
| 447 |
+
first_part = word_lower[:split_pos]
|
| 448 |
+
second_part = word_lower[split_pos:]
|
| 449 |
+
# Pattern: "y our" -> "your"
|
| 450 |
+
pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
|
| 451 |
+
generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
|
| 452 |
+
# Capitalized version: "Y our" -> "Your"
|
| 453 |
+
pattern_2part_cap = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
|
| 454 |
+
generated_text = re.sub(pattern_2part_cap, word.capitalize(), generated_text)
|
| 455 |
+
# All caps: "Y OUR" -> "YOUR"
|
| 456 |
+
pattern_2part_allcap = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
|
| 457 |
+
generated_text = re.sub(pattern_2part_allcap, word.upper(), generated_text)
|
| 458 |
|
| 459 |
# Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
|
| 460 |
# Common patterns where words got merged incorrectly
|
|
|
|
| 503 |
pattern = r"(" + re.escape(contraction) + r")([a-z])"
|
| 504 |
generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
|
| 505 |
|
| 506 |
+
# Fix 3: Fix split speaker names (e.g., "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:")
|
| 507 |
+
# Pattern: All caps words separated by spaces ending with colon (likely split speaker name)
|
| 508 |
+
# First, try to merge split speaker names: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
|
| 509 |
+
# But be careful - some speaker names might legitimately have spaces (e.g., "FIRST CITIZEN:")
|
| 510 |
+
lines = generated_text.split('\n')
|
| 511 |
+
fixed_lines = []
|
| 512 |
+
for line in lines:
|
| 513 |
+
line_stripped = line.strip()
|
| 514 |
+
# Check if line looks like a split speaker name (all caps, has spaces, ends with colon)
|
| 515 |
+
# Pattern 1: Multiple all-caps words with spaces: "ALL ANC A:" or "GENTLEM AN:"
|
| 516 |
+
if re.match(r'^([A-Z]+\s+[A-Z]+\s*[A-Z]*):\s*$', line_stripped):
|
| 517 |
+
# Check if it's a known multi-word speaker name (keep those)
|
| 518 |
+
known_multi_word_speakers = ['FIRST CITIZEN', 'SECOND CITIZEN', 'THIRD CITIZEN',
|
| 519 |
+
'FIRST GENTLEMAN', 'SECOND GENTLEMAN', 'THIRD GENTLEMAN',
|
| 520 |
+
'FIRST SERVANT', 'SECOND SERVANT', 'LADY MACBETH',
|
| 521 |
+
'KING HENRY', 'PRINCE HAMLET', 'DUKE VINCENTIO']
|
| 522 |
+
is_known = False
|
| 523 |
+
for known in known_multi_word_speakers:
|
| 524 |
+
if known in line_stripped.upper():
|
| 525 |
+
is_known = True
|
| 526 |
+
break
|
| 527 |
+
|
| 528 |
+
if not is_known:
|
| 529 |
+
# Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
|
| 530 |
+
# Remove spaces between all-caps words before colon
|
| 531 |
+
merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
|
| 532 |
+
# Only use merged if the result is short enough to be a plausible speaker name
|
| 533 |
+
if len(merged) < 30: # Reasonable speaker name length
|
| 534 |
+
fixed_lines.append(merged)
|
| 535 |
+
else:
|
| 536 |
+
fixed_lines.append(line)
|
| 537 |
+
else:
|
| 538 |
+
# Keep known multi-word speaker names as is
|
| 539 |
+
fixed_lines.append(line)
|
| 540 |
+
else:
|
| 541 |
+
fixed_lines.append(line)
|
| 542 |
+
generated_text = '\n'.join(fixed_lines)
|
| 543 |
+
|
| 544 |
+
# Fix 3b: Add space before character names (all caps words) and fix missing punctuation
|
| 545 |
# First, fix cases like "BarnMENENIUS:" -> "Barn. MENENIUS:" (a period and a space are inserted)
|
| 546 |
# Pattern: lowercase word followed immediately by all-caps speaker name
|
| 547 |
generated_text = re.sub(r'([a-z]+)([A-Z]{2,}):', r'\1. \2:', generated_text)
|