Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -369,13 +369,19 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 369 |
|
| 370 |
# If after removing prompt, first line is orphaned dialogue (no speaker), handle it
|
| 371 |
# Keep removing orphaned dialogue at the start until we find a speaker or valid content
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
|
| 376 |
if not first_line:
|
| 377 |
# Remove empty first line
|
| 378 |
-
|
| 379 |
continue
|
| 380 |
|
| 381 |
# Check if first line is a speaker name
|
|
@@ -389,76 +395,51 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 389 |
# Check if it's orphaned dialogue (starts with capital, has punctuation, but no speaker)
|
| 390 |
if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
|
| 391 |
# Remove the orphaned first line
|
| 392 |
-
|
| 393 |
else:
|
| 394 |
# Not clearly orphaned dialogue, stop removing
|
| 395 |
break
|
| 396 |
|
|
|
|
|
|
|
| 397 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 398 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
| 399 |
# Also fix single letter + capital word (e.g., "AOr" -> "A Or")
|
| 400 |
generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
|
| 401 |
|
| 402 |
# Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
|
| 403 |
-
#
|
|
|
|
| 404 |
common_words_fix = [
|
| 405 |
'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
|
| 406 |
-
'man', 'men', 'woman', 'women', '
|
| 407 |
-
'house', '
|
| 408 |
-
'well', '
|
| 409 |
'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
|
| 410 |
'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
|
| 411 |
-
'you', 'me', 'my', 'his', '
|
| 412 |
-
'
|
| 413 |
-
'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
|
| 414 |
-
'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
|
| 415 |
-
'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
|
| 416 |
-
'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
|
| 417 |
-
'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
|
| 418 |
-
'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
|
| 419 |
-
'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
|
| 420 |
-
'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
|
| 421 |
-
'again', 'government', 'honour', 'light', 'stands', 'fly', 'mighty', 'forth',
|
| 422 |
-
'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'there',
|
| 423 |
-
'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
|
| 424 |
-
'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
|
| 425 |
-
'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
|
| 426 |
-
'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great',
|
| 427 |
-
'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'worthy', 'call', 'rod',
|
| 428 |
-
'respect', 'drunk', 'there', 'signior', 'gremio', 'compound', 'soft', 'unvish',
|
| 429 |
-
'know', 'edward'
|
| 430 |
]
|
|
|
|
| 431 |
for word in common_words_fix:
|
| 432 |
word_lower = word.lower()
|
| 433 |
-
#
|
| 434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 435 |
first_part = word_lower[:i]
|
| 436 |
second_part = word_lower[i:]
|
| 437 |
|
| 438 |
-
#
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
generated_text = re.sub(pattern1, word, generated_text, flags=re.IGNORECASE)
|
| 442 |
-
|
| 443 |
-
# Pattern 2: Capital first letter (e.g., "Th at" -> "That")
|
| 444 |
-
pattern2 = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
|
| 445 |
-
generated_text = re.sub(pattern2, word.capitalize(), generated_text)
|
| 446 |
-
|
| 447 |
-
# Pattern 3: All caps (e.g., "TH AT" -> "THAT")
|
| 448 |
-
pattern3 = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
|
| 449 |
-
generated_text = re.sub(pattern3, word.upper(), generated_text)
|
| 450 |
-
|
| 451 |
-
# Pattern 4: Mixed case - first letter capitalized (e.g., "Th at" -> "That")
|
| 452 |
-
if len(first_part) > 0:
|
| 453 |
-
pattern4 = r'\b' + re.escape(first_part[0].upper() + first_part[1:]) + r'\s+' + re.escape(second_part) + r'\b'
|
| 454 |
-
generated_text = re.sub(pattern4, word.capitalize(), generated_text, flags=re.IGNORECASE)
|
| 455 |
-
|
| 456 |
-
# Pattern 5: Handle multiple splits in one word (e.g., "c o u n t" -> "count")
|
| 457 |
-
# This is a special case for words that got split multiple times
|
| 458 |
-
if len(word_lower) > 4: # Only for longer words
|
| 459 |
-
# Try to find pattern like "c o u n t" or "y o u r"
|
| 460 |
-
# This is more complex, so we'll handle it separately
|
| 461 |
-
pass
|
| 462 |
|
| 463 |
# Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
|
| 464 |
# Add space before common words that might have been merged
|
|
@@ -474,43 +455,29 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 474 |
# Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
|
| 475 |
generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
|
| 476 |
|
| 477 |
-
# Fix 1c: Fix multiple splits in one word
|
| 478 |
-
#
|
| 479 |
-
multi_split_words = ['count', 'your', '
|
|
|
|
|
|
|
| 480 |
for word in multi_split_words:
|
| 481 |
word_lower = word.lower()
|
| 482 |
-
# Create pattern for word split into individual letters with spaces
|
| 483 |
-
# e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at" or "y our"
|
| 484 |
if len(word_lower) > 2:
|
| 485 |
-
# Pattern 1: letter space letter space ... (all letters
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
pattern_cap = r'\b' + re.escape(letters[0].upper()) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[1:-1]]) + re.escape(letters[-1]) + r'\b'
|
| 493 |
-
generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
|
| 494 |
-
# Handle mixed case like "Wh at" -> "What"
|
| 495 |
-
if len(letters) > 2:
|
| 496 |
-
# Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
|
| 497 |
-
pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
|
| 498 |
-
generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
|
| 499 |
|
| 500 |
-
# Pattern 2: Handle two-part splits
|
| 501 |
-
|
| 502 |
-
|
| 503 |
first_part = word_lower[:split_pos]
|
| 504 |
second_part = word_lower[split_pos:]
|
| 505 |
-
# Pattern: "y our" -> "your"
|
| 506 |
pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
|
| 507 |
generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
|
| 508 |
-
# Capitalized version: "Y our" -> "Your"
|
| 509 |
-
pattern_2part_cap = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
|
| 510 |
-
generated_text = re.sub(pattern_2part_cap, word.capitalize(), generated_text)
|
| 511 |
-
# All caps: "Y OUR" -> "YOUR"
|
| 512 |
-
pattern_2part_allcap = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
|
| 513 |
-
generated_text = re.sub(pattern_2part_allcap, word.upper(), generated_text)
|
| 514 |
|
| 515 |
# Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
|
| 516 |
# Common patterns where words got merged incorrectly
|
|
|
|
| 369 |
|
| 370 |
# If after removing prompt, first line is orphaned dialogue (no speaker), handle it
|
| 371 |
# Keep removing orphaned dialogue at the start until we find a speaker or valid content
|
| 372 |
+
# Limit to max 10 iterations to avoid infinite loops
|
| 373 |
+
lines = generated_text.split('\n')
|
| 374 |
+
start_idx = 0
|
| 375 |
+
max_iterations = 10
|
| 376 |
+
iteration = 0
|
| 377 |
+
|
| 378 |
+
while start_idx < len(lines) and iteration < max_iterations:
|
| 379 |
+
iteration += 1
|
| 380 |
+
first_line = lines[start_idx].strip() if start_idx < len(lines) else ''
|
| 381 |
|
| 382 |
if not first_line:
|
| 383 |
# Remove empty first line
|
| 384 |
+
start_idx += 1
|
| 385 |
continue
|
| 386 |
|
| 387 |
# Check if first line is a speaker name
|
|
|
|
| 395 |
# Check if it's orphaned dialogue (starts with capital, has punctuation, but no speaker)
|
| 396 |
if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
|
| 397 |
# Remove the orphaned first line
|
| 398 |
+
start_idx += 1
|
| 399 |
else:
|
| 400 |
# Not clearly orphaned dialogue, stop removing
|
| 401 |
break
|
| 402 |
|
| 403 |
+
generated_text = '\n'.join(lines[start_idx:])
|
| 404 |
+
|
| 405 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 406 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
| 407 |
# Also fix single letter + capital word (e.g., "AOr" -> "A Or")
|
| 408 |
generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
|
| 409 |
|
| 410 |
# Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
|
| 411 |
+
# OPTIMIZED: Only process most common split words to reduce computation
|
| 412 |
+
# Focus on words that are most likely to be split incorrectly
|
| 413 |
common_words_fix = [
|
| 414 |
'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
|
| 415 |
+
'man', 'men', 'woman', 'women', 'content', 'gentle', 'gently',
|
| 416 |
+
'house', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
|
| 417 |
+
'well', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
|
| 418 |
'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
|
| 419 |
'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
|
| 420 |
+
'you', 'me', 'my', 'his', 'into', 'onto', 'upon', 'within', 'without',
|
| 421 |
+
'together', 'honour', 'honor', 'common', 'complain', 'again', 'apparent'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
]
|
| 423 |
+
# Build and apply a regex for each candidate split (at most 3 split positions per word; nothing is pre-compiled)
|
| 424 |
for word in common_words_fix:
|
| 425 |
word_lower = word.lower()
|
| 426 |
+
# Only try up to 3 split positions: the middle for words longer than 2 letters, plus quarter and three-quarter for words longer than 4 letters
|
| 427 |
+
split_positions = []
|
| 428 |
+
if len(word_lower) > 2:
|
| 429 |
+
split_positions = [len(word_lower) // 2] # Most common: middle split
|
| 430 |
+
if len(word_lower) > 4:
|
| 431 |
+
split_positions.append(len(word_lower) // 4)
|
| 432 |
+
split_positions.append(3 * len(word_lower) // 4)
|
| 433 |
+
|
| 434 |
+
for i in split_positions:
|
| 435 |
+
if i < 1 or i >= len(word_lower):
|
| 436 |
+
continue
|
| 437 |
first_part = word_lower[:i]
|
| 438 |
second_part = word_lower[i:]
|
| 439 |
|
| 440 |
+
# Combined pattern with case-insensitive flag (more efficient)
|
| 441 |
+
pattern = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
|
| 442 |
+
generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
|
| 444 |
# Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
|
| 445 |
# Add space before common words that might have been merged
|
|
|
|
| 455 |
# Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
|
| 456 |
generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
|
| 457 |
|
| 458 |
+
# Fix 1c: Fix multiple splits in one word - OPTIMIZED: Only handle most common cases
|
| 459 |
+
# Focus on very common words that are most likely to be split
|
| 460 |
+
multi_split_words = ['count', 'your', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were',
|
| 461 |
+
'been', 'have', 'has', 'had', 'will', 'this', 'that', 'there', 'where', 'here',
|
| 462 |
+
'their', 'what', 'common', 'complain', 'honour', 'honor', 'again', 'apparent']
|
| 463 |
for word in multi_split_words:
|
| 464 |
word_lower = word.lower()
|
|
|
|
|
|
|
| 465 |
if len(word_lower) > 2:
|
| 466 |
+
# Pattern 1: letter space letter space ... (all letters split individually) - only for short words
|
| 467 |
+
if len(word_lower) <= 5:
|
| 468 |
+
letters = list(word_lower)
|
| 469 |
+
pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
|
| 470 |
+
pattern_parts.append(re.escape(letters[-1]))
|
| 471 |
+
pattern = r'\b' + ''.join(pattern_parts) + r'\b'
|
| 472 |
+
generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
+
# Pattern 2: Handle two-part splits - only try most common split (middle)
|
| 475 |
+
split_pos = len(word_lower) // 2
|
| 476 |
+
if split_pos > 0 and split_pos < len(word_lower):
|
| 477 |
first_part = word_lower[:split_pos]
|
| 478 |
second_part = word_lower[split_pos:]
|
|
|
|
| 479 |
pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
|
| 480 |
generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
|
| 482 |
# Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
|
| 483 |
# Common patterns where words got merged incorrectly
|