Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -321,40 +321,62 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 321 |
|
| 322 |
# Fix 0: Remove the prompt from the beginning if it appears as a speaker name
|
| 323 |
# This handles cases where user enters "First Citizen:" and model repeats it
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
| 325 |
lines = generated_text.split('\n')
|
|
|
|
|
|
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
# Normalize both prompt and first line for comparison (remove colons, case-insensitive)
|
| 330 |
-
first_line_normalized = first_line.replace(':', '').strip().upper()
|
| 331 |
-
prompt_normalized = prompt_stripped.upper()
|
| 332 |
|
| 333 |
-
#
|
| 334 |
-
if
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 360 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
|
@@ -385,7 +407,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 385 |
'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
|
| 386 |
'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
|
| 387 |
'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
|
| 388 |
-
'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little'
|
|
|
|
|
|
|
|
|
|
| 389 |
]
|
| 390 |
for word in common_words_fix:
|
| 391 |
word_lower = word.lower()
|
|
@@ -435,7 +460,7 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 435 |
|
| 436 |
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 437 |
# Handle cases where a word got split into multiple parts
|
| 438 |
-
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little']
|
| 439 |
for word in multi_split_words:
|
| 440 |
word_lower = word.lower()
|
| 441 |
# Create pattern for word split into individual letters with spaces
|
|
@@ -650,6 +675,37 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 650 |
|
| 651 |
generated_text = '\n'.join(normalized_lines)
|
| 652 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
# Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
|
| 654 |
# Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
|
| 655 |
# Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."
|
|
|
|
| 321 |
|
| 322 |
# Fix 0: Remove the prompt from the beginning if it appears as a speaker name
|
| 323 |
# This handles cases where user enters "First Citizen:" and model repeats it
|
| 324 |
+
# Normalize prompt: remove colon, strip, convert to uppercase for comparison
|
| 325 |
+
prompt_normalized = prompt.strip().replace(':', '').strip().upper()
|
| 326 |
+
|
| 327 |
+
# Process all lines to find and remove prompt matches
|
| 328 |
lines = generated_text.split('\n')
|
| 329 |
+
cleaned_lines = []
|
| 330 |
+
prompt_removed = False
|
| 331 |
|
| 332 |
+
for i, line in enumerate(lines):
|
| 333 |
+
line_stripped = line.strip()
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
+
# Skip empty lines at the start (but only if we haven't added any content yet)
|
| 336 |
+
if not line_stripped:
|
| 337 |
+
if not cleaned_lines:
|
| 338 |
+
continue # Skip leading empty lines
|
| 339 |
+
else:
|
| 340 |
+
cleaned_lines.append(line) # Keep empty lines after content starts
|
| 341 |
+
continue
|
| 342 |
+
|
| 343 |
+
# Normalize line for comparison (remove colon, case-insensitive)
|
| 344 |
+
line_normalized = line_stripped.replace(':', '').strip().upper()
|
| 345 |
+
|
| 346 |
+
# Check if this line matches the prompt (case-insensitive, allowing for colon)
|
| 347 |
+
# Check if it's a speaker name format (all caps OR title case OR mixed case)
|
| 348 |
+
is_speaker_line = (re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped) or # All caps: "FIRST CITIZEN:"
|
| 349 |
+
re.match(r'^([A-Z][a-z]+(?:\s+[a-zA-Z]+)+):\s*$', line_stripped) or # Title case: "First Citizen:"
|
| 350 |
+
re.match(r'^([A-Z][A-Za-z\s]+?):\s*$', line_stripped)) # Mixed case: "First Citizen:" or "FIRST Citizen:"
|
| 351 |
+
|
| 352 |
+
# If this line matches the prompt (case-insensitive), remove it
|
| 353 |
+
# Be more aggressive: if it matches the prompt, remove it even if pattern doesn't match exactly
|
| 354 |
+
if line_normalized == prompt_normalized and not prompt_removed:
|
| 355 |
+
# Additional check: if it ends with colon, it's likely a speaker name
|
| 356 |
+
if line_stripped.endswith(':'):
|
| 357 |
+
# This is the prompt appearing as a speaker - skip it
|
| 358 |
+
prompt_removed = True
|
| 359 |
+
continue
|
| 360 |
+
# Also remove if it's a speaker line pattern
|
| 361 |
+
elif is_speaker_line:
|
| 362 |
+
prompt_removed = True
|
| 363 |
+
continue
|
| 364 |
+
|
| 365 |
+
# If we've already removed the prompt, add the line
|
| 366 |
+
cleaned_lines.append(line)
|
| 367 |
+
|
| 368 |
+
generated_text = '\n'.join(cleaned_lines)
|
| 369 |
+
|
| 370 |
+
# If after removing prompt, first line is orphaned dialogue (no speaker), handle it
|
| 371 |
+
if generated_text.strip():
|
| 372 |
+
lines = generated_text.split('\n')
|
| 373 |
+
first_line = lines[0].strip() if lines else ''
|
| 374 |
+
# Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
|
| 375 |
+
if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
|
| 376 |
+
# Check if it's dialogue-like (starts with capital, has punctuation)
|
| 377 |
+
if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
|
| 378 |
+
# Just remove the orphaned first line, don't add a speaker
|
| 379 |
+
generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
|
| 380 |
|
| 381 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 382 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
|
|
|
| 407 |
'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
|
| 408 |
'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
|
| 409 |
'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
|
| 410 |
+
'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great',
|
| 411 |
+
'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'worthy', 'call', 'rod',
|
| 412 |
+
'respect', 'drunk', 'there', 'signior', 'gremio', 'compound', 'soft', 'unvish',
|
| 413 |
+
'know', 'edward'
|
| 414 |
]
|
| 415 |
for word in common_words_fix:
|
| 416 |
word_lower = word.lower()
|
|
|
|
| 460 |
|
| 461 |
# Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
|
| 462 |
# Handle cases where a word got split into multiple parts
|
| 463 |
+
multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great', 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'call', 'rod', 'respect', 'drunk', 'signior', 'gremio', 'compound', 'soft', 'unvish', 'know', 'edward', 'man', 'men']
|
| 464 |
for word in multi_split_words:
|
| 465 |
word_lower = word.lower()
|
| 466 |
# Create pattern for word split into individual letters with spaces
|
|
|
|
| 675 |
|
| 676 |
generated_text = '\n'.join(normalized_lines)
|
| 677 |
|
| 678 |
+
# Fix 0b: Remove prompt again after normalization (in case it was normalized to all caps)
|
| 679 |
+
# This handles cases where "First Citizen:" was normalized to "FIRST CITIZEN:"
|
| 680 |
+
prompt_normalized = prompt.strip().replace(':', '').strip().upper()
|
| 681 |
+
lines = generated_text.split('\n')
|
| 682 |
+
cleaned_lines_after_norm = []
|
| 683 |
+
prompt_removed_after_norm = False
|
| 684 |
+
|
| 685 |
+
for i, line in enumerate(lines):
|
| 686 |
+
line_stripped = line.strip()
|
| 687 |
+
|
| 688 |
+
# Skip empty lines at the start
|
| 689 |
+
if not line_stripped and not cleaned_lines_after_norm:
|
| 690 |
+
continue
|
| 691 |
+
|
| 692 |
+
# Normalize line for comparison (remove colon, case-insensitive)
|
| 693 |
+
line_normalized = line_stripped.replace(':', '').strip().upper()
|
| 694 |
+
|
| 695 |
+
# Check if this line matches the prompt (case-insensitive, allowing for colon)
|
| 696 |
+
# Also check if it's a speaker name format (all caps after normalization)
|
| 697 |
+
is_speaker_line = re.match(r'^([A-Z][A-Z\s]+?):\s*$', line_stripped)
|
| 698 |
+
|
| 699 |
+
if is_speaker_line and line_normalized == prompt_normalized and not prompt_removed_after_norm:
|
| 700 |
+
# This is the prompt appearing as a speaker - skip it
|
| 701 |
+
prompt_removed_after_norm = True
|
| 702 |
+
continue
|
| 703 |
+
|
| 704 |
+
# If we've already removed the prompt, add the line
|
| 705 |
+
cleaned_lines_after_norm.append(line)
|
| 706 |
+
|
| 707 |
+
generated_text = '\n'.join(cleaned_lines_after_norm)
|
| 708 |
+
|
| 709 |
# Fix 3c: Fix dialogue that was incorrectly formatted as speaker names
|
| 710 |
# Pattern: All caps lines ending with colon that are actually dialogue (not speakers)
|
| 711 |
# Examples: "HENCE ARE YOUR HONOUR TO ENTER:" -> "HENCE ARE YOUR HONOUR TO ENTER."
|