Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -361,17 +361,36 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 361 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
|
| 362 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
| 363 |
|
| 364 |
-
# Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This"
|
| 365 |
# Remove spaces in the middle of common words
|
| 366 |
-
common_words_fix = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
for word in common_words_fix:
|
| 368 |
-
# Pattern: word split incorrectly (e.g., "furt her", "T his")
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
|
| 376 |
# Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
|
| 377 |
# Add space before common words that might have been merged
|
|
@@ -387,6 +406,27 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 387 |
# Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
|
| 388 |
generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
|
| 389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
# Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
|
| 391 |
# Add space after contractions before lowercase words
|
| 392 |
contractions = ["'ll", "'ve", "'re", "'d", "'t", "'s", "'m"]
|
|
|
|
| 361 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
|
| 362 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
| 363 |
|
| 364 |
+
# Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your"
|
| 365 |
# Remove spaces in the middle of common words
|
| 366 |
+
common_words_fix = [
|
| 367 |
+
'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
|
| 368 |
+
'man', 'men', 'woman', 'women', 'padua', 'padua', 'content', 'gentle', 'gently',
|
| 369 |
+
'house', 'neck', 'car', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
|
| 370 |
+
'well', 'newly', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
|
| 371 |
+
'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
|
| 372 |
+
'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
|
| 373 |
+
'you', 'me', 'my', 'his', 'hers', 'its', 'our', 'ours', 'yours', 'theirs',
|
| 374 |
+
'into', 'onto', 'upon', 'within', 'without', 'through', 'though', 'although',
|
| 375 |
+
'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
|
| 376 |
+
'after', 'while', 'until', 'since', 'because', 'though', 'although'
|
| 377 |
+
]
|
| 378 |
for word in common_words_fix:
|
| 379 |
+
# Pattern: word split incorrectly (e.g., "furt her", "T his", "y our", "a m an", "Padu a")
|
| 380 |
+
# Handle split at any position
|
| 381 |
+
word_lower = word.lower()
|
| 382 |
+
for i in range(1, len(word_lower)):
|
| 383 |
+
# Split at position i: first part + space + second part
|
| 384 |
+
first_part = word_lower[:i]
|
| 385 |
+
second_part = word_lower[i:]
|
| 386 |
+
# Pattern: word split at this position (case insensitive)
|
| 387 |
+
pattern = r'\b' + first_part + r'\s+' + second_part + r'\b'
|
| 388 |
+
generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
|
| 389 |
+
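# Also handle a capitalized first part (e.g., "Padu a" -> "Padua"). NOTE(review): the case-insensitive substitution just above replaces with the lowercase word, so it appears to rewrite such text to "padua" before this pattern runs - confirm the intended order.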
|
| 390 |
+
pattern_cap = r'\b' + first_part.capitalize() + r'\s+' + second_part + r'\b'
|
| 391 |
+
generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
|
| 392 |
+
pattern_all_cap = r'\b' + first_part.upper() + r'\s+' + second_part.upper() + r'\b'
|
| 393 |
+
generated_text = re.sub(pattern_all_cap, word.upper(), generated_text)
|
| 394 |
|
| 395 |
# Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
|
| 396 |
# Add space before common words that might have been merged
|
|
|
|
| 406 |
# Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
|
| 407 |
generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
|
| 408 |
|
| 409 |
+
# Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to")
|
| 410 |
+
# Common patterns where words got merged incorrectly
|
| 411 |
+
# Pattern: pronoun + "t" (likely "to" got merged)
|
| 412 |
+
merged_fixes = [
|
| 413 |
+
(r'\bhimt\s+', 'him to '), # "himt me" -> "him to me"
|
| 414 |
+
(r'\bhert\s+', 'her to '), # "hert him" -> "her to him"
|
| 415 |
+
(r'\bthemt\s+', 'them to '), # "themt us" -> "them to us"
|
| 416 |
+
(r'\byout\s+', 'you to '), # "yout me" -> "you to me"
|
| 417 |
+
(r'\bhimt([,.;:!?])', r'him to\1'), # "himt," -> "him to,"
|
| 418 |
+
(r'\bhert([,.;:!?])', r'her to\1'),
|
| 419 |
+
(r'\bthemt([,.;:!?])', r'them to\1'),
|
| 420 |
+
(r'\byout([,.;:!?])', r'you to\1'),
|
| 421 |
+
]
|
| 422 |
+
for pattern, replacement in merged_fixes:
|
| 423 |
+
generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
|
| 424 |
+
|
| 425 |
+
# Fix 2f: Split the merged token "contenton" into "content on".
|
| 426 |
+
# Correctly spaced "content on" does not match the pattern and is left unchanged.
|
| 427 |
+
# Matching is case-insensitive; the replacement is always the lowercase "content on".
|
| 428 |
+
generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
|
| 429 |
+
|
| 430 |
# Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
|
| 431 |
# Add space after contractions before lowercase words
|
| 432 |
contractions = ["'ll", "'ve", "'re", "'d", "'t", "'s", "'m"]
|