shwethd committed on
Commit
82f907e
·
verified ·
1 Parent(s): c037b52

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -9
app.py CHANGED
@@ -361,17 +361,36 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
361
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
362
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
363
 
364
- # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This"
365
  # Remove spaces in the middle of common words
366
- common_words_fix = ['further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our', 'your']
 
 
 
 
 
 
 
 
 
 
 
367
  for word in common_words_fix:
368
- # Pattern: word split incorrectly (e.g., "furt her", "T his")
369
- pattern = r'\b' + word[0] + r'\s+' + word[1:] + r'\b'
370
- generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
371
- # Also handle reversed (less common)
372
- if len(word) > 3:
373
- pattern2 = r'\b' + word[:-1] + r'\s+' + word[-1] + r'\b'
374
- generated_text = re.sub(pattern2, word, generated_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
375
 
376
  # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
377
  # Add space before common words that might have been merged
@@ -387,6 +406,27 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
387
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
388
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
389
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
391
  # Add space after contractions before lowercase words
392
  contractions = ["'ll", "'ve", "'re", "'d", "'t", "'s", "'m"]
 
361
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
362
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
363
 
364
+ # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your"
365
  # Remove spaces in the middle of common words
366
+ common_words_fix = [
367
+ 'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
368
+ 'man', 'men', 'woman', 'women', 'padua', 'padua', 'content', 'gentle', 'gently',
369
+ 'house', 'neck', 'car', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
370
+ 'well', 'newly', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
371
+ 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
372
+ 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
373
+ 'you', 'me', 'my', 'his', 'hers', 'its', 'our', 'ours', 'yours', 'theirs',
374
+ 'into', 'onto', 'upon', 'within', 'without', 'through', 'though', 'although',
375
+ 'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
376
+ 'after', 'while', 'until', 'since', 'because', 'though', 'although'
377
+ ]
378
  for word in common_words_fix:
379
+ # Pattern: word split incorrectly (e.g., "furt her", "T his", "y our", "a m an", "Padu a")
380
+ # Handle split at any position
381
+ word_lower = word.lower()
382
+ for i in range(1, len(word_lower)):
383
+ # Split at position i: first part + space + second part
384
+ first_part = word_lower[:i]
385
+ second_part = word_lower[i:]
386
+ # Pattern: word split at this position (case insensitive)
387
+ pattern = r'\b' + first_part + r'\s+' + second_part + r'\b'
388
+ generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
389
+ # Also handle with capital letters (e.g., "Padu a" -> "Padua")
390
+ pattern_cap = r'\b' + first_part.capitalize() + r'\s+' + second_part + r'\b'
391
+ generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
392
+ pattern_all_cap = r'\b' + first_part.upper() + r'\s+' + second_part.upper() + r'\b'
393
+ generated_text = re.sub(pattern_all_cap, word.upper(), generated_text)
394
 
395
  # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
396
  # Add space before common words that might have been merged
 
406
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
407
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
408
 
409
+ # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to")
410
+ # Common patterns where words got merged incorrectly
411
+ # Pattern: pronoun + "t" (likely "to" got merged)
412
+ merged_fixes = [
413
+ (r'\bhimt\s+', 'him to '), # "himt me" -> "him to me"
414
+ (r'\bhert\s+', 'her to '), # "hert him" -> "her to him"
415
+ (r'\bthemt\s+', 'them to '), # "themt us" -> "them to us"
416
+ (r'\byout\s+', 'you to '), # "yout me" -> "you to me"
417
+ (r'\bhimt([,.;:!?])', r'him to\1'), # "himt," -> "him to,"
418
+ (r'\bhert([,.;:!?])', r'her to\1'),
419
+ (r'\bthemt([,.;:!?])', r'them to\1'),
420
+ (r'\byout([,.;:!?])', r'you to\1'),
421
+ ]
422
+ for pattern, replacement in merged_fixes:
423
+ generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
424
+
425
+ # Fix 2f: Split the merged token "contenton" into "content on".
426
+ # Correctly spaced "content on" does not match the pattern below
427
+ # and is therefore left unchanged.
428
+ generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
429
+
430
  # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
431
  # Add space after contractions before lowercase words
432
  contractions = ["'ll", "'ve", "'re", "'d", "'t", "'s", "'m"]