shwethd committed on
Commit
c673f73
·
verified ·
1 Parent(s): 5becd16

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -83
app.py CHANGED
@@ -369,13 +369,19 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
369
 
370
  # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
371
  # Keep removing orphaned dialogue at the start until we find a speaker or valid content
372
- while generated_text.strip():
373
- lines = generated_text.split('\n')
374
- first_line = lines[0].strip() if lines else ''
 
 
 
 
 
 
375
 
376
  if not first_line:
377
  # Remove empty first line
378
- generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
379
  continue
380
 
381
  # Check if first line is a speaker name
@@ -389,76 +395,51 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
389
  # Check if it's orphaned dialogue (starts with capital, has punctuation, but no speaker)
390
  if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
391
  # Remove the orphaned first line
392
- generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
393
  else:
394
  # Not clearly orphaned dialogue, stop removing
395
  break
396
 
 
 
397
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
398
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
399
  # Also fix single letter + capital word (e.g., "AOr" -> "A Or")
400
  generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
401
 
402
  # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
403
- # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
 
404
  common_words_fix = [
405
  'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
406
- 'man', 'men', 'woman', 'women', 'padua', 'content', 'gentle', 'gently',
407
- 'house', 'neck', 'car', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
408
- 'well', 'newly', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
409
  'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
410
  'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
411
- 'you', 'me', 'my', 'his', 'hers', 'its', 'our', 'ours', 'yours', 'theirs',
412
- 'into', 'onto', 'upon', 'within', 'without', 'through', 'though', 'although',
413
- 'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
414
- 'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
415
- 'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
416
- 'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
417
- 'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
418
- 'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
419
- 'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
420
- 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art',
421
- 'again', 'government', 'honour', 'light', 'stands', 'fly', 'mighty', 'forth',
422
- 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'there',
423
- 'confess', 'suffer', 'part', 'coronured', 'eyuls', 'unto', 'until', 'grey',
424
- 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt',
425
- 'not', 'most', 'worthy', 'should', 'bed', 'than', 'half', 'chaste', 'sight',
426
- 'that', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great',
427
- 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'worthy', 'call', 'rod',
428
- 'respect', 'drunk', 'there', 'signior', 'gremio', 'compound', 'soft', 'unvish',
429
- 'know', 'edward'
430
  ]
 
431
  for word in common_words_fix:
432
  word_lower = word.lower()
433
- # Try all possible split positions
434
- for i in range(1, len(word_lower)):
 
 
 
 
 
 
 
 
 
435
  first_part = word_lower[:i]
436
  second_part = word_lower[i:]
437
 
438
- # Pattern 1: lowercase split (e.g., "furt her" -> "further", "th at" -> "that")
439
- # Use word boundaries but also allow punctuation/whitespace around
440
- pattern1 = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
441
- generated_text = re.sub(pattern1, word, generated_text, flags=re.IGNORECASE)
442
-
443
- # Pattern 2: Capital first letter (e.g., "Th at" -> "That")
444
- pattern2 = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
445
- generated_text = re.sub(pattern2, word.capitalize(), generated_text)
446
-
447
- # Pattern 3: All caps (e.g., "TH AT" -> "THAT")
448
- pattern3 = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
449
- generated_text = re.sub(pattern3, word.upper(), generated_text)
450
-
451
- # Pattern 4: Mixed case - first letter capitalized (e.g., "Th at" -> "That")
452
- if len(first_part) > 0:
453
- pattern4 = r'\b' + re.escape(first_part[0].upper() + first_part[1:]) + r'\s+' + re.escape(second_part) + r'\b'
454
- generated_text = re.sub(pattern4, word.capitalize(), generated_text, flags=re.IGNORECASE)
455
-
456
- # Pattern 5: Handle multiple splits in one word (e.g., "c o u n t" -> "count")
457
- # This is a special case for words that got split multiple times
458
- if len(word_lower) > 4: # Only for longer words
459
- # Try to find pattern like "c o u n t" or "y o u r"
460
- # This is more complex, so we'll handle it separately
461
- pass
462
 
463
  # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
464
  # Add space before common words that might have been merged
@@ -474,43 +455,29 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
474
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
475
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
476
 
477
- # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
478
- # Handle cases where a word got split into multiple parts
479
- multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art', 'again', 'government', 'light', 'stands', 'fly', 'mighty', 'forth', 'turn', 'highness', 'morning', 'hence', 'enter', 'should', 'rue', 'confess', 'suffer', 'part', 'unto', 'until', 'grey', 'lady', 'evils', 'eyes', 'feat', 'worn', 'sister', 'thus', 'apparent', 'blunt', 'most', 'worthy', 'bed', 'than', 'half', 'chaste', 'sight', 'just', 'those', 'passes', 'stuffed', 'calm', 'then', 'little', 'great', 'secrets', 'full', 'pray', 'duke', 'songs', 'soldier', 'call', 'rod', 'respect', 'drunk', 'signior', 'gremio', 'compound', 'soft', 'unvish', 'know', 'edward', 'man', 'men']
 
 
480
  for word in multi_split_words:
481
  word_lower = word.lower()
482
- # Create pattern for word split into individual letters with spaces
483
- # e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at" or "y our"
484
  if len(word_lower) > 2:
485
- # Pattern 1: letter space letter space ... (all letters of the word split individually)
486
- letters = list(word_lower)
487
- pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
488
- pattern_parts.append(re.escape(letters[-1]))
489
- pattern = r'\b' + ''.join(pattern_parts) + r'\b'
490
- generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
491
- # Also handle with some capitalization (e.g., "T h is" -> "This", "Wh at" -> "What")
492
- pattern_cap = r'\b' + re.escape(letters[0].upper()) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[1:-1]]) + re.escape(letters[-1]) + r'\b'
493
- generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
494
- # Handle mixed case like "Wh at" -> "What"
495
- if len(letters) > 2:
496
- # Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
497
- pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
498
- generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
499
 
500
- # Pattern 2: Handle two-part splits (e.g., "y our" -> "your", "h onour" -> "honour")
501
- # Try all possible two-part splits
502
- for split_pos in range(1, len(word_lower)):
503
  first_part = word_lower[:split_pos]
504
  second_part = word_lower[split_pos:]
505
- # Pattern: "y our" -> "your"
506
  pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
507
  generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
508
- # Capitalized version: "Y our" -> "Your"
509
- pattern_2part_cap = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
510
- generated_text = re.sub(pattern_2part_cap, word.capitalize(), generated_text)
511
- # All caps: "Y OUR" -> "YOUR"
512
- pattern_2part_allcap = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
513
- generated_text = re.sub(pattern_2part_allcap, word.upper(), generated_text)
514
 
515
  # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
516
  # Common patterns where words got merged incorrectly
 
369
 
370
  # If after removing prompt, first line is orphaned dialogue (no speaker), handle it
371
  # Keep removing orphaned dialogue at the start until we find a speaker or valid content
372
+ # Limit to max 10 iterations to avoid infinite loops
373
+ lines = generated_text.split('\n')
374
+ start_idx = 0
375
+ max_iterations = 10
376
+ iteration = 0
377
+
378
+ while start_idx < len(lines) and iteration < max_iterations:
379
+ iteration += 1
380
+ first_line = lines[start_idx].strip() if start_idx < len(lines) else ''
381
 
382
  if not first_line:
383
  # Remove empty first line
384
+ start_idx += 1
385
  continue
386
 
387
  # Check if first line is a speaker name
 
395
  # Check if it's orphaned dialogue (starts with capital, has punctuation, but no speaker)
396
  if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
397
  # Remove the orphaned first line
398
+ start_idx += 1
399
  else:
400
  # Not clearly orphaned dialogue, stop removing
401
  break
402
 
403
+ generated_text = '\n'.join(lines[start_idx:])
404
+
405
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
406
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
407
  # Also fix single letter + capital word (e.g., "AOr" -> "A Or")
408
  generated_text = re.sub(r'\b([A-Z])([A-Z][a-z]+)', r'\1 \2', generated_text)
409
 
410
  # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
411
+ # OPTIMIZED: Only process most common split words to reduce computation
412
+ # Focus on words that are most likely to be split incorrectly
413
  common_words_fix = [
414
  'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
415
+ 'man', 'men', 'woman', 'women', 'content', 'gentle', 'gently',
416
+ 'house', 'made', 'lost', 'rough', 'see', 'might', 'any', 'one',
417
+ 'well', 'too', 'him', 'her', 'them', 'they', 'the', 'and', 'but',
418
  'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will',
419
  'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we',
420
+ 'you', 'me', 'my', 'his', 'into', 'onto', 'upon', 'within', 'without',
421
+ 'together', 'honour', 'honor', 'common', 'complain', 'again', 'apparent'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  ]
423
+ # Build each split-repair pattern inline and apply it immediately (nothing is pre-compiled; only a few split positions are tried per word)
424
  for word in common_words_fix:
425
  word_lower = word.lower()
426
+ # Only try the likeliest split positions: the middle for words longer than 2 letters, plus the quarter and three-quarter points for words longer than 4 letters
427
+ split_positions = []
428
+ if len(word_lower) > 2:
429
+ split_positions = [len(word_lower) // 2] # Most common: middle split
430
+ if len(word_lower) > 4:
431
+ split_positions.append(len(word_lower) // 4)
432
+ split_positions.append(3 * len(word_lower) // 4)
433
+
434
+ for i in split_positions:
435
+ if i < 1 or i >= len(word_lower):
436
+ continue
437
  first_part = word_lower[:i]
438
  second_part = word_lower[i:]
439
 
440
+ # Combined pattern with case-insensitive flag (more efficient)
441
+ pattern = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
442
+ generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
  # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
445
  # Add space before common words that might have been merged
 
455
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
456
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
457
 
458
+ # Fix 1c: Fix multiple splits in one word - OPTIMIZED: Only handle most common cases
459
+ # Focus on very common words that are most likely to be split
460
+ multi_split_words = ['count', 'your', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were',
461
+ 'been', 'have', 'has', 'had', 'will', 'this', 'that', 'there', 'where', 'here',
462
+ 'their', 'what', 'common', 'complain', 'honour', 'honor', 'again', 'apparent']
463
  for word in multi_split_words:
464
  word_lower = word.lower()
 
 
465
  if len(word_lower) > 2:
466
+ # Pattern 1: letter space letter space ... (all letters split individually) - only for short words
467
+ if len(word_lower) <= 5:
468
+ letters = list(word_lower)
469
+ pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
470
+ pattern_parts.append(re.escape(letters[-1]))
471
+ pattern = r'\b' + ''.join(pattern_parts) + r'\b'
472
+ generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
473
 
474
+ # Pattern 2: Handle two-part splits - only try most common split (middle)
475
+ split_pos = len(word_lower) // 2
476
+ if split_pos > 0 and split_pos < len(word_lower):
477
  first_part = word_lower[:split_pos]
478
  second_part = word_lower[split_pos:]
 
479
  pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
480
  generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
 
 
 
 
 
 
481
 
482
  # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
483
  # Common patterns where words got merged incorrectly