shwethd committed on
Commit
a845bcb
·
verified ·
1 Parent(s): 3bc9884

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -6
app.py CHANGED
@@ -367,7 +367,10 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
367
  'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
368
  'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
369
  'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
370
- 'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she'
 
 
 
371
  ]
372
  for word in common_words_fix:
373
  word_lower = word.lower()
@@ -415,15 +418,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
415
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
416
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
417
 
418
- # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "T h is" -> "This")
419
  # Handle cases where a word got split into multiple parts
420
- multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon']
421
  for word in multi_split_words:
422
  word_lower = word.lower()
423
  # Create pattern for word split into individual letters with spaces
424
- # e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at"
425
  if len(word_lower) > 2:
426
- # Pattern: letter space letter space ... (all letters of the word)
427
  letters = list(word_lower)
428
  pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
429
  pattern_parts.append(re.escape(letters[-1]))
@@ -437,6 +440,21 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
437
  # Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
438
  pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
439
  generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
  # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
442
  # Common patterns where words got merged incorrectly
@@ -485,7 +503,45 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
485
  pattern = r"(" + re.escape(contraction) + r")([a-z])"
486
  generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
487
 
488
- # Fix 3: Add space before character names (all caps words) and fix missing punctuation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
489
  # First, fix cases like "Barn MENENIUS:" -> "Barn. MENENIUS:" or "Barn, MENENIUS:"
490
  # Pattern: lowercase word followed immediately by all-caps speaker name
491
  generated_text = re.sub(r'([a-z]+)([A-Z]{2,}):', r'\1. \2:', generated_text)
 
367
  'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
368
  'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
369
  'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
370
+ 'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she', 'honour', 'honor',
371
+ 'youth', 'ports', 'impans', 'swear', 'gods', 'please', 'standing', 'tybalt',
372
+ 'sworn', 'where', 'would', 'give', 'seize', 'before', 'repair', 'lest', 'speak',
373
+ 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art'
374
  ]
375
  for word in common_words_fix:
376
  word_lower = word.lower()
 
418
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
419
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
420
 
421
+ # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "y our" -> "your", "T h is" -> "This")
422
  # Handle cases where a word got split into multiple parts
423
+ multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon', 'honour', 'honor', 'youth', 'ports', 'impans', 'woman', 'gentleman', 'deed', 'better', 'virtuous', 'done', 'broke', 'art']
424
  for word in multi_split_words:
425
  word_lower = word.lower()
426
  # Create pattern for word split into individual letters with spaces
427
+ # e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at" or "y our"
428
  if len(word_lower) > 2:
429
+ # Pattern 1: letter space letter space ... (all letters of the word split individually)
430
  letters = list(word_lower)
431
  pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
432
  pattern_parts.append(re.escape(letters[-1]))
 
440
  # Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
441
  pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
442
  generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
443
+
444
+ # Pattern 2: Handle two-part splits (e.g., "y our" -> "your", "h onour" -> "honour")
445
+ # Try all possible two-part splits
446
+ for split_pos in range(1, len(word_lower)):
447
+ first_part = word_lower[:split_pos]
448
+ second_part = word_lower[split_pos:]
449
+ # Pattern: "y our" -> "your"
450
+ pattern_2part = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
451
+ generated_text = re.sub(pattern_2part, word, generated_text, flags=re.IGNORECASE)
452
+ # Capitalized version: "Y our" -> "Your"
453
+ pattern_2part_cap = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
454
+ generated_text = re.sub(pattern_2part_cap, word.capitalize(), generated_text)
455
+ # All caps: "Y OUR" -> "YOUR"
456
+ pattern_2part_allcap = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
457
+ generated_text = re.sub(pattern_2part_allcap, word.upper(), generated_text)
458
 
459
  # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
460
  # Common patterns where words got merged incorrectly
 
503
  pattern = r"(" + re.escape(contraction) + r")([a-z])"
504
  generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
505
 
506
+ # Fix 3: Fix split speaker names (e.g., "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:")
507
+ # Pattern: All caps words separated by spaces ending with colon (likely split speaker name)
508
+ # First, try to merge split speaker names: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
509
+ # But be careful - some speaker names might legitimately have spaces (e.g., "FIRST CITIZEN:")
510
+ lines = generated_text.split('\n')
511
+ fixed_lines = []
512
+ for line in lines:
513
+ line_stripped = line.strip()
514
+ # Check if line looks like a split speaker name (all caps, has spaces, ends with colon)
515
+ # Pattern 1: Multiple all-caps words with spaces: "ALL ANC A:" or "GENTLEM AN:"
516
+ if re.match(r'^([A-Z]+\s+[A-Z]+\s*[A-Z]*):\s*$', line_stripped):
517
+ # Check if it's a known multi-word speaker name (keep those)
518
+ known_multi_word_speakers = ['FIRST CITIZEN', 'SECOND CITIZEN', 'THIRD CITIZEN',
519
+ 'FIRST GENTLEMAN', 'SECOND GENTLEMAN', 'THIRD GENTLEMAN',
520
+ 'FIRST SERVANT', 'SECOND SERVANT', 'LADY MACBETH',
521
+ 'KING HENRY', 'PRINCE HAMLET', 'DUKE VINCENTIO']
522
+ is_known = False
523
+ for known in known_multi_word_speakers:
524
+ if known in line_stripped.upper():
525
+ is_known = True
526
+ break
527
+
528
+ if not is_known:
529
+ # Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
530
+ # Remove spaces between all-caps words before colon
531
+ merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
532
+ # Only use merged if it makes sense (not too long, looks like a word)
533
+ if len(merged) < 30: # Reasonable speaker name length
534
+ fixed_lines.append(merged)
535
+ else:
536
+ fixed_lines.append(line)
537
+ else:
538
+ # Keep known multi-word speaker names as is
539
+ fixed_lines.append(line)
540
+ else:
541
+ fixed_lines.append(line)
542
+ generated_text = '\n'.join(fixed_lines)
543
+
544
+ # Fix 3b: Add space before character names (all caps words) and fix missing punctuation
545
  # First, fix cases like "Barn MENENIUS:" -> "Barn. MENENIUS:" or "Barn, MENENIUS:"
546
  # Pattern: lowercase word followed immediately by all-caps speaker name
547
  generated_text = re.sub(r'([a-z]+)([A-Z]{2,}):', r'\1. \2:', generated_text)