shwethd committed on
Commit
3bc9884
·
verified ·
1 Parent(s): 1e393db

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -38
app.py CHANGED
@@ -341,28 +341,19 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
341
  lines = generated_text.split('\n')
342
  if lines and lines[0].strip():
343
  first_line = lines[0].strip()
344
- # If first line is not a speaker name and looks like dialogue, add a speaker
 
345
  if not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
346
  # Check if it's dialogue-like (starts with capital, has punctuation)
347
  if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
348
- # Add a generic speaker name based on the prompt context
349
- # For story prompts like "Romeo and Juliet", use a character from the prompt
350
- prompt_words = [w.capitalize() for w in prompt_lower.split() if len(w) > 2]
351
- if len(prompt_words) >= 2:
352
- # Use first significant word as speaker (e.g., "Romeo" from "Romeo and Juliet")
353
- speaker_name = prompt_words[0].upper()
354
- else:
355
- # Generic speaker
356
- speaker_name = "NARRATOR"
357
-
358
- # Add speaker before the dialogue
359
- generated_text = f"{speaker_name}:\n{first_line}\n" + '\n'.join(lines[1:]) if len(lines) > 1 else f"{speaker_name}:\n{first_line}"
360
 
361
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
362
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
363
 
364
- # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "Th at" -> "That"
365
- # Remove spaces in the middle of common words
366
  common_words_fix = [
367
  'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
368
  'man', 'men', 'woman', 'women', 'padua', 'content', 'gentle', 'gently',
@@ -375,33 +366,40 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
375
  'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
376
  'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
377
  'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
378
- 'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold'
 
379
  ]
380
  for word in common_words_fix:
381
- # Pattern: word split incorrectly (e.g., "furt her", "T his", "y our", "a m an", "Padu a", "Th at")
382
- # Handle split at any position, including with capital letters
383
  word_lower = word.lower()
 
384
  for i in range(1, len(word_lower)):
385
- # Split at position i: first part + space + second part
386
  first_part = word_lower[:i]
387
  second_part = word_lower[i:]
388
 
389
- # Pattern 1: lowercase split (e.g., "furt her" -> "further")
390
- pattern = r'\b' + first_part + r'\s+' + second_part + r'\b'
391
- generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
 
392
 
393
- # Pattern 2: Capital letter split (e.g., "Th at" -> "That", "T his" -> "This")
394
- pattern_cap = r'\b' + first_part.capitalize() + r'\s+' + second_part + r'\b'
395
- generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
396
 
397
- # Pattern 3: All caps split (e.g., "TH AT" -> "THAT")
398
- pattern_all_cap = r'\b' + first_part.upper() + r'\s+' + second_part.upper() + r'\b'
399
- generated_text = re.sub(pattern_all_cap, word.upper(), generated_text)
400
 
401
- # Pattern 4: Mixed case with capital in first part (e.g., "Th at" -> "That")
402
  if len(first_part) > 0:
403
- pattern_mixed = r'\b' + first_part[0].upper() + first_part[1:] + r'\s+' + second_part + r'\b'
404
- generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
 
 
 
 
 
 
 
405
 
406
  # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
407
  # Add space before common words that might have been merged
@@ -417,6 +415,29 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
417
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
418
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
421
  # Common patterns where words got merged incorrectly
422
  merged_fixes = [
@@ -432,20 +453,28 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
432
  # Other merged patterns
433
  (r'\bincwold\b', 'in cold'), # "incwold" -> "in cold"
434
  (r'\bincold\b', 'in cold'), # "incold" -> "in cold"
435
- (r'\blikeled\b', 'liked'), # "likeled" -> "liked" (or could be "like led" but "liked" is more common)
436
  (r'\bh\s+on\s+our\b', 'honour'), # "h on our" -> "honour"
437
- (r'\bh\s+on\s+or\b', 'honor'), # "h on or" -> "honor" (American spelling)
 
 
 
 
 
 
 
 
 
 
 
438
  ]
439
  for pattern, replacement in merged_fixes:
440
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
441
 
442
  # Fix 2f: Fix "content on" - this is likely two separate words, but ensure proper spacing
443
- # "content on" should stay as "content on" (already correct)
444
- # But if it's "contenton" -> "content on"
445
  generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
446
 
447
- # Fix 2g: Fix "toget her" -> "together" (but be careful - "get her" is also valid)
448
- # Only fix if it's clearly "together" (context-dependent, but "toget her" is likely "together")
449
  generated_text = re.sub(r'\btoget\s+her\b', 'together', generated_text, flags=re.IGNORECASE)
450
 
451
  # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
@@ -456,7 +485,11 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
456
  pattern = r"(" + re.escape(contraction) + r")([a-z])"
457
  generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
458
 
459
- # Fix 3: Add space before character names (all caps words)
 
 
 
 
460
  generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
461
 
462
  # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
@@ -577,6 +610,63 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
577
  flags=re.MULTILINE
578
  )
579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  return generated_text
581
  except Exception as e:
582
  import traceback
 
341
  lines = generated_text.split('\n')
342
  if lines and lines[0].strip():
343
  first_line = lines[0].strip()
344
+ # If first line is not a speaker name and looks like dialogue, just remove it
345
+ # Don't add NARRATOR - let the model's natural flow continue
346
  if not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
347
  # Check if it's dialogue-like (starts with capital, has punctuation)
348
  if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
349
+ # Just remove the orphaned first line, don't add a speaker
350
+ generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
 
 
 
 
 
 
 
 
 
 
351
 
352
  # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
353
  generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
354
 
355
+ # Fix 1b: Fix spacing issues like "furt her" -> "further", "T his" -> "This", "y our" -> "your", "th at" -> "that"
356
+ # Remove spaces in the middle of common words - MORE AGGRESSIVE matching
357
  common_words_fix = [
358
  'further', 'this', 'that', 'there', 'where', 'here', 'their', 'your', 'our',
359
  'man', 'men', 'woman', 'women', 'padua', 'content', 'gentle', 'gently',
 
366
  'about', 'above', 'below', 'beside', 'between', 'among', 'during', 'before',
367
  'after', 'while', 'until', 'since', 'because', 'together', 'honour', 'honor',
368
  'already', 'perfect', 'soul', 'way', 'wounds', 'tears', 'raise', 'call',
369
+ 'citizens', 'senator', 'liked', 'cold', 'incold', 'incwold', 'son', 'count',
370
+ 'happen', 'happ', 'what', 'common', 'complain', 'upon', 'she'
371
  ]
372
  for word in common_words_fix:
 
 
373
  word_lower = word.lower()
374
+ # Try all possible split positions
375
  for i in range(1, len(word_lower)):
 
376
  first_part = word_lower[:i]
377
  second_part = word_lower[i:]
378
 
379
+ # Pattern 1: lowercase split (e.g., "furt her" -> "further", "th at" -> "that")
380
+ # Use word boundaries but also allow punctuation/whitespace around
381
+ pattern1 = r'\b' + re.escape(first_part) + r'\s+' + re.escape(second_part) + r'\b'
382
+ generated_text = re.sub(pattern1, word, generated_text, flags=re.IGNORECASE)
383
 
384
+ # Pattern 2: Capital first letter (e.g., "Th at" -> "That")
385
+ pattern2 = r'\b' + re.escape(first_part.capitalize()) + r'\s+' + re.escape(second_part) + r'\b'
386
+ generated_text = re.sub(pattern2, word.capitalize(), generated_text)
387
 
388
+ # Pattern 3: All caps (e.g., "TH AT" -> "THAT")
389
+ pattern3 = r'\b' + re.escape(first_part.upper()) + r'\s+' + re.escape(second_part.upper()) + r'\b'
390
+ generated_text = re.sub(pattern3, word.upper(), generated_text)
391
 
392
+ # Pattern 4: Mixed case - first letter capitalized (e.g., "Th at" -> "That")
393
  if len(first_part) > 0:
394
+ pattern4 = r'\b' + re.escape(first_part[0].upper() + first_part[1:]) + r'\s+' + re.escape(second_part) + r'\b'
395
+ generated_text = re.sub(pattern4, word.capitalize(), generated_text, flags=re.IGNORECASE)
396
+
397
+ # Pattern 5: Handle multiple splits in one word (e.g., "c o u n t" -> "count")
398
+ # This is a special case for words that got split multiple times
399
+ if len(word_lower) > 4: # Only for longer words
400
+ # Try to find pattern like "c o u n t" or "y o u r"
401
+ # This is more complex, so we'll handle it separately
402
+ pass
403
 
404
  # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
405
  # Add space before common words that might have been merged
 
415
  # Fix 2d: Fix spacing after commas (e.g., "What,bear" -> "What, bear")
416
  generated_text = re.sub(r',([a-zA-Z])', r', \1', generated_text)
417
 
418
+ # Fix 1c: Fix multiple splits in one word (e.g., "c o u n t" -> "count", "y o u r" -> "your", "T h is" -> "This")
419
+ # Handle cases where a word got split into multiple parts
420
+ multi_split_words = ['count', 'your', 'son', 'our', 'the', 'and', 'but', 'for', 'not', 'are', 'was', 'were', 'been', 'have', 'has', 'had', 'will', 'shall', 'would', 'could', 'should', 'be', 'is', 'it', 'he', 'she', 'we', 'they', 'you', 'me', 'my', 'his', 'her', 'them', 'him', 'this', 'that', 'there', 'where', 'here', 'their', 'what', 'common', 'complain', 'upon']
421
+ for word in multi_split_words:
422
+ word_lower = word.lower()
423
+ # Create pattern for word split into individual letters with spaces
424
+ # e.g., "c o u n t" or "y o u r" or "T h is" or "Wh at"
425
+ if len(word_lower) > 2:
426
+ # Pattern: letter space letter space ... (all letters of the word)
427
+ letters = list(word_lower)
428
+ pattern_parts = [re.escape(letter) + r'\s+' for letter in letters[:-1]]
429
+ pattern_parts.append(re.escape(letters[-1]))
430
+ pattern = r'\b' + ''.join(pattern_parts) + r'\b'
431
+ generated_text = re.sub(pattern, word, generated_text, flags=re.IGNORECASE)
432
+ # Also handle with some capitalization (e.g., "T h is" -> "This", "Wh at" -> "What")
433
+ pattern_cap = r'\b' + re.escape(letters[0].upper()) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[1:-1]]) + re.escape(letters[-1]) + r'\b'
434
+ generated_text = re.sub(pattern_cap, word.capitalize(), generated_text)
435
+ # Handle mixed case like "Wh at" -> "What"
436
+ if len(letters) > 2:
437
+ # Pattern for "Wh at" style (first two letters capitalized, rest lowercase)
438
+ pattern_mixed = r'\b' + re.escape(letters[0].upper()) + re.escape(letters[1]) + r'\s+' + ''.join([re.escape(letter) + r'\s+' for letter in letters[2:-1]]) + re.escape(letters[-1]) + r'\b'
439
+ generated_text = re.sub(pattern_mixed, word.capitalize(), generated_text, flags=re.IGNORECASE)
440
+
441
  # Fix 2e: Fix merged words that should be separate (e.g., "himt" -> "him to", "incwold" -> "in cold")
442
  # Common patterns where words got merged incorrectly
443
  merged_fixes = [
 
453
  # Other merged patterns
454
  (r'\bincwold\b', 'in cold'), # "incwold" -> "in cold"
455
  (r'\bincold\b', 'in cold'), # "incold" -> "in cold"
456
+ (r'\blikeled\b', 'liked'), # "likeled" -> "liked"
457
  (r'\bh\s+on\s+our\b', 'honour'), # "h on our" -> "honour"
458
+ (r'\bh\s+on\s+or\b', 'honor'), # "h on or" -> "honor"
459
+ (r'\bHapp\s+up\s+on\'t\b', "Happen upon't"), # "Happ up on't" -> "Happen upon't"
460
+ (r'\bhapp\s+up\s+on\'t\b', "happen upon't"),
461
+ # Fix "comm on" -> "common" (if not already fixed)
462
+ (r'\bcomm\s+on\b', 'common'),
463
+ (r'\bComm\s+on\b', 'Common'),
464
+ # Fix "compl a in" -> "complain" (multiple splits)
465
+ (r'\bcompl\s+a\s+in\b', 'complain'),
466
+ (r'\bCompl\s+a\s+in\b', 'Complain'),
467
+ # Fix "As s he" -> "As she"
468
+ (r'\bAs\s+s\s+he\b', 'As she'),
469
+ (r'\bas\s+s\s+he\b', 'as she'),
470
  ]
471
  for pattern, replacement in merged_fixes:
472
  generated_text = re.sub(pattern, replacement, generated_text, flags=re.IGNORECASE)
473
 
474
  # Fix 2f: Fix "content on" - this is likely two separate words, but ensure proper spacing
 
 
475
  generated_text = re.sub(r'\bcontenton\b', 'content on', generated_text, flags=re.IGNORECASE)
476
 
477
+ # Fix 2g: Fix "toget her" -> "together"
 
478
  generated_text = re.sub(r'\btoget\s+her\b', 'together', generated_text, flags=re.IGNORECASE)
479
 
480
  # Fix 2b: Fix contractions that got merged (e.g., "You'llbe" -> "You'll be")
 
485
  pattern = r"(" + re.escape(contraction) + r")([a-z])"
486
  generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
487
 
488
+ # Fix 3: Add space before character names (all caps words) and fix missing punctuation
489
+ # First, fix cases like "BarnMENENIUS:" -> "Barn. MENENIUS:" (a lowercase word fused directly onto an all-caps speaker name gets ". " inserted)
490
+ # Pattern: lowercase word followed immediately by all-caps speaker name
491
+ generated_text = re.sub(r'([a-z]+)([A-Z]{2,}):', r'\1. \2:', generated_text)
492
+ # Then add space before character names
493
  generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
494
 
495
  # Fix 3b: Normalize speaker names (e.g., "Romeo and juliet" -> "ROMEO AND JULIET:")
 
610
  flags=re.MULTILINE
611
  )
612
 
613
+ # Fix 8: Handle incomplete termination - remove incomplete words/sentences at the end
614
+ # This happens when the model hits the token limit mid-generation
615
+ if generated_text.strip():
616
+ # Remove incomplete word at the end (word that doesn't end with punctuation or space)
617
+ # Pattern: ends with a word that has no trailing punctuation/space
618
+ # But keep if it ends with proper punctuation (. ! ? , ; :)
619
+ lines = generated_text.split('\n')
620
+ if lines:
621
+ last_line = lines[-1].strip()
622
+
623
+ # If last line doesn't end with punctuation and is not a speaker name
624
+ if last_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', last_line):
625
+ # Check if it ends with incomplete word (no punctuation, not a complete sentence)
626
+ # Remove if it ends with a word that looks incomplete
627
+ # Pattern: ends with word that has no punctuation
628
+ if not re.search(r'[.!?,;:]$', last_line):
629
+ # Check if the last "word" is very short (likely incomplete)
630
+ # Or if it's a single character/letter (likely cut off)
631
+ words = last_line.split()
632
+ if words:
633
+ last_word = words[-1]
634
+ # If last word is very short (1-2 chars) and not punctuation, likely incomplete
635
+ if len(last_word) <= 2 and last_word.isalpha():
636
+ # Remove the incomplete last word
637
+ lines[-1] = ' '.join(words[:-1]) if len(words) > 1 else ''
638
+ # If last word doesn't end with punctuation and line is short, might be incomplete
639
+ elif len(last_line) < 20 and not last_word.endswith(('.', '!', '?', ',', ';', ':')):
640
+ # Check if removing last word makes sense
641
+ # Only remove if it's clearly incomplete (very short word)
642
+ if len(last_word) < 4:
643
+ lines[-1] = ' '.join(words[:-1]) if len(words) > 1 else ''
644
+
645
+ # If after processing, last line is empty or just whitespace, remove it
646
+ if not lines[-1].strip():
647
+ lines = lines[:-1]
648
+
649
+ # Reconstruct text
650
+ generated_text = '\n'.join(lines)
651
+
652
+ # Final check: for any non-empty text (no test is made here for trailing punctuation or a speaker line),
653
+ # try to find the last complete sentence
654
+ if generated_text.strip():
655
+ # Find the last complete sentence (ends with . ! ?)
656
+ # Split by sentences
657
+ sentences = re.split(r'([.!?]+)', generated_text)
658
+ if len(sentences) > 1:
659
+ # Reconstruct, keeping only complete sentences
660
+ complete_text = ''
661
+ for i in range(0, len(sentences) - 1, 2):
662
+ if i + 1 < len(sentences):
663
+ complete_text += sentences[i] + sentences[i + 1]
664
+ # If we have complete sentences, use them; otherwise keep original
665
+ if complete_text.strip():
666
+ # But check that we did not remove too much (accept only if more than 30% of the text is kept)
667
+ if len(complete_text.strip()) > len(generated_text.strip()) * 0.3:
668
+ generated_text = complete_text.strip()
669
+
670
  return generated_text
671
  except Exception as e:
672
  import traceback