Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -320,34 +320,41 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 320 |
import re
|
| 321 |
|
| 322 |
# Fix 0: Remove the prompt from the beginning if it appears as a speaker name
|
| 323 |
-
# This handles cases where user enters "
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
#
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
# Check if the first line after removal is orphaned dialogue (no speaker)
|
| 341 |
lines = generated_text.split('\n')
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
#
|
| 346 |
-
if
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 353 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
|
@@ -511,6 +518,15 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 511 |
# Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
|
| 512 |
(r'\bTH\s+AN\b', 'THAN'),
|
| 513 |
(r'\bth\s+an\b', 'than'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
# Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
|
| 515 |
# Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
|
| 516 |
# Fix duplicate words: "if it be it possible" -> "if it be possible"
|
|
@@ -560,9 +576,16 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50, top_p=0
|
|
| 560 |
break
|
| 561 |
|
| 562 |
if not is_known:
|
| 563 |
-
# Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:"
|
| 564 |
# Remove spaces between all-caps words before colon
|
| 565 |
merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 566 |
# Only use merged if it makes sense (not too long, looks like a word)
|
| 567 |
if len(merged) < 30: # Reasonable speaker name length
|
| 568 |
fixed_lines.append(merged)
|
|
|
|
| 320 |
import re
|
| 321 |
|
| 322 |
# Fix 0: Remove the prompt from the beginning if it appears as a speaker name
|
| 323 |
+
# This handles cases where user enters "First Citizen:" and model repeats it
|
| 324 |
+
prompt_stripped = prompt.strip().replace(':', '').strip()
|
| 325 |
+
lines = generated_text.split('\n')
|
| 326 |
+
|
| 327 |
+
if lines:
|
| 328 |
+
first_line = lines[0].strip()
|
| 329 |
+
# Normalize both prompt and first line for comparison (remove colons, case-insensitive)
|
| 330 |
+
first_line_normalized = first_line.replace(':', '').strip().upper()
|
| 331 |
+
prompt_normalized = prompt_stripped.upper()
|
| 332 |
+
|
| 333 |
+
# If first line matches the prompt (case-insensitive, allowing for colon)
|
| 334 |
+
if first_line_normalized == prompt_normalized:
|
| 335 |
+
# Remove the first line (it's the prompt, not generated content)
|
| 336 |
+
generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
|
| 337 |
+
|
| 338 |
+
# Also check if the next line is also the same speaker (duplicate)
|
| 339 |
+
if generated_text.strip():
|
|
|
|
| 340 |
lines = generated_text.split('\n')
|
| 341 |
+
next_line = lines[0].strip() if lines else ''
|
| 342 |
+
if next_line:
|
| 343 |
+
next_line_normalized = next_line.replace(':', '').strip().upper()
|
| 344 |
+
# If next line is also the same speaker, remove it too
|
| 345 |
+
if next_line_normalized == prompt_normalized and re.match(r'^([A-Z][A-Z\s]+?):\s*$', next_line):
|
| 346 |
+
generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
|
| 347 |
+
|
| 348 |
+
# If after removing prompt, first line is orphaned dialogue (no speaker), handle it
|
| 349 |
+
if generated_text.strip():
|
| 350 |
+
lines = generated_text.split('\n')
|
| 351 |
+
first_line = lines[0].strip() if lines else ''
|
| 352 |
+
# Check if first line is orphaned dialogue (starts with capital, has punctuation, but no speaker)
|
| 353 |
+
if first_line and not re.match(r'^([A-Z][A-Z\s]+?):\s*$', first_line):
|
| 354 |
+
# Check if it's dialogue-like (starts with capital, has punctuation)
|
| 355 |
+
if re.match(r'^[A-Z]', first_line) and ('.' in first_line or ',' in first_line or '!' in first_line or '?' in first_line):
|
| 356 |
+
# Just remove the orphaned first line, don't add a speaker
|
| 357 |
+
generated_text = '\n'.join(lines[1:]) if len(lines) > 1 else ''
|
| 358 |
|
| 359 |
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With", "AOr" -> "A Or")
|
| 360 |
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
|
|
|
| 518 |
# Fix "MORE TH AN HALF" -> "MORE THAN HALF" (but this might be dialogue, not speaker)
|
| 519 |
(r'\bTH\s+AN\b', 'THAN'),
|
| 520 |
(r'\bth\s+an\b', 'than'),
|
| 521 |
+
# Fix "F IT" -> "FIT" (in all caps dialogue)
|
| 522 |
+
(r'\bF\s+IT\b', 'FIT'),
|
| 523 |
+
(r'\bf\s+it\b', 'fit'),
|
| 524 |
+
(r'\bF\s+it\b', 'Fit'),
|
| 525 |
+
# Fix "C A" -> "CA" (but be careful - might be part of "C A:" speaker name)
|
| 526 |
+
# Actually, "C A:" should be merged to "CA:" or might be "CLARENCE:" - handle in speaker fix
|
| 527 |
+
# Fix "OUCESTER" -> "GLOUCESTER" (missing "GL" prefix)
|
| 528 |
+
(r'\bOUCESTER\b', 'GLOUCESTER'),
|
| 529 |
+
(r'\bOucester\b', 'Gloucester'),
|
| 530 |
# Fix "stuff'd" -> "stuffed" (if needed, but "stuff'd" is valid Shakespeare)
|
| 531 |
# Actually, "stuff'd" is correct Shakespeare spelling, so we'll leave it
|
| 532 |
# Fix duplicate words: "if it be it possible" -> "if it be possible"
|
|
|
|
| 576 |
break
|
| 577 |
|
| 578 |
if not is_known:
|
| 579 |
+
# Try to merge: "ALL ANC A:" -> "ALLANCA:", "GENTLEM AN:" -> "GENTLEMAN:", "C A:" -> "CA:" or "CLARENCE:"
|
| 580 |
# Remove spaces between all-caps words before colon
|
| 581 |
merged = re.sub(r'([A-Z]+)\s+([A-Z]+)\s*([A-Z]*):', r'\1\2\3:', line_stripped)
|
| 582 |
+
|
| 583 |
+
# Special case: "C A:" might be "CLARENCE:" - check if it's a known pattern
|
| 584 |
+
if re.match(r'^C\s+A:\s*$', line_stripped):
|
| 585 |
+
# TODO: check context (e.g. a nearby "Clarence" or "Sir Clarence") before assuming this is "CLARENCE:" - not implemented yet
|
| 586 |
+
# For now, map "C A:" straight to "CLARENCE:" without inspecting the surrounding context
|
| 587 |
+
merged = 'CLARENCE:'
|
| 588 |
+
|
| 589 |
# Only use merged if it makes sense (not too long, looks like a word)
|
| 590 |
if len(merged) < 30: # Reasonable speaker name length
|
| 591 |
fixed_lines.append(merged)
|