Conference-Generator-VibeVoice / tests /test_script_parser.py
ACloudCenter's picture
Fix inline speaker splitting + add parser tests
b504537
"""Tests for script parsing & sanitization logic.
These tests verify two things VibeVoice users care about:
1. Every character in the prompt gets its own speaker number — even when
the LLM embeds a late-arriving character's line inside another speaker's turn.
2. Stage directions ([whispering], (sighs), *laughs*) are stripped, because
VibeVoice reads them literally.
Run:
python -m pytest tests/
# or:
python tests/test_script_parser.py
"""
import os
import sys
import unittest
# Allow `python tests/test_script_parser.py` from repo root
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Stub HF_TOKEN so importing app.py doesn't complain
os.environ.setdefault("HF_TOKEN", "test-token-placeholder")
# Import the functions under test. app.py is imported normally rather than
# being partially loaded up to its Modal-connection section; a Modal-connection
# failure is caught in app.py itself.
from app import parse_script_to_turns, sanitize_dialogue, turns_to_script
class TestSanitizeDialogue(unittest.TestCase):
    """Checks which annotations ``sanitize_dialogue`` is expected to drop or keep."""

    def _check(self, raw, expected):
        """Assert that sanitizing *raw* yields exactly *expected*."""
        self.assertEqual(sanitize_dialogue(raw), expected)

    def test_strips_bracketed_stage_directions(self):
        """Square-bracketed directions are expected to be removed, leading or mid-line."""
        self._check(
            "[whispering] Come closer, my child.",
            "Come closer, my child.",
        )
        self._check(
            "Ugh [door slams] she's here.",
            "Ugh she's here.",
        )

    def test_strips_asterisk_actions(self):
        """An action wrapped in asterisks is expected to be removed."""
        self._check(
            "*laughs* Oh man, that's wild!",
            "Oh man, that's wild!",
        )

    def test_strips_paren_emotion_cues(self):
        """Short parenthesised emotion cues are expected to be removed."""
        self._check(
            "(softly) Mom is coming!",
            "Mom is coming!",
        )
        self._check(
            "I can't believe it (sighs) you really did it.",
            "I can't believe it you really did it.",
        )

    def test_preserves_legitimate_asides(self):
        """A genuine parenthetical aside must come back unchanged."""
        # Unlike "(sighs)", this parenthetical is real spoken content.
        aside = "The spell (which took years to learn) is incredible."
        self._check(aside, "The spell (which took years to learn) is incredible.")

    def test_preserves_inline_emotion_words(self):
        """Interjections written as plain words must come back unchanged."""
        # "Hahaha", "Ugh", "Whoa" are ordinary dialogue, not stage directions.
        line = "Hahaha you wish, Orc!"
        self._check(line, "Hahaha you wish, Orc!")
class TestParseScriptToTurns(unittest.TestCase):
    """Checks how ``parse_script_to_turns`` splits a script into speaker turns.

    Turns are compared as dicts with a ``"speaker"`` number and a ``"text"``
    string, in script order.
    """

    def test_basic_two_speaker_script(self):
        """Three labelled lines from two speakers produce three turns in order."""
        # Explicit newlines keep the script content independent of source indentation.
        script = (
            "Speaker 1: Hello there.\n"
            "Speaker 2: General Kenobi.\n"
            "Speaker 1: You are a bold one."
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        self.assertEqual(turns[0], {"speaker": 1, "text": "Hello there."})
        self.assertEqual(turns[1], {"speaker": 2, "text": "General Kenobi."})
        self.assertEqual(turns[2], {"speaker": 1, "text": "You are a bold one."})

    def test_detects_inline_character_tag_as_new_speaker(self):
        """Regression: the LLM embeds 'Mom:' inside Speaker 1's turn.

        The parser should split it out and give Mom her own speaker number.
        """
        script = (
            "Speaker 1: We need magic, pure and simple. "
            "Mom: Hey kids! What's all this racket down here?\n\n"
            "Speaker 2: Oh hi Mom!"
        )
        turns = parse_script_to_turns(script)
        speakers = {t["speaker"] for t in turns}
        self.assertEqual(len(turns), 3)
        self.assertEqual(speakers, {1, 2, 3})  # Mom becomes Speaker 3
        self.assertEqual(turns[0]["text"], "We need magic, pure and simple.")
        self.assertIn("What's all this racket", turns[1]["text"])
        self.assertEqual(turns[1]["speaker"], 3)

    def test_named_characters_only(self):
        """A script with only character names (no 'Speaker N:') should still parse."""
        script = (
            "Wizard: I'll cast Meteor Swarm.\n\n"
            "Orc: Bah! Swords are better.\n\n"
            "Mom: Dinner's ready!"
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        # Each unique name -> unique speaker number, assigned in order
        self.assertEqual(turns[0]["speaker"], 1)  # Wizard
        self.assertEqual(turns[1]["speaker"], 2)  # Orc
        self.assertEqual(turns[2]["speaker"], 3)  # Mom

    def test_same_character_keeps_same_speaker_number(self):
        """A returning character reuses the number from their first appearance."""
        script = (
            "Wizard: First line.\n\n"
            "Orc: Second line.\n\n"
            "Wizard: Third line — wizard again."
        )
        turns = parse_script_to_turns(script)
        # Report a short result as an assertion failure, not an IndexError below.
        self.assertGreaterEqual(len(turns), 3)
        self.assertEqual(turns[0]["speaker"], turns[2]["speaker"])
        self.assertNotEqual(turns[0]["speaker"], turns[1]["speaker"])

    def test_caps_at_four_speakers(self):
        """Five 'Speaker N' labels must not produce a speaker number above 5.

        NOTE: despite its name, this test does not assert a cap at four; the
        only bound checked is ``max speaker <= 5``.
        """
        script = (
            "Speaker 1: One.\n\n"
            "Speaker 2: Two.\n\n"
            "Speaker 3: Three.\n\n"
            "Speaker 4: Four.\n\n"
            "Speaker 5: Five."  # fifth label; the bound below lets it stay 5
        )
        turns = parse_script_to_turns(script)
        # Report an empty result as an assertion failure, not a ValueError from max().
        self.assertTrue(turns, "parser returned no turns")
        max_speaker = max(t["speaker"] for t in turns)
        # The parser is noted to preserve explicit "Speaker N" numbers, so the
        # bound is 5 rather than 4 — TODO confirm where (if anywhere) the cap
        # at four speakers is enforced.
        self.assertLessEqual(max_speaker, 5)

    def test_ignores_title_label(self):
        """A leading 'Title:' line is not treated as a speaker turn."""
        script = "Title: My Great Script\n\nSpeaker 1: Hello."
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)

    def test_empty_script(self):
        """Empty and whitespace-only scripts both parse to an empty list."""
        self.assertEqual(parse_script_to_turns(""), [])
        self.assertEqual(parse_script_to_turns(" \n\n "), [])

    def test_plain_text_becomes_speaker_1(self):
        """Unlabelled text becomes a single turn attributed to speaker 1."""
        turns = parse_script_to_turns("Just some monologue with no labels.")
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)
class TestIntegration(unittest.TestCase):
    """End-to-end checks: messy LLM output is parsed, then each turn is sanitized."""

    def test_wizard_orc_mom_scenario(self):
        """Reproduce the user-reported failure with inline 'Mom:' tags and cues."""
        dirty_script = (
            "Speaker 1: Oh come on, Orc, you're exaggerating. (laughs) "
            "We need magic, pure and simple. Mom: Hey there, you two! "
            "What's all this racket down here?\n\n"
            "Speaker 2: [sighs] Yeah, Mom, Wizard wants to use Wall of Force. "
            "Mom: Oh boy, you guys really are getting carried away."
        )

        # Sanitize every parsed turn first, then keep only those with text left.
        cleaned_turns = []
        for parsed in parse_script_to_turns(dirty_script):
            cleaned_turns.append(
                {"speaker": parsed["speaker"], "text": sanitize_dialogue(parsed["text"])}
            )
        turns = []
        for entry in cleaned_turns:
            if entry["text"]:
                turns.append(entry)

        speakers = set()
        for entry in turns:
            speakers.add(entry["speaker"])
        self.assertEqual(len(speakers), 3, f"Expected 3 speakers, got {speakers}: {turns}")

        # Neither stage direction may survive, and the inline "Mom:" tag must
        # have been extracted into its own turn rather than left in the text.
        all_text = " ".join(entry["text"] for entry in turns)
        for forbidden in ("[sighs]", "(laughs)", "Mom:"):
            self.assertNotIn(forbidden, all_text)

    def test_round_trip_preserves_structure(self):
        """Rendering turns with ``turns_to_script`` and re-parsing returns the same turns."""
        expected = [
            {"speaker": 1, "text": "First thing."},
            {"speaker": 2, "text": "Second thing."},
            {"speaker": 1, "text": "Back to me."},
        ]
        script_text = turns_to_script(expected)
        actual = parse_script_to_turns(script_text)
        self.assertEqual(expected, actual)
# Allow running this file directly (python tests/test_script_parser.py);
# verbosity=2 asks unittest for per-test output.
if __name__ == "__main__":
    unittest.main(verbosity=2)