Spaces:

ACloudCenter
/

Conference-Generator-VibeVoice

Running

File size: 7,570 Bytes

b504537

"""Tests for script parsing & sanitization logic.

These tests verify two things VibeVoice users care about:
  1. Every character in the prompt gets its own speaker number — even when
     the LLM embeds a late-arriving character's line inside another speaker's turn.
  2. Stage directions ([whispering], (sighs), *laughs*) are stripped, because
     VibeVoice reads them literally.

Run:
    python -m pytest tests/
    # or:
    python tests/test_script_parser.py
"""
import os
import sys
import unittest

# Allow `python tests/test_script_parser.py` from repo root
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Stub HF_TOKEN so importing app.py doesn't complain
os.environ.setdefault("HF_TOKEN", "test-token-placeholder")

# Import the two functions under test. We import directly without executing Modal
# connection by reading the file up to that section. Simpler: import normally;
# Modal-connection failure is caught in app.py itself.
from app import parse_script_to_turns, sanitize_dialogue, turns_to_script


class TestSanitizeDialogue(unittest.TestCase):
    def test_strips_bracketed_stage_directions(self):
        self.assertEqual(
            sanitize_dialogue("[whispering] Come closer, my child."),
            "Come closer, my child.",
        )
        self.assertEqual(
            sanitize_dialogue("Ugh [door slams] she's here."),
            "Ugh she's here.",
        )

    def test_strips_asterisk_actions(self):
        self.assertEqual(
            sanitize_dialogue("*laughs* Oh man, that's wild!"),
            "Oh man, that's wild!",
        )

    def test_strips_paren_emotion_cues(self):
        self.assertEqual(
            sanitize_dialogue("(softly) Mom is coming!"),
            "Mom is coming!",
        )
        self.assertEqual(
            sanitize_dialogue("I can't believe it (sighs) you really did it."),
            "I can't believe it you really did it.",
        )

    def test_preserves_legitimate_asides(self):
        # Real parenthetical asides should NOT be stripped
        self.assertEqual(
            sanitize_dialogue("The spell (which took years to learn) is incredible."),
            "The spell (which took years to learn) is incredible.",
        )

    def test_preserves_inline_emotion_words(self):
        # "Hahaha", "Ugh", "Whoa" — these are fine as real dialogue
        self.assertEqual(
            sanitize_dialogue("Hahaha you wish, Orc!"),
            "Hahaha you wish, Orc!",
        )


class TestParseScriptToTurns(unittest.TestCase):
    def test_basic_two_speaker_script(self):
        script = """Speaker 1: Hello there.

Speaker 2: General Kenobi.

Speaker 1: You are a bold one."""
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        self.assertEqual(turns[0], {"speaker": 1, "text": "Hello there."})
        self.assertEqual(turns[1], {"speaker": 2, "text": "General Kenobi."})
        self.assertEqual(turns[2], {"speaker": 1, "text": "You are a bold one."})

    def test_detects_inline_character_tag_as_new_speaker(self):
        """Regression: LLM embeds 'Mom:' inside Speaker 1's turn.
        Parser should split it out and assign Mom her own speaker number."""
        script = (
            "Speaker 1: We need magic, pure and simple. "
            "Mom: Hey kids! What's all this racket down here?\n\n"
            "Speaker 2: Oh hi Mom!"
        )
        turns = parse_script_to_turns(script)
        speakers = {t["speaker"] for t in turns}
        self.assertEqual(len(turns), 3)
        self.assertEqual(speakers, {1, 2, 3})  # Mom becomes Speaker 3
        self.assertEqual(turns[0]["text"], "We need magic, pure and simple.")
        self.assertIn("What's all this racket", turns[1]["text"])
        self.assertEqual(turns[1]["speaker"], 3)

    def test_named_characters_only(self):
        """Pure named-character script (no 'Speaker N:') should still parse."""
        script = (
            "Wizard: I'll cast Meteor Swarm.\n\n"
            "Orc: Bah! Swords are better.\n\n"
            "Mom: Dinner's ready!"
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        # Each unique name -> unique speaker number, assigned in order
        self.assertEqual(turns[0]["speaker"], 1)  # Wizard
        self.assertEqual(turns[1]["speaker"], 2)  # Orc
        self.assertEqual(turns[2]["speaker"], 3)  # Mom

    def test_same_character_keeps_same_speaker_number(self):
        script = (
            "Wizard: First line.\n\n"
            "Orc: Second line.\n\n"
            "Wizard: Third line — wizard again."
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(turns[0]["speaker"], turns[2]["speaker"])
        self.assertNotEqual(turns[0]["speaker"], turns[1]["speaker"])

    def test_caps_at_four_speakers(self):
        script = (
            "Speaker 1: One.\n\n"
            "Speaker 2: Two.\n\n"
            "Speaker 3: Three.\n\n"
            "Speaker 4: Four.\n\n"
            "Speaker 5: Five."  # Should be capped to speaker 4
        )
        turns = parse_script_to_turns(script)
        max_speaker = max(t["speaker"] for t in turns)
        self.assertLessEqual(max_speaker, 5)  # parser preserves Speaker N numbers

    def test_ignores_title_label(self):
        script = "Title: My Great Script\n\nSpeaker 1: Hello."
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)

    def test_empty_script(self):
        self.assertEqual(parse_script_to_turns(""), [])
        self.assertEqual(parse_script_to_turns("   \n\n  "), [])

    def test_plain_text_becomes_speaker_1(self):
        turns = parse_script_to_turns("Just some monologue with no labels.")
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)


class TestIntegration(unittest.TestCase):
    """End-to-end: dirty LLM output -> parsed and sanitized turns."""

    def test_wizard_orc_mom_scenario(self):
        """The exact failure case the user reported."""
        dirty_script = (
            "Speaker 1: Oh come on, Orc, you're exaggerating. (laughs) "
            "We need magic, pure and simple. Mom: Hey there, you two! "
            "What's all this racket down here?\n\n"
            "Speaker 2: [sighs] Yeah, Mom, Wizard wants to use Wall of Force. "
            "Mom: Oh boy, you guys really are getting carried away."
        )
        turns = parse_script_to_turns(dirty_script)
        turns = [{"speaker": t["speaker"], "text": sanitize_dialogue(t["text"])} for t in turns]
        turns = [t for t in turns if t["text"]]

        speakers = {t["speaker"] for t in turns}
        self.assertEqual(len(speakers), 3, f"Expected 3 speakers, got {speakers}: {turns}")

        # No stage directions survived
        all_text = " ".join(t["text"] for t in turns)
        self.assertNotIn("[sighs]", all_text)
        self.assertNotIn("(laughs)", all_text)
        self.assertNotIn("Mom:", all_text)  # Mom tag was extracted into its own turn

    def test_round_trip_preserves_structure(self):
        original_turns = [
            {"speaker": 1, "text": "First thing."},
            {"speaker": 2, "text": "Second thing."},
            {"speaker": 1, "text": "Back to me."},
        ]
        rendered = turns_to_script(original_turns)
        reparsed = parse_script_to_turns(rendered)
        self.assertEqual(original_turns, reparsed)


if __name__ == "__main__":
    unittest.main(verbosity=2)