File size: 7,570 Bytes
b504537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""Tests for script parsing & sanitization logic.

These tests verify two things VibeVoice users care about:
  1. Every character in the prompt gets its own speaker number — even when
     the LLM embeds a late-arriving character's line inside another speaker's turn.
  2. Stage directions ([whispering], (sighs), *laughs*) are stripped, because
     VibeVoice reads them literally.

Run:
    python -m pytest tests/
    # or:
    python tests/test_script_parser.py
"""
import os
import sys
import unittest

# Allow `python tests/test_script_parser.py` from repo root
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Stub HF_TOKEN so importing app.py doesn't complain
os.environ.setdefault("HF_TOKEN", "test-token-placeholder")

# Import the three functions under test with a plain import. A Modal-connection
# failure is caught inside app.py itself, so there is no need to load the file
# only up to that section.
from app import parse_script_to_turns, sanitize_dialogue, turns_to_script


class TestSanitizeDialogue(unittest.TestCase):
    """Input/expected-output checks for ``sanitize_dialogue``."""

    def _check(self, raw, expected):
        # Single comparison point: sanitized text first, expected text second.
        self.assertEqual(sanitize_dialogue(raw), expected)

    def test_strips_bracketed_stage_directions(self):
        # One cue at the start of the line, one in the middle of a sentence.
        cases = (
            ("[whispering] Come closer, my child.", "Come closer, my child."),
            ("Ugh [door slams] she's here.", "Ugh she's here."),
        )
        for raw, expected in cases:
            self._check(raw, expected)

    def test_strips_asterisk_actions(self):
        # A *starred* action in front of the spoken text is dropped.
        self._check("*laughs* Oh man, that's wild!", "Oh man, that's wild!")

    def test_strips_paren_emotion_cues(self):
        # Short parenthesised cues: leading, then embedded mid-sentence.
        cases = (
            ("(softly) Mom is coming!", "Mom is coming!"),
            (
                "I can't believe it (sighs) you really did it.",
                "I can't believe it you really did it.",
            ),
        )
        for raw, expected in cases:
            self._check(raw, expected)

    def test_preserves_legitimate_asides(self):
        # A genuine parenthetical aside must come back untouched.
        sentence = "The spell (which took years to learn) is incredible."
        self._check(sentence, sentence)

    def test_preserves_inline_emotion_words(self):
        # Words such as "Hahaha", "Ugh", "Whoa" are spoken dialogue, not cues.
        sentence = "Hahaha you wish, Orc!"
        self._check(sentence, sentence)


class TestParseScriptToTurns(unittest.TestCase):
    """Checks ``parse_script_to_turns`` on labelled, named and unlabelled scripts."""

    def test_basic_two_speaker_script(self):
        # Three blank-line-separated turns alternating between two speakers.
        script = "\n\n".join(
            [
                "Speaker 1: Hello there.",
                "Speaker 2: General Kenobi.",
                "Speaker 1: You are a bold one.",
            ]
        )
        expected_turns = [
            {"speaker": 1, "text": "Hello there."},
            {"speaker": 2, "text": "General Kenobi."},
            {"speaker": 1, "text": "You are a bold one."},
        ]
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        for idx, wanted in enumerate(expected_turns):
            self.assertEqual(turns[idx], wanted)

    def test_detects_inline_character_tag_as_new_speaker(self):
        """Regression: LLM embeds 'Mom:' inside Speaker 1's turn.

        The embedded line must become its own turn with its own speaker number.
        """
        speaker_one_block = (
            "Speaker 1: We need magic, pure and simple. "
            "Mom: Hey kids! What's all this racket down here?"
        )
        script = "\n\n".join([speaker_one_block, "Speaker 2: Oh hi Mom!"])
        turns = parse_script_to_turns(script)
        seen_speakers = set(turn["speaker"] for turn in turns)
        self.assertEqual(len(turns), 3)
        # Mom is expected to be assigned speaker number 3.
        self.assertEqual(seen_speakers, {1, 2, 3})
        self.assertEqual(turns[0]["text"], "We need magic, pure and simple.")
        self.assertIn("What's all this racket", turns[1]["text"])
        self.assertEqual(turns[1]["speaker"], 3)

    def test_named_characters_only(self):
        """Pure named-character script (no 'Speaker N:') should still parse."""
        script = "\n\n".join(
            [
                "Wizard: I'll cast Meteor Swarm.",
                "Orc: Bah! Swords are better.",
                "Mom: Dinner's ready!",
            ]
        )
        turns = parse_script_to_turns(script)
        self.assertEqual(len(turns), 3)
        # Numbers are handed out in order of first appearance: Wizard, Orc, Mom.
        for idx, expected_speaker in enumerate((1, 2, 3)):
            self.assertEqual(turns[idx]["speaker"], expected_speaker)

    def test_same_character_keeps_same_speaker_number(self):
        # A returning character reuses its number; a different one does not.
        script = "\n\n".join(
            [
                "Wizard: First line.",
                "Orc: Second line.",
                "Wizard: Third line — wizard again.",
            ]
        )
        turns = parse_script_to_turns(script)
        wizard_id = turns[0]["speaker"]
        self.assertEqual(wizard_id, turns[2]["speaker"])
        self.assertNotEqual(wizard_id, turns[1]["speaker"])

    def test_caps_at_four_speakers(self):
        # NOTE(review): despite the test name, the assertion only requires the
        # highest speaker number to be <= 5, so it passes whether the parser
        # caps "Speaker 5" down to 4 or keeps it as 5. Confirm the intended
        # contract and tighten the bound accordingly.
        script = "\n\n".join(
            [
                "Speaker 1: One.",
                "Speaker 2: Two.",
                "Speaker 3: Three.",
                "Speaker 4: Four.",
                "Speaker 5: Five.",
            ]
        )
        turns = parse_script_to_turns(script)
        highest = max(turn["speaker"] for turn in turns)
        self.assertLessEqual(highest, 5)

    def test_ignores_title_label(self):
        # A leading "Title:" line must not be turned into a dialogue turn.
        turns = parse_script_to_turns("Title: My Great Script\n\nSpeaker 1: Hello.")
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)

    def test_empty_script(self):
        # Empty and whitespace-only input both yield no turns.
        for blank in ("", "   \n\n  "):
            self.assertEqual(parse_script_to_turns(blank), [])

    def test_plain_text_becomes_speaker_1(self):
        # Unlabelled text is attributed to speaker 1 as a single turn.
        turns = parse_script_to_turns("Just some monologue with no labels.")
        self.assertEqual(len(turns), 1)
        self.assertEqual(turns[0]["speaker"], 1)


class TestIntegration(unittest.TestCase):
    """Parse-then-sanitize pipeline checks on raw, LLM-style script text."""

    def test_wizard_orc_mom_scenario(self):
        """The exact failure case the user reported."""
        speaker_one_block = (
            "Speaker 1: Oh come on, Orc, you're exaggerating. (laughs) "
            "We need magic, pure and simple. Mom: Hey there, you two! "
            "What's all this racket down here?"
        )
        speaker_two_block = (
            "Speaker 2: [sighs] Yeah, Mom, Wizard wants to use Wall of Force. "
            "Mom: Oh boy, you guys really are getting carried away."
        )
        dirty_script = "\n\n".join((speaker_one_block, speaker_two_block))

        # Sanitize every parsed turn first, then drop the ones left empty.
        sanitized = []
        for turn in parse_script_to_turns(dirty_script):
            speaker = turn["speaker"]
            sanitized.append({"speaker": speaker, "text": sanitize_dialogue(turn["text"])})
        cleaned = []
        for turn in sanitized:
            if turn["text"]:
                cleaned.append(turn)

        voices = set(turn["speaker"] for turn in cleaned)
        self.assertEqual(len(voices), 3, f"Expected 3 speakers, got {voices}: {cleaned}")

        # Neither the stage directions nor the inline "Mom:" tag may remain in
        # the spoken text; the tag should have been split into its own turn.
        spoken = " ".join(turn["text"] for turn in cleaned)
        for leftover in ("[sighs]", "(laughs)", "Mom:"):
            self.assertNotIn(leftover, spoken)

    def test_round_trip_preserves_structure(self):
        # Rendering turns to script text and parsing it back must be lossless.
        original_turns = [
            {"speaker": speaker, "text": text}
            for speaker, text in (
                (1, "First thing."),
                (2, "Second thing."),
                (1, "Back to me."),
            )
        ]
        reparsed = parse_script_to_turns(turns_to_script(original_turns))
        self.assertEqual(original_turns, reparsed)


# Allow running this file directly as a script; verbosity=2 makes unittest
# list each test individually instead of printing a dot per test.
if __name__ == "__main__":
    unittest.main(verbosity=2)