Spaces:

rubentuesday
/

vocal-mirror

Sleeping

App Files Files Community

rubentuesday committed on Apr 13

Commit

e75cf4f

1 Parent(s): b1bd3b9

fix: replace dict State with flat primitive States to fix Gradio API schema TypeError

Browse files

Files changed (2) hide show

app.py +80 -69
build-errors/build_errors.md +16 -0

app.py CHANGED Viewed

@@ -4,6 +4,10 @@ Replicates the exact frontend flow:
   Language select → 3-phrase enrollment → chat (Claude Haiku replies in cloned voice) → wall at turn 7
 RTF is shown after each AI turn so you can verify the GPU is keeping up.
 """
 import math
@@ -80,7 +84,7 @@ L2_OPTIONS  = [("Spanish (es)", "es"), ("English (en)", "en"), ("French (fr)", "
                ("Korean (ko)", "ko"), ("Chinese (zh)", "zh")]
 # ── Audio helpers ─────────────────────────────────────────────────────────────
-def _to_mono_16k(audio_input) -> np.ndarray | None:
     if audio_input is None:
         return None
     sr, samples = audio_input
@@ -97,10 +101,15 @@ def _to_mono_16k(audio_input) -> np.ndarray | None:
 # ── GPU functions ─────────────────────────────────────────────────────────────
 @spaces.GPU
 def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
-    """WavLM enrollment + synthesize first AI message. Returns (ref_list, greeting_audio, rtf_text)."""
     chunks, ref_texts = [], []
     for i, a in enumerate([audio1, audio2, audio3]):
         chunk = _to_mono_16k(a)
@@ -109,7 +118,7 @@ def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
             ref_texts.append(ENROLLMENT_PHRASES[i])
     if not chunks:
-        return None, None, "⚠️ No audio recorded"
     ref = np.concatenate(chunks)
     ref_text = " ".join(ref_texts)
@@ -125,20 +134,24 @@ def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
     status = "PASS ✓" if rtf < 1.0 else "FAIL ✗"
     rtf_text = f"Greeting — RTF: {rtf:.3f} | {status} | {elapsed*1000:.0f} ms"
-    # Store ref as plain Python list so Gradio State stays JSON-serializable
     return ref.tolist(), (16_000, greeting_audio), rtf_text
 @spaces.GPU
 def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
-    """ASR → Claude Haiku reply → TTS. Returns (user_text, reply_text, audio_out, new_history, rtf_text).
-    ref_list is a plain Python list (stored in State); converted to np.ndarray here.
     """
     samples = _to_mono_16k(audio_input)
     if samples is None or len(samples) == 0:
         return None, None, None, history, "⚠️ No audio"
-    # Reconstruct np.ndarray from the list stored in State
     ref = np.array(ref_list, dtype=np.float32)
     # ASR — Whisper tiny (CPU)
@@ -155,9 +168,9 @@ def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
         turn_number=turn_count + 1,
         whisper_signals=whisper_signals if whisper_signals else None,
     )
-    reply_text     = reply_obj.l2_text
-    new_history    = reply_obj.updated_history
-    lang_name      = LANG_NAMES.get(l2, "English")
     # TTS — hybrid router (OpenVoice short / Qwen3 long)
     t0 = time.perf_counter()
@@ -172,12 +185,20 @@ def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
-EMPTY_STATE = {"l1": "en", "l2": "es", "ref": None, "history": [], "turn_count": 0}
 with gr.Blocks(title="Vocal Mirror") as demo:
-    state = gr.State(EMPTY_STATE.copy())
     # ── Screen 1: Language select ─────────────────────────────────────────────
     with gr.Column(visible=True) as screen_lang:
@@ -204,18 +225,18 @@ with gr.Blocks(title="Vocal Mirror") as demo:
             with gr.Column():
                 gr.Markdown(f'**Phrase 3**\n\n*"{ENROLLMENT_PHRASES[2]}"*')
                 enroll_a3 = gr.Audio(label="Phrase 3", sources=["microphone"], type="numpy")
-        enroll_btn  = gr.Button("Clone my voice & start →", variant="primary", size="lg")
         enroll_status = gr.Textbox(label="Status", interactive=False, visible=False)
     # ── Screen 3: Chat ────────────────────────────────────────────────────────
     with gr.Column(visible=False) as screen_chat:
         gr.Markdown("## Chat")
-        chatbot = gr.Chatbot(label="Conversation", type="messages", height=400)
         ai_audio = gr.Audio(label="AI reply (cloned voice)", type="numpy", autoplay=True)
         rtf_box  = gr.Textbox(label="RTF", interactive=False)
         gr.Markdown("### Your turn — record your reply")
-        user_mic  = gr.Audio(label="Your voice", sources=["microphone"], type="numpy")
-        send_btn  = gr.Button("Send →", variant="primary")
     # ── Screen 4: Wall ────────────────────────────────────────────────────────
     with gr.Column(visible=False) as screen_wall:
@@ -225,53 +246,50 @@ with gr.Blocks(title="Vocal Mirror") as demo:
             "in your own voice.\n\n"
             "Join the waitlist to get early access when we launch."
         )
-        gr.Markdown(f"_RTF benchmark ran throughout — all turns are real-time capable on A10G GPU._")
     # ── Callbacks ─────────────────────────────────────────────────────────────
-    def on_start(l1, l2, state):
-        state = state.copy()
-        state["l1"] = l1
-        state["l2"] = l2
-        state["history"] = []
-        state["turn_count"] = 0
-        state["ref"] = None
         return (
-            state,
-            gr.update(visible=False),   # screen_lang
-            gr.update(visible=True),    # screen_enroll
         )
     start_btn.click(
         fn=on_start,
-        inputs=[l1_dd, l2_dd, state],
-        outputs=[state, screen_lang, screen_enroll],
     )
-    def on_enroll(a1, a2, a3, state):
-        state = state.copy()
-        l2 = state.get("l2", "es")
-        ref, greeting, rtf_text = gpu_enroll_and_greet(a1, a2, a3, l2)
-        if ref is None:
             return (
-                state,
-                gr.update(visible=True),  # screen_enroll stays
-                gr.update(value=rtf_text, visible=True),  # enroll_status
-                gr.update(visible=False), gr.update(visible=False),
-                [], None, "",
             )
         fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
-        state["ref"] = ref
-        # Greeting counts as AI turn 0 — history stays empty until user speaks
         messages = [{"role": "assistant", "content": fluent_text}]
         return (
-            state,
             gr.update(visible=False),          # screen_enroll
             gr.update(visible=False),          # enroll_status
             gr.update(visible=True),           # screen_chat
@@ -283,45 +301,35 @@ with gr.Blocks(title="Vocal Mirror") as demo:
     enroll_btn.click(
         fn=on_enroll,
-        inputs=[enroll_a1, enroll_a2, enroll_a3, state],
-        outputs=[state, screen_enroll, enroll_status, screen_chat, screen_wall,
                  chatbot, ai_audio, rtf_box],
     )
-    def on_send(audio, state):
-        state = state.copy()
-        ref        = state.get("ref")
-        history    = state.get("history", [])
-        turn_count = state.get("turn_count", 0)
-        l1         = state.get("l1", "en")
-        l2         = state.get("l2", "es")
-        if ref is None:
-            return state, gr.update(), None, "⚠️ Not enrolled", gr.update(), gr.update()
         user_text, reply_text, audio_out, new_history, rtf_text = gpu_chat_turn(
-            audio, ref, history, turn_count, l1, l2
         )
         if reply_text is None:
-            # Transcription failed — keep chat as-is
-            return state, gr.update(), None, rtf_text, gr.update(), gr.update()
-        turn_count += 1
-        state["history"]     = new_history
-        state["turn_count"]  = turn_count
-        # Build display messages (show only current conversation, not enrollment greeting)
-        # Prepend greeting so it stays at top
         fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
         messages = [{"role": "assistant", "content": fluent_text}]
         for msg in new_history:
             messages.append({"role": msg["role"], "content": msg["content"]})
-        if turn_count >= WALL_TURN_COUNT:
             return (
-                state,
                 messages,
                 audio_out,
                 rtf_text,
@@ -330,7 +338,9 @@ with gr.Blocks(title="Vocal Mirror") as demo:
             )
         return (
-            state,
             messages,
             audio_out,
             rtf_text,
@@ -340,8 +350,9 @@ with gr.Blocks(title="Vocal Mirror") as demo:
     send_btn.click(
         fn=on_send,
-        inputs=[user_mic, state],
-        outputs=[state, chatbot, ai_audio, rtf_box, screen_chat, screen_wall],
     )
 demo.queue()

   Language select → 3-phrase enrollment → chat (Claude Haiku replies in cloned voice) → wall at turn 7
 RTF is shown after each AI turn so you can verify the GPU is keeping up.
+Iteration 13: replaced single gr.State(dict) with flat primitive States (str/int/list) to fix
+Gradio API schema crash — gradio_client.utils._json_schema_to_python_type cannot handle dict
+additionalProperties=True (a bool), causing TypeError: argument of type 'bool' is not iterable.
 """
 import math
                ("Korean (ko)", "ko"), ("Chinese (zh)", "zh")]
 # ── Audio helpers ─────────────────────────────────────────────────────────────
+def _to_mono_16k(audio_input):
     if audio_input is None:
         return None
     sr, samples = audio_input
 # ── GPU functions ─────────────────────────────────────────────────────────────
+# NOTE: No type hints on parameters — Gradio's json_schema_to_python_type crashes
+# on np.ndarray and dict types. State is stored as flat primitives only (str/int/list).
 @spaces.GPU
 def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
+    """WavLM enrollment + synthesize first AI greeting.
+    Returns (ref_list, greeting_audio, rtf_text).
+    ref_list is a plain Python list so it survives JSON serialization through gr.State.
+    """
     chunks, ref_texts = [], []
     for i, a in enumerate([audio1, audio2, audio3]):
         chunk = _to_mono_16k(a)
             ref_texts.append(ENROLLMENT_PHRASES[i])
     if not chunks:
+        return [], None, "⚠️ No audio recorded"
     ref = np.concatenate(chunks)
     ref_text = " ".join(ref_texts)
     status = "PASS ✓" if rtf < 1.0 else "FAIL ✗"
     rtf_text = f"Greeting — RTF: {rtf:.3f} | {status} | {elapsed*1000:.0f} ms"
+    # Convert np.ndarray → list so gr.State stays JSON-serializable
     return ref.tolist(), (16_000, greeting_audio), rtf_text
 @spaces.GPU
 def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
+    """ASR → Claude Haiku reply → TTS.
+    ref_list is a plain Python list (from gr.State); converted to np.ndarray here.
+    Returns (user_text, reply_text, audio_out, new_history, rtf_text).
     """
     samples = _to_mono_16k(audio_input)
     if samples is None or len(samples) == 0:
         return None, None, None, history, "⚠️ No audio"
+    if not ref_list:
+        return None, None, None, history, "⚠️ Not enrolled"
+    # Reconstruct np.ndarray from list stored in State
     ref = np.array(ref_list, dtype=np.float32)
     # ASR — Whisper tiny (CPU)
         turn_number=turn_count + 1,
         whisper_signals=whisper_signals if whisper_signals else None,
     )
+    reply_text  = reply_obj.l2_text
+    new_history = reply_obj.updated_history
+    lang_name   = LANG_NAMES.get(l2, "English")
     # TTS — hybrid router (OpenVoice short / Qwen3 long)
     t0 = time.perf_counter()
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
+# Use FLAT, PRIMITIVE gr.State objects — NOT a single gr.State(dict).
+# Gradio's API schema generator (gradio_client.utils._json_schema_to_python_type)
+# crashes on dict additionalProperties=True (a bool) with:
+#   TypeError: argument of type 'bool' is not iterable
+# Flat primitives (str, int, list) are safe.
 with gr.Blocks(title="Vocal Mirror") as demo:
+    # Flat state — each piece of session state is its own gr.State
+    state_l1         = gr.State("en")          # native language code
+    state_l2         = gr.State("es")          # target language code
+    state_ref        = gr.State([])            # voice ref as plain float list
+    state_history    = gr.State([])            # conversation history (list of dicts)
+    state_turn_count = gr.State(0)             # number of completed turns
     # ── Screen 1: Language select ─────────────────────────────────────────────
     with gr.Column(visible=True) as screen_lang:
             with gr.Column():
                 gr.Markdown(f'**Phrase 3**\n\n*"{ENROLLMENT_PHRASES[2]}"*')
                 enroll_a3 = gr.Audio(label="Phrase 3", sources=["microphone"], type="numpy")
+        enroll_btn    = gr.Button("Clone my voice & start →", variant="primary", size="lg")
         enroll_status = gr.Textbox(label="Status", interactive=False, visible=False)
     # ── Screen 3: Chat ────────────────────────────────────────────────────────
     with gr.Column(visible=False) as screen_chat:
         gr.Markdown("## Chat")
+        chatbot  = gr.Chatbot(label="Conversation", type="messages", height=400)
         ai_audio = gr.Audio(label="AI reply (cloned voice)", type="numpy", autoplay=True)
         rtf_box  = gr.Textbox(label="RTF", interactive=False)
         gr.Markdown("### Your turn — record your reply")
+        user_mic = gr.Audio(label="Your voice", sources=["microphone"], type="numpy")
+        send_btn = gr.Button("Send →", variant="primary")
     # ── Screen 4: Wall ────────────────────────────────────────────────────────
     with gr.Column(visible=False) as screen_wall:
             "in your own voice.\n\n"
             "Join the waitlist to get early access when we launch."
         )
+        gr.Markdown("_RTF benchmark ran throughout — all turns are real-time capable on A10G GPU._")
     # ── Callbacks ─────────────────────────────────────────────────────────────
+    def on_start(l1, l2):
         return (
+            l1,                             # state_l1
+            l2,                             # state_l2
+            [],                             # state_ref (reset)
+            [],                             # state_history (reset)
+            0,                              # state_turn_count (reset)
+            gr.update(visible=False),       # screen_lang
+            gr.update(visible=True),        # screen_enroll
         )
     start_btn.click(
         fn=on_start,
+        inputs=[l1_dd, l2_dd],
+        outputs=[state_l1, state_l2, state_ref, state_history, state_turn_count,
+                 screen_lang, screen_enroll],
     )
+    def on_enroll(a1, a2, a3, l2):
+        ref_list, greeting, rtf_text = gpu_enroll_and_greet(a1, a2, a3, l2)
+        if not ref_list:
             return (
+                [],                                            # state_ref (reset to empty)
+                gr.update(visible=True),                       # screen_enroll stays
+                gr.update(value=rtf_text, visible=True),       # enroll_status
+                gr.update(visible=False),                      # screen_chat
+                gr.update(visible=False),                      # screen_wall
+                [],                                            # chatbot
+                None,                                          # ai_audio
+                "",                                            # rtf_box
             )
         fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
         messages = [{"role": "assistant", "content": fluent_text}]
         return (
+            ref_list,                          # state_ref
             gr.update(visible=False),          # screen_enroll
             gr.update(visible=False),          # enroll_status
             gr.update(visible=True),           # screen_chat
     enroll_btn.click(
         fn=on_enroll,
+        inputs=[enroll_a1, enroll_a2, enroll_a3, state_l2],
+        outputs=[state_ref, screen_enroll, enroll_status, screen_chat, screen_wall,
                  chatbot, ai_audio, rtf_box],
     )
+    def on_send(audio, ref_list, history, turn_count, l1, l2):
+        if not ref_list:
+            return ref_list, history, turn_count, gr.update(), None, "⚠️ Not enrolled", gr.update(), gr.update()
         user_text, reply_text, audio_out, new_history, rtf_text = gpu_chat_turn(
+            audio, ref_list, history, turn_count, l1, l2
         )
         if reply_text is None:
+            return ref_list, history, turn_count, gr.update(), None, rtf_text, gr.update(), gr.update()
+        new_turn_count = turn_count + 1
         fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
         messages = [{"role": "assistant", "content": fluent_text}]
         for msg in new_history:
             messages.append({"role": msg["role"], "content": msg["content"]})
+        if new_turn_count >= WALL_TURN_COUNT:
             return (
+                ref_list,
+                new_history,
+                new_turn_count,
                 messages,
                 audio_out,
                 rtf_text,
             )
         return (
+            ref_list,
+            new_history,
+            new_turn_count,
             messages,
             audio_out,
             rtf_text,
     send_btn.click(
         fn=on_send,
+        inputs=[user_mic, state_ref, state_history, state_turn_count, state_l1, state_l2],
+        outputs=[state_ref, state_history, state_turn_count,
+                 chatbot, ai_audio, rtf_box, screen_chat, screen_wall],
     )
 demo.queue()

build-errors/build_errors.md CHANGED Viewed

@@ -115,5 +115,21 @@ This file is committed alongside every fix so the repo retains full context of w
 - Changed `gpu_chat_turn` to accept `ref_list` (plain list) and convert to `np.ndarray` internally via `np.array(ref_list, dtype=np.float32)` before passing to `synthesize()`
 - No changes to callbacks — `on_enroll` stores whatever the function returns; `on_send` passes it through unchanged
 **Files changed:** `app.py` only.
 **Result:** Pending — pushed, awaiting rebuild.

 - Changed `gpu_chat_turn` to accept `ref_list` (plain list) and convert to `np.ndarray` internally via `np.array(ref_list, dtype=np.float32)` before passing to `synthesize()`
 - No changes to callbacks — `on_enroll` stores whatever the function returns; `on_send` passes it through unchanged
 **Files changed:** `app.py` only.
+**Result:** FAIL — the same crash persisted. Removing the `np.ndarray` type hints did not resolve it; the root cause was the `gr.State(dict)` itself, not the function signatures. See Iteration 13.
+---
+## Iteration 13 — 2026-04-13
+**Stage:** RUNNING but `/gradio_api/info` still returns 500
+**Error:** `TypeError: argument of type 'bool' is not iterable` at `gradio_client/utils.py:882 → get_type → if "const" in schema`
+**Root cause:** Removing the `np.ndarray` type hints in Iteration 12 did not fix the crash. The actual source is `gr.State({"l1": "en", "l2": "es", "ref": None, "history": [], "turn_count": 0})`. When Gradio generates the API schema for this State, it calls `_json_schema_to_python_type` on the dict's JSON Schema, which contains `additionalProperties: true` — a boolean, which is valid per the JSON Schema spec and is loaded as the Python bool `True`. The generator then recurses into that value as if it were a nested schema object, so `get_type` evaluates `"const" in schema` with `schema` equal to `True` and raises `TypeError: argument of type 'bool' is not iterable`. This happens in `gradio_client/utils.py` at line 882 regardless of function type hints — it is triggered by the type of the State's value itself.
+**Fix applied:** Replaced single `gr.State(dict)` with **5 flat, primitive `gr.State` objects**:
+- `state_l1 = gr.State("en")` — string, safe
+- `state_l2 = gr.State("es")` — string, safe
+- `state_ref = gr.State([])` — empty list (no numpy), safe
+- `state_history = gr.State([])` — list of dicts (plain JSON), safe
+- `state_turn_count = gr.State(0)` — int, safe
+All callbacks were updated to accept and return these flat states. The voice reference is stored in `state_ref` as a plain Python list, passed to `gpu_chat_turn` as `ref_list`, and converted to `np.ndarray` only inside that function. Full `app.py` rewrite.
+**Files changed:** `app.py` only.
 **Result:** Pending — pushed, awaiting rebuild.