rubentuesday committed on
Commit
e75cf4f
·
1 Parent(s): b1bd3b9

fix: replace dict State with flat primitive States to fix Gradio API schema TypeError

Browse files
Files changed (2) hide show
  1. app.py +80 -69
  2. build-errors/build_errors.md +16 -0
app.py CHANGED
@@ -4,6 +4,10 @@ Replicates the exact frontend flow:
4
  Language select → 3-phrase enrollment → chat (Claude Haiku replies in cloned voice) → wall at turn 7
5
 
6
  RTF is shown after each AI turn so you can verify the GPU is keeping up.
 
 
 
 
7
  """
8
 
9
  import math
@@ -80,7 +84,7 @@ L2_OPTIONS = [("Spanish (es)", "es"), ("English (en)", "en"), ("French (fr)", "
80
  ("Korean (ko)", "ko"), ("Chinese (zh)", "zh")]
81
 
82
  # ── Audio helpers ─────────────────────────────────────────────────────────────
83
- def _to_mono_16k(audio_input) -> np.ndarray | None:
84
  if audio_input is None:
85
  return None
86
  sr, samples = audio_input
@@ -97,10 +101,15 @@ def _to_mono_16k(audio_input) -> np.ndarray | None:
97
 
98
 
99
  # ── GPU functions ─────────────────────────────────────────────────────────────
 
 
100
 
101
  @spaces.GPU
102
  def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
103
- """WavLM enrollment + synthesize first AI message. Returns (ref_list, greeting_audio, rtf_text)."""
 
 
 
104
  chunks, ref_texts = [], []
105
  for i, a in enumerate([audio1, audio2, audio3]):
106
  chunk = _to_mono_16k(a)
@@ -109,7 +118,7 @@ def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
109
  ref_texts.append(ENROLLMENT_PHRASES[i])
110
 
111
  if not chunks:
112
- return None, None, "⚠️ No audio recorded"
113
 
114
  ref = np.concatenate(chunks)
115
  ref_text = " ".join(ref_texts)
@@ -125,20 +134,24 @@ def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
125
 
126
  status = "PASS βœ“" if rtf < 1.0 else "FAIL βœ—"
127
  rtf_text = f"Greeting β€” RTF: {rtf:.3f} | {status} | {elapsed*1000:.0f} ms"
128
- # Store ref as plain Python list so Gradio State stays JSON-serializable
129
  return ref.tolist(), (16_000, greeting_audio), rtf_text
130
 
131
 
132
  @spaces.GPU
133
  def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
134
- """ASR β†’ Claude Haiku reply β†’ TTS. Returns (user_text, reply_text, audio_out, new_history, rtf_text).
135
- ref_list is a plain Python list (stored in State); converted to np.ndarray here.
 
136
  """
137
  samples = _to_mono_16k(audio_input)
138
  if samples is None or len(samples) == 0:
139
  return None, None, None, history, "⚠️ No audio"
140
 
141
- # Reconstruct np.ndarray from the list stored in State
 
 
 
142
  ref = np.array(ref_list, dtype=np.float32)
143
 
144
  # ASR β€” Whisper tiny (CPU)
@@ -155,9 +168,9 @@ def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
155
  turn_number=turn_count + 1,
156
  whisper_signals=whisper_signals if whisper_signals else None,
157
  )
158
- reply_text = reply_obj.l2_text
159
- new_history = reply_obj.updated_history
160
- lang_name = LANG_NAMES.get(l2, "English")
161
 
162
  # TTS β€” hybrid router (OpenVoice short / Qwen3 long)
163
  t0 = time.perf_counter()
@@ -172,12 +185,20 @@ def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
172
 
173
 
174
  # ── Gradio UI ─────────────────────────────────────────────────────────────────
175
-
176
- EMPTY_STATE = {"l1": "en", "l2": "es", "ref": None, "history": [], "turn_count": 0}
 
 
 
177
 
178
  with gr.Blocks(title="Vocal Mirror") as demo:
179
 
180
- state = gr.State(EMPTY_STATE.copy())
 
 
 
 
 
181
 
182
  # ── Screen 1: Language select ─────────────────────────────────────────────
183
  with gr.Column(visible=True) as screen_lang:
@@ -204,18 +225,18 @@ with gr.Blocks(title="Vocal Mirror") as demo:
204
  with gr.Column():
205
  gr.Markdown(f'**Phrase 3**\n\n*"{ENROLLMENT_PHRASES[2]}"*')
206
  enroll_a3 = gr.Audio(label="Phrase 3", sources=["microphone"], type="numpy")
207
- enroll_btn = gr.Button("Clone my voice & start β†’", variant="primary", size="lg")
208
  enroll_status = gr.Textbox(label="Status", interactive=False, visible=False)
209
 
210
  # ── Screen 3: Chat ────────────────────────────────────────────────────────
211
  with gr.Column(visible=False) as screen_chat:
212
  gr.Markdown("## Chat")
213
- chatbot = gr.Chatbot(label="Conversation", type="messages", height=400)
214
  ai_audio = gr.Audio(label="AI reply (cloned voice)", type="numpy", autoplay=True)
215
  rtf_box = gr.Textbox(label="RTF", interactive=False)
216
  gr.Markdown("### Your turn β€” record your reply")
217
- user_mic = gr.Audio(label="Your voice", sources=["microphone"], type="numpy")
218
- send_btn = gr.Button("Send β†’", variant="primary")
219
 
220
  # ── Screen 4: Wall ────────────────────────────────────────────────────────
221
  with gr.Column(visible=False) as screen_wall:
@@ -225,53 +246,50 @@ with gr.Blocks(title="Vocal Mirror") as demo:
225
  "in your own voice.\n\n"
226
  "Join the waitlist to get early access when we launch."
227
  )
228
- gr.Markdown(f"_RTF benchmark ran throughout β€” all turns are real-time capable on A10G GPU._")
229
 
230
 
231
  # ── Callbacks ─────────────────────────────────────────────────────────────
232
 
233
- def on_start(l1, l2, state):
234
- state = state.copy()
235
- state["l1"] = l1
236
- state["l2"] = l2
237
- state["history"] = []
238
- state["turn_count"] = 0
239
- state["ref"] = None
240
  return (
241
- state,
242
- gr.update(visible=False), # screen_lang
243
- gr.update(visible=True), # screen_enroll
 
 
 
 
244
  )
245
 
246
  start_btn.click(
247
  fn=on_start,
248
- inputs=[l1_dd, l2_dd, state],
249
- outputs=[state, screen_lang, screen_enroll],
 
250
  )
251
 
252
 
253
- def on_enroll(a1, a2, a3, state):
254
- state = state.copy()
255
- l2 = state.get("l2", "es")
256
-
257
- ref, greeting, rtf_text = gpu_enroll_and_greet(a1, a2, a3, l2)
258
 
259
- if ref is None:
260
  return (
261
- state,
262
- gr.update(visible=True), # screen_enroll stays
263
- gr.update(value=rtf_text, visible=True), # enroll_status
264
- gr.update(visible=False), gr.update(visible=False),
265
- [], None, "",
 
 
 
266
  )
267
 
268
  fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
269
- state["ref"] = ref
270
- # Greeting counts as AI turn 0 β€” history stays empty until user speaks
271
  messages = [{"role": "assistant", "content": fluent_text}]
272
 
273
  return (
274
- state,
275
  gr.update(visible=False), # screen_enroll
276
  gr.update(visible=False), # enroll_status
277
  gr.update(visible=True), # screen_chat
@@ -283,45 +301,35 @@ with gr.Blocks(title="Vocal Mirror") as demo:
283
 
284
  enroll_btn.click(
285
  fn=on_enroll,
286
- inputs=[enroll_a1, enroll_a2, enroll_a3, state],
287
- outputs=[state, screen_enroll, enroll_status, screen_chat, screen_wall,
288
  chatbot, ai_audio, rtf_box],
289
  )
290
 
291
 
292
- def on_send(audio, state):
293
- state = state.copy()
294
- ref = state.get("ref")
295
- history = state.get("history", [])
296
- turn_count = state.get("turn_count", 0)
297
- l1 = state.get("l1", "en")
298
- l2 = state.get("l2", "es")
299
-
300
- if ref is None:
301
- return state, gr.update(), None, "⚠️ Not enrolled", gr.update(), gr.update()
302
 
303
  user_text, reply_text, audio_out, new_history, rtf_text = gpu_chat_turn(
304
- audio, ref, history, turn_count, l1, l2
305
  )
306
 
307
  if reply_text is None:
308
- # Transcription failed β€” keep chat as-is
309
- return state, gr.update(), None, rtf_text, gr.update(), gr.update()
310
 
311
- turn_count += 1
312
- state["history"] = new_history
313
- state["turn_count"] = turn_count
314
 
315
- # Build display messages (show only current conversation, not enrollment greeting)
316
- # Prepend greeting so it stays at top
317
  fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
318
  messages = [{"role": "assistant", "content": fluent_text}]
319
  for msg in new_history:
320
  messages.append({"role": msg["role"], "content": msg["content"]})
321
 
322
- if turn_count >= WALL_TURN_COUNT:
323
  return (
324
- state,
 
 
325
  messages,
326
  audio_out,
327
  rtf_text,
@@ -330,7 +338,9 @@ with gr.Blocks(title="Vocal Mirror") as demo:
330
  )
331
 
332
  return (
333
- state,
 
 
334
  messages,
335
  audio_out,
336
  rtf_text,
@@ -340,8 +350,9 @@ with gr.Blocks(title="Vocal Mirror") as demo:
340
 
341
  send_btn.click(
342
  fn=on_send,
343
- inputs=[user_mic, state],
344
- outputs=[state, chatbot, ai_audio, rtf_box, screen_chat, screen_wall],
 
345
  )
346
 
347
  demo.queue()
 
4
  Language select → 3-phrase enrollment → chat (Claude Haiku replies in cloned voice) → wall at turn 7
5
 
6
  RTF is shown after each AI turn so you can verify the GPU is keeping up.
7
+
8
+ Iteration 13: replaced single gr.State(dict) with flat primitive States (str/int/list) to fix
9
+ Gradio API schema crash — gradio_client.utils._json_schema_to_python_type cannot handle dict
10
+ additionalProperties=True (a bool), causing TypeError: argument of type 'bool' is not iterable.
11
  """
12
 
13
  import math
 
84
  ("Korean (ko)", "ko"), ("Chinese (zh)", "zh")]
85
 
86
  # ── Audio helpers ─────────────────────────────────────────────────────────────
87
+ def _to_mono_16k(audio_input):
88
  if audio_input is None:
89
  return None
90
  sr, samples = audio_input
 
101
 
102
 
103
  # ── GPU functions ─────────────────────────────────────────────────────────────
104
+ # NOTE: No type hints on parameters — Gradio's json_schema_to_python_type crashes
105
+ # on np.ndarray and dict types. State is stored as flat primitives only (str/int/list).
106
 
107
  @spaces.GPU
108
  def gpu_enroll_and_greet(audio1, audio2, audio3, l2):
109
+ """WavLM enrollment + synthesize first AI greeting.
110
+ Returns (ref_list, greeting_audio, rtf_text).
111
+ ref_list is a plain Python list so it survives JSON serialization through gr.State.
112
+ """
113
  chunks, ref_texts = [], []
114
  for i, a in enumerate([audio1, audio2, audio3]):
115
  chunk = _to_mono_16k(a)
 
118
  ref_texts.append(ENROLLMENT_PHRASES[i])
119
 
120
  if not chunks:
121
+ return [], None, "⚠️ No audio recorded"
122
 
123
  ref = np.concatenate(chunks)
124
  ref_text = " ".join(ref_texts)
 
134
 
135
  status = "PASS βœ“" if rtf < 1.0 else "FAIL βœ—"
136
  rtf_text = f"Greeting β€” RTF: {rtf:.3f} | {status} | {elapsed*1000:.0f} ms"
137
+ # Convert np.ndarray → list so gr.State stays JSON-serializable
138
  return ref.tolist(), (16_000, greeting_audio), rtf_text
139
 
140
 
141
  @spaces.GPU
142
  def gpu_chat_turn(audio_input, ref_list, history, turn_count, l1, l2):
143
+ """ASR → Claude Haiku reply → TTS.
144
+ ref_list is a plain Python list (from gr.State); converted to np.ndarray here.
145
+ Returns (user_text, reply_text, audio_out, new_history, rtf_text).
146
  """
147
  samples = _to_mono_16k(audio_input)
148
  if samples is None or len(samples) == 0:
149
  return None, None, None, history, "⚠️ No audio"
150
 
151
+ if not ref_list:
152
+ return None, None, None, history, "⚠️ Not enrolled"
153
+
154
+ # Reconstruct np.ndarray from list stored in State
155
  ref = np.array(ref_list, dtype=np.float32)
156
 
157
  # ASR β€” Whisper tiny (CPU)
 
168
  turn_number=turn_count + 1,
169
  whisper_signals=whisper_signals if whisper_signals else None,
170
  )
171
+ reply_text = reply_obj.l2_text
172
+ new_history = reply_obj.updated_history
173
+ lang_name = LANG_NAMES.get(l2, "English")
174
 
175
  # TTS β€” hybrid router (OpenVoice short / Qwen3 long)
176
  t0 = time.perf_counter()
 
185
 
186
 
187
  # ── Gradio UI ─────────────────────────────────────────────────────────────────
188
+ # Use FLAT, PRIMITIVE gr.State objects — NOT a single gr.State(dict).
189
+ # Gradio's API schema generator (gradio_client.utils._json_schema_to_python_type)
190
+ # crashes on dict additionalProperties=True (a bool) with:
191
+ # TypeError: argument of type 'bool' is not iterable
192
+ # Flat primitives (str, int, list) are safe.
193
 
194
  with gr.Blocks(title="Vocal Mirror") as demo:
195
 
196
+ # Flat state — each piece of session state is its own gr.State
197
+ state_l1 = gr.State("en") # native language code
198
+ state_l2 = gr.State("es") # target language code
199
+ state_ref = gr.State([]) # voice ref as plain float list
200
+ state_history = gr.State([]) # conversation history (list of dicts)
201
+ state_turn_count = gr.State(0) # number of completed turns
202
 
203
  # ── Screen 1: Language select ─────────────────────────────────────────────
204
  with gr.Column(visible=True) as screen_lang:
 
225
  with gr.Column():
226
  gr.Markdown(f'**Phrase 3**\n\n*"{ENROLLMENT_PHRASES[2]}"*')
227
  enroll_a3 = gr.Audio(label="Phrase 3", sources=["microphone"], type="numpy")
228
+ enroll_btn = gr.Button("Clone my voice & start β†’", variant="primary", size="lg")
229
  enroll_status = gr.Textbox(label="Status", interactive=False, visible=False)
230
 
231
  # ── Screen 3: Chat ────────────────────────────────────────────────────────
232
  with gr.Column(visible=False) as screen_chat:
233
  gr.Markdown("## Chat")
234
+ chatbot = gr.Chatbot(label="Conversation", type="messages", height=400)
235
  ai_audio = gr.Audio(label="AI reply (cloned voice)", type="numpy", autoplay=True)
236
  rtf_box = gr.Textbox(label="RTF", interactive=False)
237
  gr.Markdown("### Your turn β€” record your reply")
238
+ user_mic = gr.Audio(label="Your voice", sources=["microphone"], type="numpy")
239
+ send_btn = gr.Button("Send β†’", variant="primary")
240
 
241
  # ── Screen 4: Wall ────────────────────────────────────────────────────────
242
  with gr.Column(visible=False) as screen_wall:
 
246
  "in your own voice.\n\n"
247
  "Join the waitlist to get early access when we launch."
248
  )
249
+ gr.Markdown("_RTF benchmark ran throughout β€” all turns are real-time capable on A10G GPU._")
250
 
251
 
252
  # ── Callbacks ─────────────────────────────────────────────────────────────
253
 
254
+ def on_start(l1, l2):
 
 
 
 
 
 
255
  return (
256
+ l1, # state_l1
257
+ l2, # state_l2
258
+ [], # state_ref (reset)
259
+ [], # state_history (reset)
260
+ 0, # state_turn_count (reset)
261
+ gr.update(visible=False), # screen_lang
262
+ gr.update(visible=True), # screen_enroll
263
  )
264
 
265
  start_btn.click(
266
  fn=on_start,
267
+ inputs=[l1_dd, l2_dd],
268
+ outputs=[state_l1, state_l2, state_ref, state_history, state_turn_count,
269
+ screen_lang, screen_enroll],
270
  )
271
 
272
 
273
+ def on_enroll(a1, a2, a3, l2):
274
+ ref_list, greeting, rtf_text = gpu_enroll_and_greet(a1, a2, a3, l2)
 
 
 
275
 
276
+ if not ref_list:
277
  return (
278
+ [], # state_ref unchanged
279
+ gr.update(visible=True), # screen_enroll stays
280
+ gr.update(value=rtf_text, visible=True), # enroll_status
281
+ gr.update(visible=False), # screen_chat
282
+ gr.update(visible=False), # screen_wall
283
+ [], # chatbot
284
+ None, # ai_audio
285
+ "", # rtf_box
286
  )
287
 
288
  fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
 
 
289
  messages = [{"role": "assistant", "content": fluent_text}]
290
 
291
  return (
292
+ ref_list, # state_ref
293
  gr.update(visible=False), # screen_enroll
294
  gr.update(visible=False), # enroll_status
295
  gr.update(visible=True), # screen_chat
 
301
 
302
  enroll_btn.click(
303
  fn=on_enroll,
304
+ inputs=[enroll_a1, enroll_a2, enroll_a3, state_l2],
305
+ outputs=[state_ref, screen_enroll, enroll_status, screen_chat, screen_wall,
306
  chatbot, ai_audio, rtf_box],
307
  )
308
 
309
 
310
+ def on_send(audio, ref_list, history, turn_count, l1, l2):
311
+ if not ref_list:
312
+ return ref_list, history, turn_count, gr.update(), None, "⚠️ Not enrolled", gr.update(), gr.update()
 
 
 
 
 
 
 
313
 
314
  user_text, reply_text, audio_out, new_history, rtf_text = gpu_chat_turn(
315
+ audio, ref_list, history, turn_count, l1, l2
316
  )
317
 
318
  if reply_text is None:
319
+ return ref_list, history, turn_count, gr.update(), None, rtf_text, gr.update(), gr.update()
 
320
 
321
+ new_turn_count = turn_count + 1
 
 
322
 
 
 
323
  fluent_text = FLUENT_PHRASES.get(l2, FLUENT_PHRASES["en"])
324
  messages = [{"role": "assistant", "content": fluent_text}]
325
  for msg in new_history:
326
  messages.append({"role": msg["role"], "content": msg["content"]})
327
 
328
+ if new_turn_count >= WALL_TURN_COUNT:
329
  return (
330
+ ref_list,
331
+ new_history,
332
+ new_turn_count,
333
  messages,
334
  audio_out,
335
  rtf_text,
 
338
  )
339
 
340
  return (
341
+ ref_list,
342
+ new_history,
343
+ new_turn_count,
344
  messages,
345
  audio_out,
346
  rtf_text,
 
350
 
351
  send_btn.click(
352
  fn=on_send,
353
+ inputs=[user_mic, state_ref, state_history, state_turn_count, state_l1, state_l2],
354
+ outputs=[state_ref, state_history, state_turn_count,
355
+ chatbot, ai_audio, rtf_box, screen_chat, screen_wall],
356
  )
357
 
358
  demo.queue()
build-errors/build_errors.md CHANGED
@@ -115,5 +115,21 @@ This file is committed alongside every fix so the repo retains full context of w
115
  - Changed `gpu_chat_turn` to accept `ref_list` (plain list) and convert to `np.ndarray` internally via `np.array(ref_list, dtype=np.float32)` before passing to `synthesize()`
116
  - No changes to callbacks — `on_enroll` stores whatever the function returns; `on_send` passes it through unchanged
117
  **Files changed:** `app.py` only.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  **Result:** Pending — pushed, awaiting rebuild.
119
 
 
115
  - Changed `gpu_chat_turn` to accept `ref_list` (plain list) and convert to `np.ndarray` internally via `np.array(ref_list, dtype=np.float32)` before passing to `synthesize()`
116
  - No changes to callbacks — `on_enroll` stores whatever the function returns; `on_send` passes it through unchanged
117
  **Files changed:** `app.py` only.
118
+ **Result:** FAIL — same crash persists. Removing np.ndarray type hints did not resolve it. Root cause was actually the gr.State(dict) itself, not the function signature. See Iteration 13.
119
+
120
+ ---
121
+
122
+ ## Iteration 13 β€” 2026-04-13
123
+ **Stage:** RUNNING but `/gradio_api/info` still returns 500
124
+ **Error:** `TypeError: argument of type 'bool' is not iterable` at `gradio_client/utils.py:882 → get_type → if "const" in schema`
125
+ **Root cause:** Removing np.ndarray type hints in Iteration 12 did not fix the crash. The actual source is `gr.State({"l1": "en", "l2": "es", "ref": None, "history": [], "turn_count": 0})`. When Gradio generates the API schema for this State, it calls `_json_schema_to_python_type` on the dict schema. The dict's JSON Schema representation has `additionalProperties: True` (a Python bool, which the JSON Schema spec permits for this keyword). The schema generator then evaluates `if "const" in schema` with `schema` bound to that bool `True` instead of a dict, causing `TypeError: argument of type 'bool' is not iterable`. This happens in `gradio_client/utils.py` at line 882 regardless of function type hints — it's triggered by the State type itself.
126
+ **Fix applied:** Replaced single `gr.State(dict)` with **5 flat, primitive `gr.State` objects**:
127
+ - `state_l1 = gr.State("en")` — string, safe
128
+ - `state_l2 = gr.State("es")` — string, safe
129
+ - `state_ref = gr.State([])` — empty list (no numpy), safe
130
+ - `state_history = gr.State([])` — list of dicts (plain JSON), safe
131
+ - `state_turn_count = gr.State(0)` — int, safe
132
+ All callbacks updated to accept/return these flat states. `ref_list` (a Python list) is passed as `state_ref` and converted to `np.ndarray` inside `gpu_chat_turn` only. Full `app.py` rewrite.
133
+ **Files changed:** `app.py` only.
134
  **Result:** Pending — pushed, awaiting rebuild.
135