Spaces:

build-small-hackathon
/

Sign2Voice

Build error

App Files Files Community

Sign2Voice / app.py

lilblueyes

Update requirements

dfed012 11 days ago

Raw

History Blame Contribute Delete

11.3 kB

	from __future__ import annotations

	from pathlib import Path

	import gradio as gr

	from signspeak.llm import generate_subtitle_and_instruction
	from signspeak.live_debug import process_live_debug_frame
	from signspeak.pipeline import DEFAULT_INTENT, json_text, run_asl_video
	from signspeak.tts import generate_tts


	# Directory that contains this file; bundled assets are resolved relative to it.
	APP_DIR = Path(__file__).resolve().parent
	# Stylesheet text passed to gr.Blocks(css=...); read once when the module is imported.
	CUSTOM_CSS = APP_DIR.joinpath("assets", "styles.css").read_text(encoding="utf-8")

	# Options offered by the "Language" dropdowns in the UI.
	LANGUAGE_CHOICES = [
	    "Auto", "Chinese", "English", "Japanese", "Korean", "German",
	    "French", "Russian", "Portuguese", "Spanish", "Italian",
	]

	# Options offered by the "Speaker" dropdowns in the UI.
	SPEAKER_CHOICES = [
	    "Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric",
	    "Ryan", "Aiden", "Ono_Anna", "Sohee",
	]


	def run_asl_brick(video_file: str | None, gloss_override: str | None = None) -> tuple[str, dict, str, str]:
	    """Run the ASL pipeline and report any failure as a Gradio error.

	    Args:
	        video_file: Value of the video input component; may be ``None``.
	        gloss_override: Optional manual gloss text forwarded unchanged to
	            ``run_asl_video``.

	    Returns:
	        The 4-tuple returned by ``run_asl_video``. The click handlers in
	        this file route it, in order, to the intent JSON, structured ASL
	        output, summary, and debug video components.

	    Raises:
	        gr.Error: If ``run_asl_video`` raises any exception. The original
	            exception is chained as the cause.
	    """
	    try:
	        return run_asl_video(video_file, gloss_override)
	    except Exception as exc:
	        # Convert every pipeline failure into gr.Error so the message names
	        # the failing stage and the underlying exception type.
	        raise gr.Error(f"ASL pipeline failed: {type(exc).__name__}: {exc}") from exc


	def run_llm_brick(intent_json_text: str) -> tuple[str, str, dict]:
	    """Generate the subtitle and voice instruction from intent JSON text.

	    Args:
	        intent_json_text: Text of the intent JSON editor, forwarded unchanged
	            to ``generate_subtitle_and_instruction``.

	    Returns:
	        The 3-tuple returned by ``generate_subtitle_and_instruction``. The
	        click handlers in this file route it, in order, to the subtitle,
	        voice instruction, and structured LLM output components.

	    Raises:
	        gr.Error: If generation raises any exception. The original exception
	            is chained as the cause.
	    """
	    try:
	        llm_result = generate_subtitle_and_instruction(intent_json_text)
	    except Exception as err:
	        failure = f"llama.cpp generation failed: {type(err).__name__}: {err}"
	        raise gr.Error(failure) from err
	    return llm_result


	def run_tts_brick(text: str, language: str, speaker: str, instruction: str) -> str:
	    """Synthesize speech for *text* and report any failure as a Gradio error.

	    Args:
	        text: Subtitle text to speak.
	        language: Selected entry of the language dropdown.
	        speaker: Selected entry of the speaker dropdown.
	        instruction: Voice instruction text forwarded to ``generate_tts``.

	    Returns:
	        Whatever ``generate_tts`` returns. The handlers route it to
	        ``gr.Audio(type="filepath")`` components, so presumably a path to an
	        audio file -- confirm against ``signspeak.tts``.

	    Raises:
	        gr.Error: If *text* is the "no words detected" placeholder, or if
	            ``generate_tts`` raises any exception (chained as the cause).
	    """
	    # Reject the placeholder before touching the TTS backend. Raising
	    # gr.Error directly keeps this input problem from being wrapped in a
	    # "Qwen3-TTS generation failed" message, which would blame the wrong stage.
	    # NOTE(review): the placeholder text is presumably emitted by the ASL
	    # stage when nothing is recognized -- confirm it matches signspeak.pipeline.
	    if (text or "").strip() == "No ASL words were detected yet.":
	        raise gr.Error("Analyze ASL did not detect words. Add a real ASL model or use an explicit debug override.")

	    try:
	        return generate_tts(text, language, speaker, instruction)
	    except Exception as exc:
	        raise gr.Error(f"Qwen3-TTS generation failed: {type(exc).__name__}: {exc}") from exc


	def build_video_input(label: str) -> gr.Video:
	    """Create a video input that accepts uploads and webcam capture.

	    Args:
	        label: Caption shown on the component.

	    Returns:
	        A ``gr.Video`` configured with the "upload" and "webcam" sources and
	        the "mp4" format.
	    """
	    video_options = {
	        "label": label,
	        "sources": ["upload", "webcam"],
	        "format": "mp4",
	    }
	    return gr.Video(**video_options)


	# Page definition: builds the three-tab layout and wires every button/stream
	# to its handler. Components render in the order they are created.
	# NOTE(review): the nesting below was reconstructed from statement order and
	# blank-line grouping -- confirm the layout against the rendered app.
	with gr.Blocks(title="Sign2Voice", css=CUSTOM_CSS, theme=gr.themes.Base()) as demo:
	    # Static hero banner. The markup lines stay at their original column so
	    # the string literal is unchanged.
	    gr.HTML(
	        """
	<main id="hero">
	<div class="brand-lockup">
	<span class="brand-mark" aria-hidden="true"></span>
	<div>
	<p class="eyebrow">Local-first sign-to-speech console</p>
	<h1>Sign2Voice</h1>
	</div>
	</div>
	<div class="hero-grid">
	<div>
	<p class="hero-copy">
	Translate camera or uploaded signing clips into natural speech with
	visible intent, expression, and confidence diagnostics.
	</p>
	<div class="pipeline-rail" aria-label="Pipeline stages">
	<span>Capture</span>
	<span>Signs</span>
	<span>Intent</span>
	<span>Voice</span>
	</div>
	</div>
	<div class="system-strip" aria-label="System capabilities">
	<span>Camera ready</span>
	<span>llama.cpp local</span>
	<span>Expressive voice</span>
	</div>
	</div>
	</main>
	"""
	    )

	    with gr.Tabs():
	        # Tab 1: the three pipeline stages chained on one page, each started
	        # by its own numbered button.
	        with gr.Tab("Run demo"):
	            with gr.Row(elem_classes=["demo-grid"]):
	                with gr.Column(scale=6, elem_classes=["panel-shell", "input-panel"]):
	                    gr.HTML('<div class="section-kicker">01 Capture</div>')
	                    full_video_input = build_video_input("Video or camera capture")
	                    with gr.Accordion("Advanced debug controls", open=False):
	                        full_gloss_override_input = gr.Textbox(
	                            label="Manual gloss override",
	                            value="",
	                            lines=1,
	                            info="Optional. Use only to test the downstream LLM/TTS when no ASL model is available.",
	                        )
	                    with gr.Row(elem_classes=["control-row"]):
	                        full_language_input = gr.Dropdown(
	                            label="Language",
	                            choices=LANGUAGE_CHOICES,
	                            value="English",
	                        )
	                        full_speaker_input = gr.Dropdown(
	                            label="Speaker",
	                            choices=SPEAKER_CHOICES,
	                            value="Ryan",
	                        )
	                    run_demo_asl_button = gr.Button("1 Analyze ASL", elem_id="run_demo_asl")

	                with gr.Column(scale=5, elem_classes=["panel-shell", "output-panel"]):
	                    gr.HTML('<div class="section-kicker">02 Live debug</div>')
	                    full_debug_video_output = gr.Video(label="Debug overlay playback")
	                    full_summary_output = gr.Textbox(label="ASL summary", lines=4)
	                    full_intent_output = gr.Code(label="Intent JSON", language="json", lines=8)

	            with gr.Row(elem_classes=["demo-grid"]):
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">03 llama.cpp</div>')
	                    run_demo_llm_button = gr.Button("2 Generate subtitle", elem_id="run_demo_llm")
	                    full_subtitle_output = gr.Textbox(label="Subtitle", lines=3)
	                    full_instruction_output = gr.Textbox(label="Voice instruction", lines=3)
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">04 Qwen3-TTS</div>')
	                    run_demo_tts_button = gr.Button("3 Generate speech", elem_id="run_demo_tts")
	                    full_audio_output = gr.Audio(label="Generated audio", type="filepath")

	            with gr.Accordion("Pipeline diagnostics", open=False):
	                with gr.Row(elem_classes=["diagnostic-grid"]):
	                    full_asl_json_output = gr.JSON(label="ASL structured output")
	                    full_llm_json_output = gr.JSON(label="LLM structured output")

	        # Tab 2: each stage ("brick") on its own row with its own inputs and outputs.
	        with gr.Tab("Inspect bricks"):
	            with gr.Row(elem_classes=["brick-grid"]):
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">ASL brick</div>')
	                    asl_video_input = build_video_input("Video or camera capture")
	                    asl_gloss_override_input = gr.Textbox(
	                        label="Debug gloss override",
	                        value="",
	                        lines=1,
	                    )
	                    run_asl_button = gr.Button("Run ASL brick", elem_id="run_asl")
	                    asl_summary_output = gr.Textbox(label="ASL summary", lines=4)
	                    asl_intent_output = gr.Code(label="Intent JSON", language="json", lines=12)
	                with gr.Column(scale=1):
	                    asl_debug_video_output = gr.Video(label="Debug overlay playback")
	                    asl_json_output = gr.JSON(label="ASL structured output")

	            with gr.Row(elem_classes=["brick-grid"]):
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">llama.cpp brick</div>')
	                    # Editable intent JSON, pre-filled with DEFAULT_INTENT. This
	                    # editor, not the ASL brick's output, feeds the LLM brick.
	                    intent_input = gr.Code(
	                        label="Intent JSON",
	                        value=json_text(DEFAULT_INTENT),
	                        language="json",
	                        lines=14,
	                    )
	                    run_llm_button = gr.Button(
	                        "Generate subtitle",
	                        elem_id="run_llm",
	                    )
	                with gr.Column(scale=1):
	                    subtitle_output = gr.Textbox(label="Subtitle", lines=3)
	                    instruction_output = gr.Textbox(label="Voice instruction", lines=3)
	                    llm_json_output = gr.JSON(label="LLM structured output")

	            with gr.Row(elem_classes=["brick-grid"]):
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">Qwen3-TTS brick</div>')
	                    tts_language_input = gr.Dropdown(
	                        label="Language",
	                        choices=LANGUAGE_CHOICES,
	                        value="English",
	                    )
	                    tts_speaker_input = gr.Dropdown(
	                        label="Speaker",
	                        choices=SPEAKER_CHOICES,
	                        value="Ryan",
	                    )
	                    run_tts_button = gr.Button("Generate speech", elem_id="run_tts")
	                with gr.Column(scale=1):
	                    audio_output = gr.Audio(label="Generated audio", type="filepath")

	        # Tab 3: streaming webcam frames with an overlay preview and a status box.
	        with gr.Tab("Live camera debug"):
	            with gr.Row(elem_classes=["demo-grid"]):
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">Camera stream</div>')
	                    live_camera_input = gr.Image(
	                        label="Live camera frame",
	                        sources=["webcam"],
	                        streaming=True,
	                        type="numpy",
	                    )
	                with gr.Column(scale=1, elem_classes=["panel-shell"]):
	                    gr.HTML('<div class="section-kicker">Live overlay</div>')
	                    live_camera_output = gr.Image(label="Overlay preview", type="numpy")
	                    live_camera_status = gr.Textbox(label="Live status", lines=3)

	    # Footer note; placed after the tabs.
	    gr.HTML(
	        """
	<p class="footer-note">
	Build Small badges targeted: Off the Grid, Llama Champion, Off-Brand.
	</p>
	"""
	    )

	    # --- "Run demo" tab wiring ---
	    # Step 1: run the ASL pipeline on the clip and optional gloss override. The
	    # outputs list order matches the tuple order returned by run_asl_brick.
	    run_demo_asl_button.click(
	        fn=run_asl_brick,
	        inputs=[full_video_input, full_gloss_override_input],
	        outputs=[full_intent_output, full_asl_json_output, full_summary_output, full_debug_video_output],
	    )

	    # Step 2: the LLM reads the intent JSON that step 1 wrote.
	    run_demo_llm_button.click(
	        fn=run_llm_brick,
	        inputs=[full_intent_output],
	        outputs=[full_subtitle_output, full_instruction_output, full_llm_json_output],
	    )

	    # Step 3: TTS reads the subtitle and voice instruction from step 2 plus
	    # the selected language and speaker.
	    run_demo_tts_button.click(
	        fn=run_tts_brick,
	        inputs=[
	            full_subtitle_output,
	            full_language_input,
	            full_speaker_input,
	            full_instruction_output,
	        ],
	        outputs=[full_audio_output],
	    )

	    # --- "Inspect bricks" tab wiring (same handlers, separate components) ---
	    run_asl_button.click(
	        fn=run_asl_brick,
	        inputs=[asl_video_input, asl_gloss_override_input],
	        outputs=[asl_intent_output, asl_json_output, asl_summary_output, asl_debug_video_output],
	    )

	    # Reads the editable intent_input, not asl_intent_output.
	    run_llm_button.click(
	        fn=run_llm_brick,
	        inputs=[intent_input],
	        outputs=[subtitle_output, instruction_output, llm_json_output],
	    )

	    run_tts_button.click(
	        fn=run_tts_brick,
	        inputs=[
	            subtitle_output,
	            tts_language_input,
	            tts_speaker_input,
	            instruction_output,
	        ],
	        outputs=[audio_output],
	    )

	    # --- "Live camera debug" tab wiring ---
	    # Streamed webcam input goes to process_live_debug_frame, which fills the
	    # overlay preview and the status box.
	    live_camera_input.stream(
	        fn=process_live_debug_frame,
	        inputs=[live_camera_input],
	        outputs=[live_camera_output, live_camera_status],
	    )


	# Script entry point: enable the request queue, then start the server.
	if __name__ == "__main__":
	    queued_demo = demo.queue()
	    queued_demo.launch()