File size: 4,084 Bytes
b3c89b0
 
f46812d
b3c89b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f46812d
b3c89b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f46812d
b3c89b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6897d69
b3c89b0
 
 
 
 
 
 
 
 
 
f46812d
b3c89b0
 
c29c741
 
 
 
b3c89b0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# app.py - Speech to ASL Avatar on Hugging Face Spaces (ZeroGPU compatible)

import gradio as gr
import whisper
import requests
import tempfile
import os
from spaces import GPU   # Required for ZeroGPU hardware

# Load the Sign-Speak API key from the HF Space secrets (Settings → Secrets).
# The key is read from the SIGN_SPEAK_API_KEY environment variable.
API_KEY = os.environ.get("SIGN_SPEAK_API_KEY")
if not API_KEY:
    # Fail at import time: a missing or empty key stops the app from starting.
    raise ValueError("SIGN_SPEAK_API_KEY not set in Space secrets!")

# Sign-Speak API root and the endpoint that get_sign_language() posts to.
BASE_URL = "https://api.sign-speak.com"
PRODUCE_SIGN_URL = f"{BASE_URL}/produce-sign"

def get_sign_language(text: str, request_class="BLOCKING", identity="MALE", timeout=100):
    """
    Call the Sign-Speak API to generate an ASL avatar video for English text.

    This is a plain HTTP call to an external service; no GPU is needed here.

    Args:
        text: English text to translate. Surrounding whitespace is stripped.
        request_class: Sign-Speak request class; upper-cased before sending.
        identity: Avatar identity; upper-cased before sending.
        timeout: Seconds to wait for the API before giving up, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        Path of a temporary ``.mp4`` file containing the response body.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        ValueError: If ``text`` is empty after stripping, if the API answers
            202 (batch processing, which this function does not poll), or
            if it answers with any other non-200 status.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    english = text.strip()
    if not english:
        # Reject early instead of spending an API call on an empty request.
        raise ValueError("Cannot generate a sign video from empty text.")

    headers = {
        "X-api-key": API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "english": english,
        "request_class": request_class.upper(),
        "identity": identity.upper(),
        # Optional: "model_version": "SLP.2.xs" for faster/smaller model
    }
    response = requests.post(
        PRODUCE_SIGN_URL,
        json=payload,
        headers=headers,
        timeout=timeout,
    )

    if response.status_code == 202:
        batch_id = response.json().get("batch_id")
        raise ValueError(f"Batch processing started (ID: {batch_id}). Add polling if needed.")
    if response.status_code != 200:
        raise ValueError(f"Sign-Speak API error {response.status_code}: {response.text}")

    # Save MP4 bytes to a temp file (Gradio Video component needs a filepath).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
        tmp.write(response.content)
    return tmp.name

@GPU(duration=120)  # ZeroGPU decorator: request a GPU for up to 120 seconds
def transcribe_and_translate(audio_filepath):
    """
    Transcribe an audio file with Whisper and turn the text into an ASL video.

    Decorated with @GPU so that ZeroGPU hardware is allocated for this call.

    Args:
        audio_filepath: Path of the audio file to transcribe, or None when
            nothing was recorded or uploaded.

    Returns:
        A ``(status_text, video_path)`` tuple. ``video_path`` is None when
        there is no audio, no speech was detected, or any step failed; in
        those cases ``status_text`` carries the explanation.
    """
    if audio_filepath is None:
        return "No audio recorded.", None

    try:
        # The "small" Whisper model is loaded onto the GPU on every call.
        whisper_model = whisper.load_model("small", device="cuda")

        # Transcription is forced to English.
        transcription = whisper_model.transcribe(audio_filepath, language="en")
        spoken_text = transcription["text"].strip()

        if spoken_text:
            # Hand the transcript to Sign-Speak to produce the signing video.
            return f"Transcribed: \"{spoken_text}\"", get_sign_language(spoken_text)

        return "No speech detected in the recording.", None

    except Exception as err:
        # UI boundary: report any failure in the status box instead of raising.
        return f"Error: {err!s}", None

# ── Gradio UI ────────────────────────────────────────────────────────────────
# One audio input (microphone or upload) and a button on the top row; the
# transcript/status text box and the resulting ASL video are shown below it.
with gr.Blocks(title="Speech → ASL Avatar Translator") as demo:
    # The button name in step 2 must match the label of submit_btn below.
    gr.Markdown("""
    # Speech to ASL Avatar (ZeroGPU)
    1. Record your voice using the microphone below
    2. Click **Sign Translate**
    3. Whisper transcribes → Sign-Speak generates ASL signing video
    """)

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],  # upload is a fallback to recording
            type="filepath",  # the handler receives a path, not raw samples
            label="Speak here (click record) or upload audio",
            format="wav"
        )
        submit_btn = gr.Button("Sign Translate", variant="primary")

    transcript_output = gr.Textbox(label="Transcribed Text / Status", lines=3)
    video_output = gr.Video(label="ASL Avatar Signing Video", autoplay=True)

    # Wire up the button: the handler returns (status text, video path).
    submit_btn.click(
        fn=transcribe_and_translate,
        inputs=audio_input,
        outputs=[transcript_output, video_output]
    )

# Start the Gradio server for the `demo` app built above.
# Per the original author's note, HF Spaces ignores server_name/port.
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    debug=False,
    ssr_mode=False  # disable experimental SSR to avoid proxy issues
)