# Sign-to-speech / app.py
# Copied from a Hugging Face file view: commit 46dd736 (verified), "Update app.py", 3.12 kB.
# Page header names the account "Devubiodee".
import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import LlavaNextVideoForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from gtts import gTTS
import tempfile
print("Loading model from Hugging Face (4-bit mode)...")

# ====================== 4-BIT CONFIG ======================
# Hub repository id shared by the model weights and the processor, kept in
# one place so the two loads below cannot drift apart.
MODEL_ID = "Remostart/sign-language"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Loaded once at import time; `model` and `processor` are module-level
# globals used by the request handler.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
print("✅ Model loaded successfully in 4-bit!")
def _sample_frames(video_path, num_frames=8):
    """Return evenly spaced RGB frames (PIL images) read from ``video_path``.

    Frame indices are spread with ``np.linspace`` over the frame count that
    OpenCV reports. Frames that fail to decode are skipped, so the result
    may hold fewer than ``num_frames`` items, or be empty when the video
    cannot be read at all.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total > 0:
            for index in np.linspace(0, total - 1, num_frames, dtype=int):
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(index))
                ret, frame = cap.read()
                if ret:
                    # OpenCV yields BGR; convert to RGB before wrapping in PIL.
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(Image.fromarray(rgb))
    finally:
        # Release the capture even if seeking or colour conversion raises.
        cap.release()
    return frames


def translate_and_speak(video_path):
    """Translate a sign-language video to English text and synthesized speech.

    Args:
        video_path: Path of the uploaded video file, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the translation, or a
        short message when there is no input, the video cannot be read, or
        the model produced no text. ``audio_path`` is the path of a
        temporary ``.mp3`` file with the spoken translation, or ``None``
        whenever ``text`` is such a message.
    """
    if video_path is None:
        return "Please upload a video.", None

    # Load frames
    frames = _sample_frames(video_path)
    if not frames:
        return "Could not read video.", None

    # Prompt
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {"type": "text", "text": "Translate the sign language in this video to English."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Generation (memory friendly)
    try:
        inputs = processor(text=prompt, videos=frames, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,
                num_beams=2,
                repetition_penalty=1.2,
            )
        translation = processor.decode(output[0], skip_special_tokens=True)
    finally:
        # Hand cached GPU blocks back whether or not generation succeeded.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # The decoded sequence includes the prompt; keep only the text after the
    # last "ASSISTANT:" marker (the whole string when the marker is absent).
    translation = translation.rpartition("ASSISTANT:")[2].strip()
    if not translation:
        # gTTS raises AssertionError on empty text, so report it instead.
        return "No translation was produced.", None

    # Text to Speech
    tts = gTTS(text=translation, lang="en")
    # Close the handle straight away: only the reserved file name is needed,
    # and gTTS reopens the path itself when saving.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    tts.save(audio_path)

    return translation, audio_path
# ====================== GRADIO INTERFACE ======================
# Components and the click handler must be created inside the Blocks context
# so they are attached to `demo`.
with gr.Blocks(title="Sign Language to Speech") as demo:
    gr.Markdown("# 🖐️ Sign Language Translator (Text + Voice)")
    gr.Markdown("Upload a sign language video and get the translation as **text and voice**.")

    video = gr.Video(label="Upload Video")
    btn = gr.Button("Translate & Speak", variant="primary")

    text_out = gr.Textbox(label="Translation (Text)", lines=3)
    audio_out = gr.Audio(label="Voice Output", type="filepath")

    # The handler returns (text, audio file path), matching the two outputs.
    btn.click(translate_and_speak, inputs=video, outputs=[text_out, audio_out])

# Started at module level, as in the original script.
demo.launch()