Spaces:

Remostartdev
/

Sign-to-speech

Sleeping

App Files Community

Sign-to-speech / app.py

Devubiodee

Update app.py

46dd736 verified 1 day ago

Raw

History Blame Contribute Delete

3.12 kB

	import gradio as gr
	import torch
	import cv2
	import numpy as np
	from PIL import Image
	from transformers import LlavaNextVideoForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
	from gtts import gTTS
	import tempfile

	print("Loading model from Hugging Face (4-bit mode)...")

	# ---------------------- 4-bit quantization settings ----------------------
	# Handed to from_pretrained() below through `quantization_config`.
	bnb_config = BitsAndBytesConfig(
	    load_in_4bit=True,
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_quant_type="nf4",
	    bnb_4bit_compute_dtype=torch.float16,
	)

	# The model and its processor are both loaded from the same repo id.
	_load_kwargs = {"quantization_config": bnb_config, "device_map": "auto"}
	model = LlavaNextVideoForConditionalGeneration.from_pretrained(
	    "Remostart/sign-language", **_load_kwargs
	)
	processor = AutoProcessor.from_pretrained("Remostart/sign-language")

	print("✅ Model loaded successfully in 4-bit!")

	def _sample_frames(video_path, num_frames=8):
	    """Return evenly spaced RGB frames (as PIL images) read from a video file.

	    Args:
	        video_path: Path of the video to open with OpenCV.
	        num_frames: How many frame positions to sample across the clip.

	    Returns:
	        A list of ``PIL.Image`` objects. It is empty when OpenCV reports no
	        frames or none of the requested frames could be read.
	    """
	    cap = cv2.VideoCapture(video_path)
	    frames = []
	    try:
	        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	        if total <= 0:
	            return frames
	        for index in np.linspace(0, total - 1, num_frames, dtype=int):
	            # Hand OpenCV a plain Python int rather than a NumPy scalar.
	            cap.set(cv2.CAP_PROP_POS_FRAMES, int(index))
	            ok, frame = cap.read()
	            if ok:
	                # OpenCV yields BGR; PIL expects RGB.
	                frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
	    finally:
	        # Release the capture handle even if reading or conversion raises.
	        cap.release()
	    return frames


	def _synthesize_speech(text):
	    """Write ``text`` as English speech to a temporary MP3 and return its path.

	    Args:
	        text: Non-empty text to speak.

	    Returns:
	        The path of the generated ``.mp3`` file, or ``None`` when gTTS
	        reports an error (the partially created file is removed).
	    """
	    import os
	    from gtts import gTTSError

	    # Reserve a unique path, then close our handle before gTTS writes to it
	    # so no open file descriptor is left behind.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
	        audio_path = tmp.name

	    try:
	        gTTS(text=text, lang='en').save(audio_path)
	    except gTTSError as err:
	        print(f"Text-to-speech failed: {err}")
	        os.unlink(audio_path)
	        return None
	    return audio_path


	def translate_and_speak(video_path):
	    """Translate a sign-language video to English text and spoken audio.

	    Args:
	        video_path: Path of the uploaded video, or ``None`` when nothing
	            was uploaded.

	    Returns:
	        A ``(text, audio_path)`` tuple. ``text`` is the translation or a
	        user-facing message; ``audio_path`` is the path of an MP3 file, or
	        ``None`` when no audio could be produced.
	    """
	    if video_path is None:
	        return "Please upload a video.", None

	    frames = _sample_frames(video_path)
	    if not frames:
	        return "Could not read video.", None

	    # Prompt
	    conversation = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "video"},
	                {"type": "text", "text": "Translate the sign language in this video to English."},
	            ],
	        }
	    ]
	    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
	    inputs = processor(text=prompt, videos=frames, return_tensors="pt").to(model.device)

	    try:
	        # Generation (memory friendly)
	        with torch.no_grad():
	            output = model.generate(
	                **inputs,
	                max_new_tokens=50,
	                do_sample=False,
	                num_beams=2,
	                repetition_penalty=1.2,
	            )
	        # Decode only the newly generated tokens so the prompt is not echoed
	        # back, whatever role markers the chat template uses.
	        prompt_length = inputs["input_ids"].shape[1]
	        translation = processor.decode(
	            output[0][prompt_length:], skip_special_tokens=True
	        ).strip()
	    finally:
	        # Clear the CUDA cache whether or not generation succeeded.
	        if torch.cuda.is_available():
	            torch.cuda.empty_cache()

	    # Fallback in case the decoded text still carries the role marker.
	    if "ASSISTANT:" in translation:
	        translation = translation.split("ASSISTANT:")[-1].strip()

	    # gTTS cannot speak an empty string, so report it instead of failing.
	    if not translation:
	        return "No translation was produced.", None

	    # Text to Speech (the text is still returned if audio synthesis fails)
	    return translation, _synthesize_speech(translation)

	# ====================== GRADIO UI ======================
	# Two Markdown headers, a video upload, a trigger button, and the two outputs
	# (text box and audio player) that translate_and_speak fills in.
	with gr.Blocks(title="Sign Language to Speech") as demo:
	    gr.Markdown("# 🖐️ Sign Language Translator (Text + Voice)")
	    gr.Markdown("Upload a sign language video and get the translation as text and voice.")

	    video_input = gr.Video(label="Upload Video")
	    run_button = gr.Button("Translate & Speak", variant="primary")

	    translation_box = gr.Textbox(label="Translation (Text)", lines=3)
	    speech_player = gr.Audio(label="Voice Output", type="filepath")

	    # A button press sends the uploaded video through the translation function.
	    run_button.click(
	        fn=translate_and_speak,
	        inputs=video_input,
	        outputs=[translation_box, speech_player],
	    )

	demo.launch()