"""Gradio app: translate a sign-language video to English text and speech.

At import time the script loads the ``Remostart/sign-language`` LLaVA-NeXT-Video
checkpoint in 4-bit (bitsandbytes NF4), builds a small Gradio UI and launches it.
"""

import contextlib
import os
import tempfile
from collections import Counter
from typing import List, Optional, Tuple

import cv2
import gradio as gr
import numpy as np
import torch
from gtts import gTTS, gTTSError
from PIL import Image
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    LlavaNextVideoForConditionalGeneration,
)

MODEL_ID = "Remostart/sign-language"
NUM_FRAMES = 8  # frames sampled uniformly from each uploaded video

print("Loading model from Hugging Face (4-bit mode)...")

# ====================== 4-BIT CONFIG ======================
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

print("✅ Model loaded successfully in 4-bit!")


def _sample_by_seeking(video_path: str, num_frames: int) -> list:
    """Return up to *num_frames* BGR frames picked by seeking to evenly spaced indices.

    Returns an empty list when the file cannot be opened or the container
    reports a non-positive frame count.
    """
    cap = cv2.VideoCapture(video_path)
    picked = []
    try:
        if not cap.isOpened():
            return picked
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            return picked
        for index in np.linspace(0, total - 1, num_frames, dtype=int):
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(index))
            ok, frame = cap.read()
            if ok:
                picked.append(frame)
    finally:
        # Release the decoder even if reading raised.
        cap.release()
    return picked


def _sample_by_decoding(video_path: str, num_frames: int) -> list:
    """Return evenly spaced BGR frames without relying on frame-count metadata.

    Makes two sequential passes: the first only counts frames with ``grab()``,
    the second retrieves the wanted positions, so at most *num_frames* decoded
    frames are held in memory.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        count = 0
        while cap.grab():
            count += 1
    finally:
        cap.release()
    if count == 0:
        return []

    # Index -> how many times it was selected (duplicates occur when the
    # video has fewer frames than requested, mirroring the seek-based path).
    repeats = Counter(np.linspace(0, count - 1, num_frames, dtype=int).tolist())
    last_wanted = max(repeats)

    picked = []
    cap = cv2.VideoCapture(video_path)
    try:
        position = 0
        while position <= last_wanted and cap.grab():
            if position in repeats:
                ok, frame = cap.retrieve()
                if ok:
                    picked.extend([frame] * repeats[position])
            position += 1
    finally:
        cap.release()
    return picked


def _sample_frames(video_path: str, num_frames: int = NUM_FRAMES) -> List[Image.Image]:
    """Return up to *num_frames* evenly spaced RGB PIL images from the video.

    Seeking is tried first; if it yields nothing (for example because the
    container may not report a usable frame count) the video is decoded
    sequentially instead. An empty list means the video could not be read.
    """
    bgr_frames = _sample_by_seeking(video_path, num_frames)
    if not bgr_frames:
        bgr_frames = _sample_by_decoding(video_path, num_frames)
    # OpenCV decodes to BGR; PIL expects RGB.
    return [
        Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for frame in bgr_frames
    ]


def _generate_translation(frames: List[Image.Image]) -> str:
    """Run the model on *frames* and return only the generated answer text."""
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {
                    "type": "text",
                    "text": "Translate the sign language in this video to English.",
                },
            ],
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, videos=frames, return_tensors="pt").to(model.device)

    # Generation (memory friendly)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            num_beams=2,
            repetition_penalty=1.2,
        )

    # generate() returns prompt + continuation; drop the prompt tokens so the
    # result does not depend on the chat template's role marker.
    prompt_length = inputs["input_ids"].shape[1]
    translation = processor.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Safety net kept from the original in case a role marker is still present.
    if "ASSISTANT:" in translation:
        translation = translation.split("ASSISTANT:")[-1].strip()
    return translation


def _synthesize_speech(text: str) -> Optional[str]:
    """Save *text* as spoken English to a temporary MP3 and return its path.

    Returns ``None`` (after removing the temporary file) when synthesis fails,
    so the caller can still show the text translation.
    """
    # Close the handle before gTTS writes to the path; only the name is needed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    try:
        gTTS(text=text, lang="en").save(audio_path)
    except (gTTSError, AssertionError, OSError) as err:
        # gTTS asserts when there is nothing speakable and raises gTTSError on
        # request failures; neither should take down the UI callback.
        print(f"Text-to-speech failed: {err}")
        with contextlib.suppress(OSError):
            os.remove(audio_path)
        return None
    return audio_path


def translate_and_speak(video_path) -> Tuple[str, Optional[str]]:
    """Translate the sign language in a video to English text and speech.

    Args:
        video_path: Path to the uploaded video file, or ``None`` if nothing
            was uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the translation or a
        user-facing message; ``audio_path`` is the path of an MP3 with the
        spoken translation, or ``None`` when no audio could be produced.
    """
    if video_path is None:
        return "Please upload a video.", None

    # Load frames
    frames = _sample_frames(video_path)
    if not frames:
        return "Could not read video.", None

    try:
        translation = _generate_translation(frames)
    finally:
        # Free cached GPU memory whether or not generation succeeded.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if not translation:
        return "No translation was produced.", None

    # Text to Speech
    return translation, _synthesize_speech(translation)


# ====================== GRADIO INTERFACE ======================
with gr.Blocks(title="Sign Language to Speech") as demo:
    gr.Markdown("# 🖐️ Sign Language Translator (Text + Voice)")
    gr.Markdown("Upload a sign language video and get the translation as **text and voice**.")

    video = gr.Video(label="Upload Video")
    btn = gr.Button("Translate & Speak", variant="primary")

    text_out = gr.Textbox(label="Translation (Text)", lines=3)
    audio_out = gr.Audio(label="Voice Output", type="filepath")

    btn.click(translate_and_speak, inputs=video, outputs=[text_out, audio_out])

demo.launch()