Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| from transformers import LlavaNextVideoForConditionalGeneration, AutoProcessor, BitsAndBytesConfig | |
| from gtts import gTTS | |
| import tempfile | |
print("Loading model from Hugging Face (4-bit mode)...")

# Hub repository that supplies both the model weights and the processor;
# kept in one place so the two from_pretrained() calls cannot drift apart.
MODEL_ID = "Remostart/sign-language"

# ====================== 4-BIT CONFIG ======================
# NF4 weights with double quantization; matrix multiplications are computed
# in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# device_map="auto" lets the loader decide where the quantized weights live.
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
print("✅ Model loaded successfully in 4-bit!")
def _sample_frames(video_path, num_frames=8):
    """Return up to *num_frames* evenly spaced RGB frames from a video file.

    Frames that fail to decode are skipped, so the returned list may be
    shorter than *num_frames*; it is empty when the video cannot be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            return frames
        for index in np.linspace(0, total - 1, num_frames, dtype=int):
            # Seek with a plain Python int rather than a NumPy scalar.
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(index))
            ok, frame = cap.read()
            if ok:
                # OpenCV decodes to BGR; convert to RGB before wrapping in PIL.
                frames.append(
                    Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                )
    finally:
        # Release the capture handle even if decoding raised.
        cap.release()
    return frames


def _synthesize_speech(text):
    """Save *text* as spoken English in a temporary MP3 and return its path.

    Returns None when there is nothing to speak or when synthesis fails, so
    the caller can still show the text translation.
    """
    import os

    from gtts import gTTSError

    if not text.strip():
        return None

    # Reserve a file name and close the handle right away so no descriptor
    # is leaked; gTTS reopens the path itself when saving.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name

    try:
        gTTS(text=text, lang='en').save(audio_path)
    except (gTTSError, AssertionError, OSError) as exc:
        # NOTE(review): gTTS appears to reject unspeakable text with a bare
        # assert, hence AssertionError in the list above - confirm.
        print(f"Text-to-speech failed: {exc}")
        try:
            os.remove(audio_path)
        except OSError:
            pass
        return None
    return audio_path


def translate_and_speak(video_path):
    """Translate a sign-language video to English text and spoken audio.

    Args:
        video_path: Path of the uploaded video, or None when nothing was
            uploaded.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the translation or a
        short error message. ``audio_path`` is the path of a generated MP3,
        or None when no audio could be produced.
    """
    if video_path is None:
        return "Please upload a video.", None

    frames = _sample_frames(video_path)
    if not frames:
        return "Could not read video.", None

    # Prompt: one user turn holding the video placeholder and the instruction.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {"type": "text", "text": "Translate the sign language in this video to English."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    try:
        inputs = processor(text=prompt, videos=frames, return_tensors="pt").to(model.device)

        # Generation (memory friendly): no gradients, short output, small beam.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=False,
                num_beams=2,
                repetition_penalty=1.2,
            )
        translation = processor.decode(output[0], skip_special_tokens=True)
    finally:
        # Return cached GPU memory on success and on failure alike.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # The decoded sequence includes the prompt; keep only the model's answer.
    if "ASSISTANT:" in translation:
        translation = translation.split("ASSISTANT:")[-1].strip()

    # Text to Speech (best effort: the text is returned even without audio).
    audio_path = _synthesize_speech(translation)
    return translation, audio_path
# ====================== GRADIO INTERFACE ======================
# Single-column page: video upload, a trigger button, and the two outputs
# (translated text and synthesized speech) filled by translate_and_speak.
with gr.Blocks(title="Sign Language to Speech") as demo:
    gr.Markdown("# 🖐️ Sign Language Translator (Text + Voice)")
    gr.Markdown("Upload a sign language video and get the translation as **text and voice**.")

    video = gr.Video(label="Upload Video")
    btn = gr.Button("Translate & Speak", variant="primary")
    text_out = gr.Textbox(label="Translation (Text)", lines=3)
    # type="filepath" matches the MP3 path returned by the handler.
    audio_out = gr.Audio(label="Voice Output", type="filepath")

    btn.click(translate_and_speak, inputs=video, outputs=[text_out, audio_out])

demo.launch()