# HanuVak backend — Gradio Space (recovered from a "Runtime error" page dump).
import gradio as gr
from transformers import AutoProcessor, AutoModelForCTC, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch
import librosa
import soundfile as sf
import io
import os

# Use HF_TOKEN from env. os.environ values must be str: assigning the None
# that os.getenv returns when the variable is unset raises TypeError at import
# time and crashes the Space, so only write it back when it is actually set.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
# Models (use CPU if no GPU; for free tier, may be slow/large - upgrade for GPU).
# All weights are downloaded at import time, so the Space's cold start is slow.

# ASR: CTC acoustic model for Indic-language speech recognition.
asr_model_name = "ai4bharat/indicconformer-600m-multilingual"
asr_processor = AutoProcessor.from_pretrained(asr_model_name)
asr_model = AutoModelForCTC.from_pretrained(asr_model_name)

# LLM: seq2seq model used to generate a text response from the transcript.
# keep_accents/use_fast=False follow the IndicBART model card's tokenizer setup.
llm_model_name = "ai4bharat/IndicBART"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name, do_lower_case=False, use_fast=False, keep_accents=True)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

# Translation model, applied only when source and target languages differ.
trans_model_name = "ai4bharat/IndicTrans3-beta"
trans_tokenizer = AutoTokenizer.from_pretrained(trans_model_name)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(trans_model_name)

# TTS pipeline for synthesizing the response audio.
tts_pipe = pipeline("text-to-speech", model="ai4bharat/indic-parler-tts-v2")  # Switch to non-gated if issues
def full_pipeline(audio, source_lang, target_lang):
    """Run ASR -> LLM response -> optional translation -> TTS.

    Args:
        audio: Filepath to the uploaded/recorded audio clip
            (from ``gr.Audio(type="filepath")``).
        source_lang: Source language code, e.g. "hi".
        target_lang: Target language code, e.g. "en".

    Returns:
        Tuple of ``((sample_rate, waveform), transcript, response_text)``;
        the first element is the (rate, array) pair ``gr.Audio`` accepts.
    """
    # ASR: gr.Audio(type="filepath") hands us a path string, which librosa
    # loads directly — wrapping a str in io.BytesIO (as before) raises
    # TypeError. Resample to the 16 kHz the acoustic model expects.
    audio_array, _ = librosa.load(audio, sr=16000)
    inputs = asr_processor(audio_array, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    text = asr_processor.batch_decode(pred_ids)[0]

    # LLM response (echo for test).
    inputs = llm_tokenizer(text, return_tensors="pt")
    outputs = llm_model.generate(**inputs)
    response = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Translate only when the languages differ.
    if source_lang != target_lang:
        inputs = trans_tokenizer(response, return_tensors="pt")
        outputs = trans_model.generate(**inputs)
        response = trans_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # TTS: gr.Audio accepts a (sample_rate, ndarray) tuple directly, so no
    # WAV round-trip through soundfile/BytesIO is needed — and raw bytes are
    # not a valid gr.Audio output value anyway.
    # NOTE(review): assumes tts_output["audio"] is batched with the waveform
    # at index 0, as the original sf.write call did — confirm for this model.
    tts_output = tts_pipe(response)
    return (tts_output["sampling_rate"], tts_output["audio"][0]), text, response
# Gradio UI. gr.Audio has no type="file" option — valid values are
# "filepath" and "numpy"; "filepath" passes full_pipeline a path that
# librosa can load, and the invalid value crashed the app at startup.
iface = gr.Interface(
    fn=full_pipeline,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Lang e.g. hi"),
        gr.Textbox(label="Target Lang e.g. en"),
    ],
    outputs=[
        gr.Audio(label="Response Audio"),
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Response Text"),
    ],
    title="HanuVak Backend",
)

if __name__ == "__main__":
    # 0.0.0.0:7860 is the bind address/port HF Spaces expects.
    iface.launch(server_name="0.0.0.0", server_port=7860)