"""HanuVak backend: speech -> ASR -> LLM reply -> translation -> TTS, served via Gradio."""

import io
import os

import gradio as gr
import librosa
import soundfile as sf  # kept for callers that still need WAV-bytes encoding
import torch
from transformers import (
    AutoModelForCTC,
    AutoModelForSeq2SeqLM,
    AutoProcessor,
    AutoTokenizer,
    pipeline,
)

# Propagate HF_TOKEN only when it is actually set. The original
# `os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")` raised TypeError
# (os.environ values must be str) whenever the variable was missing.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token

# Models (CPU if no GPU; these are large — free tier will be slow).
asr_model_name = "ai4bharat/indicconformer-600m-multilingual"
asr_processor = AutoProcessor.from_pretrained(asr_model_name)
asr_model = AutoModelForCTC.from_pretrained(asr_model_name)

llm_model_name = "ai4bharat/IndicBART"
llm_tokenizer = AutoTokenizer.from_pretrained(
    llm_model_name, do_lower_case=False, use_fast=False, keep_accents=True
)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

trans_model_name = "ai4bharat/IndicTrans3-beta"
trans_tokenizer = AutoTokenizer.from_pretrained(trans_model_name)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(trans_model_name)

tts_pipe = pipeline("text-to-speech", model="ai4bharat/indic-parler-tts-v2")  # Switch to non-gated if issues


def full_pipeline(audio, source_lang, target_lang):
    """Transcribe speech, generate a reply, optionally translate it, and synthesize audio.

    Args:
        audio: filesystem path to the recorded clip (``gr.Audio(type="filepath")``).
        source_lang: language code of the input speech, e.g. "hi".
        target_lang: language code for the spoken reply, e.g. "en".

    Returns:
        Tuple of ``((sample_rate, waveform), transcribed_text, response_text)``;
        the first element is the ``(int, ndarray)`` form Gradio's Audio output accepts.
    """
    # --- ASR ---
    # type="filepath" hands us a path, so load it directly; the previous
    # io.BytesIO(audio) only worked when `audio` was raw bytes.
    audio_array, _ = librosa.load(audio, sr=16000)
    inputs = asr_processor(audio_array, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    text = asr_processor.batch_decode(pred_ids)[0]

    # --- LLM response (seq2seq echo pass for now) ---
    inputs = llm_tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # pure inference; skip autograd bookkeeping
        outputs = llm_model.generate(**inputs)
    response = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # --- Translation (skipped when source and target languages match) ---
    if source_lang != target_lang:
        inputs = trans_tokenizer(response, return_tensors="pt")
        with torch.no_grad():
            outputs = trans_model.generate(**inputs)
        response = trans_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # --- TTS ---
    tts_output = tts_pipe(response)
    # NOTE(review): assumes tts_output["audio"] is batched/2-D so [0] selects
    # the waveform — confirm against the pipeline's output for this model.
    # Gradio's Audio output accepts an (sample_rate, ndarray) tuple directly;
    # the old code returned raw WAV bytes, which the component rejects.
    audio_out = (tts_output["sampling_rate"], tts_output["audio"][0])
    return audio_out, text, response


iface = gr.Interface(
    fn=full_pipeline,
    inputs=[
        # type="file" is not a valid Audio type in current Gradio;
        # "filepath" delivers a path string, which full_pipeline now expects.
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Lang e.g. hi"),
        gr.Textbox(label="Target Lang e.g. en"),
    ],
    outputs=[
        gr.Audio(label="Response Audio"),
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Response Text"),
    ],
    title="HanuVak Backend",
)

if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)