# NOTE(review): the lines below are web-page scrape residue from the Hugging
# Face Space file view (runtime status, file size, commit hashes, gutter line
# numbers) — not Python. Commented out so the module is importable.
# Spaces:
# Runtime error
# File size: 2,542 Bytes
# 147f94f 62f2e1f 147f94f 14e8d86 ... (commit-hash gutter, truncated)
import gradio as gr
from transformers import AutoProcessor, AutoModelForCTC, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch
import librosa
import soundfile as sf
import io
import os
# Use HF_TOKEN from env — but only when one is actually set.
# NOTE(review): the original `os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")`
# raises TypeError at import time when HF_TOKEN is unset, because environ
# values must be str, not None.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token
# Models (use CPU if no GPU; for free tier, may be slow/large - upgrade for GPU)
# NOTE: every from_pretrained/pipeline call below is a module-level side
# effect — on first run each one downloads model weights from the HF Hub.
# Speech-to-text: CTC acoustic model + its feature-extractor/tokenizer pair.
asr_model_name = "ai4bharat/indicconformer-600m-multilingual"
asr_processor = AutoProcessor.from_pretrained(asr_model_name)
asr_model = AutoModelForCTC.from_pretrained(asr_model_name)
# Response generator (seq2seq). keep_accents/do_lower_case flags preserve
# Indic-script text through the slow (sentencepiece-based) tokenizer.
llm_model_name = "ai4bharat/IndicBART"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name, do_lower_case=False, use_fast=False, keep_accents=True)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)
# Translation model (seq2seq), used only when source_lang != target_lang.
trans_model_name = "ai4bharat/IndicTrans3-beta"
trans_tokenizer = AutoTokenizer.from_pretrained(trans_model_name)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(trans_model_name)
# Text-to-speech via the high-level pipeline API.
tts_pipe = pipeline("text-to-speech", model="ai4bharat/indic-parler-tts-v2") # Switch to non-gated if issues
def full_pipeline(audio, source_lang, target_lang):
    """Run the full voice pipeline: ASR -> LLM response -> translation -> TTS.

    Parameters
    ----------
    audio : str | bytes | tuple
        The recorded input. Accepts a file path (Gradio ``type="filepath"``),
        raw encoded audio bytes, or a ``(sample_rate, ndarray)`` pair
        (Gradio ``type="numpy"``). The original only handled raw bytes.
    source_lang, target_lang : str
        Language codes (e.g. "hi", "en"); translation is skipped when equal.

    Returns
    -------
    tuple[bytes, str, str]
        WAV bytes of the spoken response, the ASR transcript, and the
        (possibly translated) response text.
    """
    # --- Normalize the audio input to a 16 kHz float array -----------------
    if isinstance(audio, tuple):
        # (sample_rate, ndarray) from a numpy-typed Gradio Audio component.
        sr, data = audio
        data = data.astype("float32")
        audio_array = (
            librosa.resample(data, orig_sr=sr, target_sr=16000)
            if sr != 16000 else data
        )
    else:
        # File path or raw bytes; librosa.load accepts paths and file-like
        # objects and resamples to the requested rate.
        source = io.BytesIO(audio) if isinstance(audio, (bytes, bytearray)) else audio
        audio_array, _ = librosa.load(source, sr=16000)

    # --- ASR: CTC forward pass + greedy (argmax) decode ---------------------
    asr_inputs = asr_processor(audio_array, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(asr_inputs.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    text = asr_processor.batch_decode(pred_ids)[0]

    # --- LLM response (echo for test) ---------------------------------------
    llm_inputs = llm_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        llm_out = llm_model.generate(**llm_inputs)
    response = llm_tokenizer.decode(llm_out[0], skip_special_tokens=True)

    # --- Translation (only when languages differ) ----------------------------
    # NOTE(review): IndicTrans models usually require explicit source/target
    # language tags in the input text — confirm the model card's expected
    # prompt format; source_lang/target_lang are not passed to the model here.
    if source_lang != target_lang:
        trans_inputs = trans_tokenizer(response, return_tensors="pt")
        with torch.no_grad():
            trans_out = trans_model.generate(**trans_inputs)
        response = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)

    # --- TTS: synthesize, then serialize to an in-memory WAV -----------------
    tts_output = tts_pipe(response)
    with io.BytesIO() as buffer:
        sf.write(buffer, tts_output["audio"][0], tts_output["sampling_rate"], format="wav")
        # getvalue() must run inside the `with`, before the buffer is closed.
        audio_bytes = buffer.getvalue()
    return audio_bytes, text, response
# Gradio UI: one audio input plus two language-code textboxes; outputs the
# synthesized reply audio, the transcript, and the response text.
iface = gr.Interface(
    fn=full_pipeline,
    inputs=[
        # "filepath" replaces the original type="file", which Gradio 4
        # rejects; full_pipeline loads the audio from the returned path.
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Lang e.g. hi"),
        gr.Textbox(label="Target Lang e.g. en"),
    ],
    outputs=[
        gr.Audio(label="Response Audio"),
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Response Text"),
    ],
    title="HanuVak Backend",
)

if __name__ == "__main__":
    # Bind to all interfaces on 7860, the standard HF Spaces port.
    # (A stray trailing " |" scrape artifact on this line was removed.)
    iface.launch(server_name="0.0.0.0", server_port=7860)