File size: 2,542 Bytes
147f94f
62f2e1f
147f94f
 
 
 
14e8d86
147f94f
62f2e1f
 
147f94f
62f2e1f
 
 
 
14e8d86
 
 
147f94f
 
 
 
 
 
62f2e1f
147f94f
 
14e8d86
62f2e1f
 
 
 
 
 
 
 
 
 
147f94f
 
62f2e1f
147f94f
 
 
 
 
62f2e1f
 
147f94f
62f2e1f
147f94f
 
 
 
 
 
14e8d86
147f94f
62f2e1f
147f94f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import gradio as gr
from transformers import AutoProcessor, AutoModelForCTC, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch
import librosa
import soundfile as sf
import io
import os

# Propagate HF_TOKEN into the environment only when it is actually set.
# os.environ values must be str: assigning the None that os.getenv returns
# for a missing variable raises TypeError and kills the app at import time.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    os.environ["HF_TOKEN"] = _hf_token

# Models (use CPU if no GPU; for free tier, may be slow/large - upgrade for GPU)
# ASR: AI4Bharat multilingual conformer, decoded via CTC argmax in full_pipeline.
asr_model_name = "ai4bharat/indicconformer-600m-multilingual"
asr_processor = AutoProcessor.from_pretrained(asr_model_name)
asr_model = AutoModelForCTC.from_pretrained(asr_model_name)

# "LLM": IndicBART seq2seq used as a placeholder responder (echo-style for now).
# keep_accents/use_fast=False per the IndicBART model card's tokenizer guidance.
llm_model_name = "ai4bharat/IndicBART"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name, do_lower_case=False, use_fast=False, keep_accents=True)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

# Translation model, invoked only when source and target languages differ.
trans_model_name = "ai4bharat/IndicTrans3-beta"
trans_tokenizer = AutoTokenizer.from_pretrained(trans_model_name)
trans_model = AutoModelForSeq2SeqLM.from_pretrained(trans_model_name)

# TTS pipeline for the final spoken response.
# NOTE(review): indic-parler-tts-v2 is a gated repo — requires HF_TOKEN access.
tts_pipe = pipeline("text-to-speech", model="ai4bharat/indic-parler-tts-v2")  # Switch to non-gated if issues

def full_pipeline(audio, source_lang, target_lang):
    """Run the speech -> text -> response -> translation -> speech pipeline.

    Parameters
    ----------
    audio : str | bytes
        Path to an audio file (what Gradio's Audio component delivers with
        type="filepath") or raw audio bytes.
    source_lang : str
        Language code of the spoken input (e.g. "hi").
    target_lang : str
        Language code for the response (e.g. "en").

    Returns
    -------
    tuple[bytes, str, str]
        WAV bytes of the spoken response, the transcribed text, and the
        (possibly translated) response text.
    """
    # --- ASR ---
    # Gradio hands us a filepath string; raw bytes are also supported for
    # programmatic callers. librosa.load accepts a path or a file object.
    source = io.BytesIO(audio) if isinstance(audio, (bytes, bytearray)) else audio
    audio_array, _ = librosa.load(source, sr=16000)
    inputs = asr_processor(audio_array, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    text = asr_processor.batch_decode(pred_ids)[0]

    # --- LLM response (echo for test) ---
    inputs = llm_tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only; skip autograd bookkeeping
        outputs = llm_model.generate(**inputs)
    response = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # --- Translation (only when languages differ) ---
    # NOTE(review): IndicTrans models typically expect explicit src/tgt
    # language tags in the input text; source_lang/target_lang are only
    # compared here — confirm the tokenizer injects tags, else prepend them.
    if source_lang != target_lang:
        inputs = trans_tokenizer(response, return_tensors="pt")
        with torch.no_grad():
            outputs = trans_model.generate(**inputs)
        response = trans_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # --- TTS ---
    tts_output = tts_pipe(response)
    with io.BytesIO() as buffer:
        sf.write(buffer, tts_output["audio"][0], tts_output["sampling_rate"], format="wav")
        audio_bytes = buffer.getvalue()

    return audio_bytes, text, response

# Gradio UI wiring.
# NOTE: gr.Audio(type="file") was removed in Gradio 4.x; the supported values
# are "filepath" and "numpy". "filepath" hands full_pipeline a path string,
# which it handles alongside raw bytes.
iface = gr.Interface(
    fn=full_pipeline,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(label="Source Lang e.g. hi"),
        gr.Textbox(label="Target Lang e.g. en"),
    ],
    outputs=[
        gr.Audio(label="Response Audio"),
        gr.Textbox(label="Transcribed Text"),
        gr.Textbox(label="Response Text"),
    ],
    title="HanuVak Backend",
)

# Script entry point: serve the Gradio app on all interfaces, port 7860
# (the port Hugging Face Spaces expects).
if __name__ == "__main__":
    host, port = "0.0.0.0", 7860
    iface.launch(server_name=host, server_port=port)