File size: 3,624 Bytes
3845214
 
 
a988a6f
3845214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a988a6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3845214
 
 
 
a988a6f
3845214
 
a988a6f
 
3845214
 
 
 
 
 
a988a6f
3845214
 
 
 
 
 
a988a6f
3845214
 
a988a6f
 
3845214
 
 
 
 
 
a988a6f
3845214
 
 
a988a6f
3845214
 
 
 
 
 
a988a6f
3845214
 
 
 
a988a6f
3845214
 
 
 
 
a988a6f
3845214
 
 
 
a988a6f
3845214
 
a988a6f
3845214
 
a988a6f
3845214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a988a6f
 
3845214
 
 
 
 
 
a988a6f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import json
import os
import tempfile

import requests
import torch
import torchaudio
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import FileResponse
from starlette.background import BackgroundTask
from starlette.middleware.cors import CORSMiddleware
from transformers import (
    Wav2Vec2Processor, Wav2Vec2ForCTC,
    AutoFeatureExtractor, AutoModelForAudioClassification
)

# Run the models on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Load config
# JSON is decoded explicitly as UTF-8 so the result does not depend on the
# platform's locale encoding. The path is relative to the working directory.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

# Settings read from config.json; a missing key raises KeyError at import time.
ELEVEN_API_KEY = config["eleven_api_key"]
VOICE_ID = config["eleven_voice_id"]
LLM_URL = config["llm_url"]



def load_audio(audio_path, target_sr=16000):
    """Load an audio file as a mono waveform sampled at ``target_sr`` Hz.

    Args:
        audio_path: Location of the audio file, passed to ``torchaudio.load``.
        target_sr: Sampling rate, in Hz, the waveform is resampled to.

    Returns:
        A ``(samples, sample_rate)`` tuple where ``samples`` is a NumPy array
        with the channel axis removed and ``sample_rate`` equals ``target_sr``.
    """
    wav, sr = torchaudio.load(audio_path)

    # Down-mix multi-channel audio by averaging across the channel axis.
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)

    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)

    # Drop only the leading channel axis. A bare squeeze() would also collapse
    # a clip holding a single sample into a 0-d scalar.
    return wav.squeeze(0).numpy(), target_sr


#  STT MODEL 
print("Loading STT model...")
# The processor and the CTC model come from the same checkpoint.
stt_processor = Wav2Vec2Processor.from_pretrained("facebook/mms-1b-all")
stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all")
stt_model = stt_model.to(DEVICE)
stt_model.eval()  # inference only
print("STT loaded")

def transcribe(audio_path):
    """Return the speech-to-text transcript of the audio file at ``audio_path``.

    The waveform is loaded with ``load_audio``, run through the STT model
    without gradient tracking, and decoded by taking the highest-scoring
    token at every frame. Surrounding whitespace is stripped.
    """
    samples, sample_rate = load_audio(audio_path)
    encoded = stt_processor(
        samples,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = encoded.input_values.to(DEVICE)

    with torch.no_grad():
        logits = stt_model(input_values).logits

    predicted_ids = logits.argmax(dim=-1)
    decoded = stt_processor.batch_decode(predicted_ids)
    return decoded[0].strip()


# EMOTION MODEL  #
print("Loading Emotion model...")
# The feature extractor and the classifier come from the same checkpoint.
emotion_extractor = AutoFeatureExtractor.from_pretrained(
    "superb/hubert-base-superb-er"
)
emotion_model = AutoModelForAudioClassification.from_pretrained(
    "superb/hubert-base-superb-er"
)
emotion_model = emotion_model.to(DEVICE)
emotion_model.eval()  # inference only
print("Emotion model loaded")

def get_emotion(audio_path):
    """Return the emotion label the classifier predicts for an audio file.

    The label is looked up in the model's ``id2label`` mapping using the
    index of the highest logit.
    """
    samples, sample_rate = load_audio(audio_path)
    features = emotion_extractor(
        samples, sampling_rate=sample_rate, return_tensors="pt"
    )
    input_values = features["input_values"].to(DEVICE)

    with torch.no_grad():
        logits = emotion_model(input_values).logits

    label_id = logits.argmax(dim=-1).item()
    return emotion_model.config.id2label[label_id]


#  LLM CALL  
def ask_llm(text):
    """Send ``text`` to the LLM service and return its reply.

    Args:
        text: The user query, posted as ``{"query": text}`` to ``LLM_URL``.

    Returns:
        The value stored under ``"answer"`` in the JSON response. If the
        response is JSON without a usable ``"answer"`` entry, the whole
        decoded payload is returned as a string. If the body is not JSON at
        all, the raw response text is returned.

    Raises:
        requests.RequestException: If the HTTP request itself fails or times
            out.
    """
    r = requests.post(LLM_URL, json={"query": text}, timeout=200)

    # Decode the body once. requests signals an undecodable body with a
    # ValueError subclass, so the fallback must not call r.json() again.
    try:
        data = r.json()
    except ValueError:
        return r.text

    try:
        return data["answer"]
    except (KeyError, TypeError):
        # KeyError: a dict without "answer". TypeError: the payload is not a
        # mapping (for example a list or a bare string).
        return str(data)


# TTS 
def tts_eleven(text, out_file="response.mp3", *, timeout=120):
    """Synthesize ``text`` with the ElevenLabs API and save the audio to disk.

    Args:
        text: The text to convert to speech.
        out_file: Path the returned audio bytes are written to.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        ``out_file``, once the audio has been written.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
    headers = {
        "xi-api-key": ELEVEN_API_KEY,
        "Content-Type": "application/json",
    }
    payload = {"text": text, "model_id": "eleven_multilingual_v2"}

    resp = requests.post(url, json=payload, headers=headers, timeout=timeout)
    if resp.status_code != 200:
        # RuntimeError is still caught by any existing `except Exception`.
        raise RuntimeError(f"ElevenLabs API Error: {resp.text}")

    with open(out_file, "wb") as f:
        f.write(resp.content)

    return out_file


# FASTAPI APP 
app = FastAPI(title="Voice AI API")

# CORS policy: every origin, method and header is accepted, with credentials.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive - confirm this is intended outside of development.
cors_options = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
    "allow_credentials": True,
}
app.add_middleware(CORSMiddleware, **cors_options)

@app.post("/process-audio/")
async def process_audio(file: UploadFile = File(...)):
    """Run the voice pipeline on an uploaded audio file.

    The upload is transcribed, the transcript is sent to the LLM, and the
    LLM reply is synthesized to speech.

    Args:
        file: The uploaded audio file.

    Returns:
        A ``FileResponse`` streaming the synthesized MP3 as ``response.mp3``.
    """
    # Only the extension of the client-supplied name is reused. The name is
    # untrusted: it may be missing, contain path separators, or collide with
    # another request, so the file itself gets a unique temporary name.
    suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
    upload_fd, audio_path = tempfile.mkstemp(prefix="temp_", suffix=suffix)
    try:
        with os.fdopen(upload_fd, "wb") as f:
            f.write(await file.read())

        # NOTE(review): these calls block the event loop while they run.
        transcript = transcribe(audio_path)
        # NOTE(review): the detected emotion is computed but not used yet.
        emotion = get_emotion(audio_path)
        llm_response = ask_llm(transcript)
    finally:
        # The upload is no longer needed once the models have read it.
        os.remove(audio_path)

    # Each request writes to its own output file so a later request cannot
    # overwrite audio that is still being streamed to an earlier client.
    out_fd, out_path = tempfile.mkstemp(prefix="response_", suffix=".mp3")
    os.close(out_fd)
    try:
        tts_file = tts_eleven(llm_response, out_file=out_path)
    except Exception:
        os.remove(out_path)
        raise

    return FileResponse(
        tts_file,
        media_type="audio/mpeg",
        filename="response.mp3",
        # Delete the generated audio after the response has been sent.
        background=BackgroundTask(os.remove, tts_file),
    )


@app.get("/")
async def root():
    """Return a short status message pointing clients at the upload endpoint."""
    message = "Voice AI API is running. Use /process-audio/ to upload audio."
    return {"message": message}