File size: 2,279 Bytes
a6f123f
a902cdb
 
a6f123f
 
 
a902cdb
 
 
 
a6f123f
 
a902cdb
a6f123f
a902cdb
a6f123f
 
a902cdb
 
 
a6f123f
 
a902cdb
a6f123f
 
 
b77f089
 
a902cdb
a6f123f
a902cdb
b77f089
a6f123f
b77f089
 
 
 
 
a6f123f
b77f089
a6f123f
a902cdb
b77f089
 
 
a6f123f
a902cdb
b77f089
 
 
 
 
a6f123f
a902cdb
b77f089
 
 
 
a6f123f
b77f089
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import streamlit as st
import whisper
from transformers import AutoModelForCausalLM, AutoTokenizer
from gtts import gTTS
import os

# Hugging Face access token consumed by load_llama_model(). NOTE(review):
# Llama-2 is a gated model on the Hub, so a token from an account that has
# accepted Meta's license is normally required — confirm before deploying.
HF_AUTH_TOKEN = ""  # Replace with your token if needed; leave empty for public models

# Load Whisper Model
@st.cache_resource
def load_whisper_model(model_size: str = "base"):
    """Load and cache an OpenAI Whisper speech-to-text model.

    Args:
        model_size: Whisper checkpoint name (e.g. "tiny", "base", "small").
            Defaults to "base", matching the previously hard-coded choice,
            so existing callers are unaffected.

    Returns:
        The loaded Whisper model instance.
    """
    # st.cache_resource caches one instance per distinct model_size value,
    # so repeated Streamlit reruns do not reload the weights.
    return whisper.load_model(model_size)

# Load Llama-2 Model
@st.cache_resource
def load_llama_model():
    """Load and cache the Llama-2 chat tokenizer and model.

    Returns:
        A ``(tokenizer, model)`` tuple for "meta-llama/Llama-2-7b-chat-hf".
    """
    model_name = "meta-llama/Llama-2-7b-chat-hf"  # Official Llama-2 model from Meta
    # `use_auth_token` is deprecated in recent transformers releases; the
    # replacement keyword is `token`. Pass None (not "") when no token is
    # configured so the Hub client cleanly falls back to anonymous access.
    auth_token = HF_AUTH_TOKEN or None
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=auth_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, token=auth_token, torch_dtype="auto"
    )
    return tokenizer, model

# Initialize models
# Executed at import time, before main() runs. The @st.cache_resource
# decorators on the loaders mean Streamlit reruns reuse the cached objects
# instead of reloading the (large) weights on every interaction.
whisper_model = load_whisper_model()
llama_tokenizer, llama_model = load_llama_model()

# Streamlit App
def main():
    """Streamlit UI: upload audio -> transcribe -> Llama-2 reply -> spoken answer.

    Side effects: writes ``input_audio.<ext>`` and ``response_audio.mp3``
    into the working directory.
    """
    st.title("Audio Query App with Llama-2 and Whisper")

    # File upload
    uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "m4a"])

    # Guard clause: nothing to do until a file is provided.
    if uploaded_file is None:
        return

    # Keep the upload's real extension. The original always wrote ".wav"
    # and played it as "audio/wav", mislabeling mp3/m4a uploads.
    ext = os.path.splitext(uploaded_file.name)[1].lower() or ".wav"
    input_audio_path = f"input_audio{ext}"
    with open(input_audio_path, "wb") as f:
        f.write(uploaded_file.read())

    mime = {
        ".wav": "audio/wav",
        ".mp3": "audio/mp3",
        ".m4a": "audio/mp4",  # m4a is an MPEG-4 audio container
    }.get(ext, "audio/wav")
    st.audio(input_audio_path, format=mime)

    # Step 1: Transcribe audio
    with st.spinner("Transcribing audio..."):
        transcription = whisper_model.transcribe(input_audio_path)["text"]
    st.write(f"**Transcription:** {transcription}")

    # Step 2: Generate response using Llama-2
    with st.spinner("Generating response..."):
        inputs = llama_tokenizer(transcription, return_tensors="pt")
        # max_new_tokens bounds only the generated continuation;
        # max_length=150 also counted the prompt tokens, silently
        # shrinking (or zeroing) replies to long transcriptions.
        outputs = llama_model.generate(**inputs, max_new_tokens=150)
        # Decode only the newly generated tokens: decoding outputs[0] in
        # full would echo the prompt back at the start of the response.
        prompt_len = inputs["input_ids"].shape[1]
        response_text = llama_tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        )
    st.write(f"**Response:** {response_text}")

    # Step 3: Convert text response to audio
    with st.spinner("Converting response to audio..."):
        response_audio_path = "response_audio.mp3"
        tts = gTTS(text=response_text, lang="en")
        tts.save(response_audio_path)

    st.audio(response_audio_path, format="audio/mp3")

if __name__ == "__main__":
    main()