import streamlit as st
import os
import json
import wave

import faiss
import numpy as np
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# ------------------------------
# 1. LOAD MODELS OFFLINE
# ------------------------------
@st.cache_resource
def load_stt_model():
    # Local Vosk model folder (downloaded once, then used fully offline)
    model_path = "vosk-model-small-en-us-0.15"
    return Model(model_path)

@st.cache_resource
def load_embedding_model():
    return SentenceTransformer("all-MiniLM-L6-v2")

@st.cache_resource
def load_qa_model():
    return pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

stt_model = load_stt_model()
embedder = load_embedding_model()
qa_model = load_qa_model()

# ------------------------------
# 2. FUNCTIONS
# ------------------------------
def transcribe_audio(file_path):
    """Transcribe a mono 16-bit PCM WAV file with Vosk, fully offline."""
    wf = wave.open(file_path, "rb")
    rec = KaldiRecognizer(stt_model, wf.getframerate())
    rec.SetWords(True)

    text_result = ""
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            res = json.loads(rec.Result())
            text_result += res.get("text", "") + " "

    final_res = json.loads(rec.FinalResult())
    text_result += final_res.get("text", "")
    wf.close()
    return text_result.strip()

def convert_to_wav(uploaded_file):
    """Convert any uploaded audio to mono 16 kHz 16-bit WAV, the format Vosk expects."""
    audio = AudioSegment.from_file(uploaded_file)
    audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)
    output_path = "temp.wav"
    audio.export(output_path, format="wav")
    return output_path

def save_text(text):
    os.makedirs("transcripts", exist_ok=True)
    with open("transcripts/data.txt", "a", encoding="utf-8") as f:
        f.write(text + "\n")

def build_vector_db():
    """Embed every stored transcript line and index it in FAISS (L2 distance)."""
    with open("transcripts/data.txt", "r", encoding="utf-8") as f:
        docs = [line.strip() for line in f if line.strip()]
    embeddings = embedder.encode(docs)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings).astype("float32"))
    return docs, index

def retrieve(query, docs, index, top_k=3):
    """Return the top_k most similar transcript lines, joined into one context string."""
    q_emb = embedder.encode([query])
    D, I = index.search(np.array(q_emb).astype("float32"), top_k)
    results = [docs[i] for i in I[0] if i != -1]
    return " ".join(results)

# ------------------------------
# 3. STREAMLIT UI
# ------------------------------
st.title("🔴 Offline GenAI RAG from Audio (No API • No Internet)")
st.write("🎤 Upload or record audio → 📄 Convert to text → 🤖 Ask questions offline")

menu = st.sidebar.radio("Navigation", ["Upload Audio", "Ask Questions"])

# -----------------------------------
# UPLOAD AUDIO PAGE
# -----------------------------------
if menu == "Upload Audio":
    st.header("🎤 Upload or Record Audio")
    audio_file = st.file_uploader("Upload audio file", type=["wav", "mp3", "m4a"])

    if audio_file:
        st.success("File uploaded successfully")
        wav_path = convert_to_wav(audio_file)

        st.info("Transcribing offline... please wait")
        text = transcribe_audio(wav_path)

        st.subheader("📝 Transcribed Text")
        st.write(text)

        save_text(text)
        st.success("Saved locally in transcripts/ folder")

# -----------------------------------
# ASK QUESTION PAGE
# -----------------------------------
if menu == "Ask Questions":
    st.header("❓ Ask Questions From Your Audio Knowledge Base")

    if not os.path.exists("transcripts/data.txt"):
        st.warning("No transcripts found yet. Upload and transcribe some audio first.")
    else:
        docs, index = build_vector_db()
        user_q = st.text_input("Enter your question")

        if st.button("Get Answer"):
            context = retrieve(user_q, docs, index)
            result = qa_model(question=user_q, context=context)

            st.subheader("🧠 Answer")
            st.write(result["answer"])
            st.caption("Based only on your stored audio transcriptions")
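
# -----------------------------------
# RUNNING THE APP
# -----------------------------------
# A minimal sketch of the offline setup, assuming this listing is saved as app.py
# (the filename is an assumption, not prescribed above):
#   1. Download and unzip the Vosk model folder "vosk-model-small-en-us-0.15"
#      next to app.py (the path used in load_stt_model).
#   2. Install ffmpeg so pydub can decode mp3/m4a uploads.
#   3. pip install streamlit vosk pydub faiss-cpu numpy sentence-transformers transformers torch
#   4. streamlit run app.py
# The SentenceTransformer and DistilBERT weights are fetched on first use, so run
# the app once while online (or pre-download them) before going fully offline.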