ytrsoymr committed on
Commit
ae7eaf9
·
verified ·
1 Parent(s): 38b0403

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +50 -0
  2. config.py +17 -0
  3. requirements.txt +16 -0
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import assemblyai as aai
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_chroma import Chroma
5
+ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, ASSEMBLYAI_API_KEY
6
+ import tempfile
7
+
8
# Initialize AssemblyAI
aai.settings.api_key = ASSEMBLYAI_API_KEY
transcriber = aai.Transcriber()


@st.cache_resource
def _load_embeddings():
    """Build the HuggingFace embedding model named by EMBEDDING_MODEL."""
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)


@st.cache_resource
def _load_vector_store():
    """Open the Chroma store persisted at CHROMA_DB_PATH using the shared embeddings."""
    return Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=_load_embeddings(),
    )


# Streamlit re-executes this script on every widget interaction; going through
# st.cache_resource keeps one embedding model and one Chroma handle alive
# instead of rebuilding both on each rerun.
embeddings = _load_embeddings()
db = _load_vector_store()
17
+
18
def transcribe_audio(audio_path):
    """Convert audio to text using AssemblyAI.

    Args:
        audio_path: Path of the audio/video file passed to
            ``transcriber.transcribe``.

    Returns:
        The transcript text, or an empty string when no transcript or no
        text came back, so callers always receive a ``str``.
    """
    transcript = transcriber.transcribe(audio_path)
    # The transcript object itself can be truthy while its ``text`` is None
    # (presumably when the job finished in an error state -- confirm against
    # the SDK docs). Checking only the object let that None leak to callers.
    text = transcript.text if transcript else None
    return text or ""
22
+
23
def retrieve_similar_chunks(query: str, k=5):
    """Look up the k chunks in ChromaDB most similar to ``query``.

    Returns a list of ``(num, page_content)`` tuples, one per hit, where
    ``num`` is read from the hit's metadata under the ``'num'`` key.
    """
    hits = db.similarity_search(query, k=k)
    pairs = []
    for hit in hits:
        subtitle_num = hit.metadata['num']
        pairs.append((subtitle_num, hit.page_content))
    return pairs
27
+
28
# Streamlit UI: upload a clip, transcribe it, then list the closest subtitle chunks.
st.title("Video Subtitle Search Engine")

uploaded_file = st.file_uploader("Upload an audio/video file", type=["mp3", "wav", "mp4"])
num_results = st.slider("Number of results", 1, 10, 5)

if uploaded_file:
    import os  # local import: only needed to clean up the temporary copy

    # delete=False so the file can be reopened by path once this handle is
    # closed; the finally block below takes over responsibility for removal.
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_path = tmp_file.name
    try:
        with tmp_file:
            tmp_file.write(uploaded_file.read())

        st.write("Transcribing audio...")
        query_text = transcribe_audio(tmp_path)
    finally:
        # Without this, every upload (and every rerun) left a file behind.
        os.remove(tmp_path)

    st.write("Transcription:", query_text)

    if query_text:
        st.write("Searching for relevant subtitles...")
        results = retrieve_similar_chunks(query_text, num_results)

        for num, content in results:
            st.markdown(f"**Subtitle ID:** [{num}](https://www.opensubtitles.org/en/subtitles/{num})")
            st.write(content)
            st.markdown("---")
config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
# Read the .env file into the process environment before any key is looked up.
load_dotenv()

# Google API key taken from the environment (None when the variable is unset).
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Name of the embedding model.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Path of the ChromaDB directory.
CHROMA_DB_PATH = "./chroma_db"

# AssemblyAI API key taken from the environment (None when the variable is unset).
ASSEMBLYAI_API_KEY = os.environ.get("ASSEMBLYAI_API_KEY")
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ ipykernel
3
+ langchain
4
+ langchain-community
5
+ python-dotenv
6
+ torch
7
+ torchaudio
8
+ transformers
9
+ numpy
10
+ streamlit
11
+ sentence-transformers
12
+ chromadb
13
+ scikit-learn
14
+ langchain_huggingface
15
+ langchain_chroma
16
+ assemblyai