ytrsoymr committed on
Commit
1c454de
·
verified ·
1 Parent(s): 541c97b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -50
app.py CHANGED
@@ -1,50 +1,58 @@
1
- import streamlit as st
2
- import assemblyai as aai
3
- from langchain_huggingface import HuggingFaceEmbeddings
4
- from langchain_chroma import Chroma
5
- from config import CHROMA_DB_PATH, EMBEDDING_MODEL, ASSEMBLYAI_API_KEY
6
- import tempfile
7
-
8
- # Initialize AssemblyAI
9
- aai.settings.api_key = ASSEMBLYAI_API_KEY
10
- transcriber = aai.Transcriber()
11
-
12
- # Load embeddings model
13
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
14
-
15
- # Load ChromaDB
16
- db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embeddings)
17
-
18
- def transcribe_audio(audio_path):
19
- """Convert audio to text using AssemblyAI."""
20
- transcript = transcriber.transcribe(audio_path)
21
- return transcript.text if transcript else ""
22
-
23
- def retrieve_similar_chunks(query: str, k=5):
24
- """Retrieve top-k most relevant document chunks from ChromaDB."""
25
- results = db.similarity_search(query, k=k)
26
- return [(doc.metadata['num'], doc.page_content) for doc in results]
27
-
28
- # Streamlit UI
29
- st.title("Video Subtitle Search Engine")
30
-
31
- uploaded_file = st.file_uploader("Upload an audio/video file", type=["mp3", "wav", "mp4"])
32
- num_results = st.slider("Number of results", 1, 10, 5)
33
-
34
- if uploaded_file:
35
- with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
36
- tmp_file.write(uploaded_file.read())
37
- tmp_path = tmp_file.name
38
-
39
- st.write("Transcribing audio...")
40
- query_text = transcribe_audio(tmp_path)
41
- st.write("Transcription:", query_text)
42
-
43
- if query_text:
44
- st.write("Searching for relevant subtitles...")
45
- results = retrieve_similar_chunks(query_text, num_results)
46
-
47
- for num, content in results:
48
- st.markdown(f"**Subtitle ID:** [{num}](https://www.opensubtitles.org/en/subtitles/{num})")
49
- st.write(content)
50
- st.markdown("---")
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import assemblyai as aai
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from langchain_chroma import Chroma
5
+ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, ASSEMBLYAI_API_KEY
6
+ import tempfile
7
+
8
+ # Initialize AssemblyAI
9
+ aai.settings.api_key = ASSEMBLYAI_API_KEY
10
+ transcriber = aai.Transcriber()
11
+
12
+ # Load embeddings model
13
+ embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
14
+
15
+ # Load ChromaDB
16
+ db = Chroma(persist_directory=CHROMA_DB_PATH, embedding_function=embeddings)
17
+
18
+ def transcribe_audio(audio_path):
19
+ """Convert audio to text using AssemblyAI."""
20
+ transcript = transcriber.transcribe(audio_path)
21
+ return transcript.text if transcript else ""
22
+
23
+ def retrieve_similar_chunks(query: str, k=5):
24
+ """Retrieve top-k most relevant document chunks from ChromaDB."""
25
+ results = db.similarity_search(query, k=k)
26
+ return [(doc.metadata['num'], doc.page_content) for doc in results]
27
+
28
+ # Streamlit UI
29
+ st.set_page_config(page_title="Video Subtitle Search", layout="wide")
30
+ st.title("🎬 Video Subtitle Search Engine")
31
+ st.markdown("Upload an audio/video file to transcribe and find relevant subtitles.")
32
+
33
+ uploaded_file = st.file_uploader("📤 Upload an audio/video file", type=["mp3", "wav", "mp4"])
34
+ num_results = st.slider("🔍 Number of search results", 1, 10, 5)
35
+
36
+ if uploaded_file:
37
+ with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
38
+ tmp_file.write(uploaded_file.read())
39
+ tmp_path = tmp_file.name
40
+
41
+ with st.spinner("Transcribing audio... This may take a moment ⏳"):
42
+ query_text = transcribe_audio(tmp_path)
43
+
44
+ if query_text:
45
+ st.success("✅ Transcription completed!")
46
+ st.text_area("📝 Transcribed Text", query_text, height=150)
47
+
48
+ st.subheader("🔎 Searching for relevant subtitles...")
49
+ results = retrieve_similar_chunks(query_text, num_results)
50
+
51
+ if results:
52
+ for num, content in results:
53
+ with st.expander(f"📌 Subtitle ID: {num}"):
54
+ st.write(content)
55
+ st.markdown(f"[View on OpenSubtitles](https://www.opensubtitles.org/en/subtitles/{num})")
56
+ st.markdown("---")
57
+ else:
58
+ st.warning("⚠️ No relevant subtitles found. Try a different audio file or increase search results.")