Mpavan45 committed on
Commit
73e6346
·
verified ·
1 Parent(s): d982eac

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -0
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import chromadb
import numpy as np
import streamlit as st
import whisper
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
8
+
9
# Set up the title and description
st.title("🎥 AI-Powered Video Subtitle Extractor with Cosine Similarity & Chroma DB")
st.write("Upload a video to extract speech, convert it to text, and find matching subtitles using cosine similarity.")

# Streamlit re-executes this script from the top on every user interaction.
# The loaders below are wrapped in st.cache_resource so the Chroma client and
# both models are created once per server process instead of on every rerun.

# NOTE(review): chromadb.PersistentClient expects the *directory* that holds
# chroma.sqlite3, not the sqlite file itself - confirm this path against the
# deployed layout.
chroma_path = "./chroma.sqlite3"  # Path to your local Chroma DB


@st.cache_resource
def _load_collection(path, name):
    """Open the persistent Chroma store at *path* and fetch collection *name*."""
    client = chromadb.PersistentClient(path=path)
    return client, client.get_collection(name=name)


@st.cache_resource
def _load_embedder(model_name):
    """Load the sentence-transformers model used to embed text."""
    return SentenceTransformer(model_name)


@st.cache_resource
def _load_whisper(model_size):
    """Load the Whisper speech-to-text model of the given size."""
    return whisper.load_model(model_size)


# Load Chroma DB
chroma_client, collection = _load_collection(chroma_path, "subtitle_chunk1")

# Load embedding model
embedder = _load_embedder('all-MiniLM-L6-v2')

# Whisper model
model = _load_whisper("base")
23
+
24
# Function to extract subtitles using cosine similarity
def find_matching_subtitles(transcribed_text, top_k=5):
    """Return the stored subtitle chunks most similar to the transcribed text.

    The text is embedded with the module-level ``embedder`` and compared, by
    cosine similarity, against every embedding stored in the module-level
    Chroma ``collection``.

    Args:
        transcribed_text: Text to match against the stored subtitles.
        top_k: Maximum number of matches to return.

    Returns:
        A list of at most ``top_k`` dicts ordered from most to least similar,
        each with the keys ``"subtitle"``, ``"similarity"`` and
        ``"metadata"``. The list is empty when ``top_k`` is not positive or
        when the collection holds no embeddings.
    """
    if top_k <= 0:
        # Slicing with ``[-0:]`` would select every row rather than none.
        return []

    # Chroma's get() leaves embeddings out unless they are requested
    # explicitly, so ask for all three fields used below.
    results = collection.get(include=["embeddings", "documents", "metadatas"])
    stored_embeddings = results.get("embeddings")
    if stored_embeddings is None or len(stored_embeddings) == 0:
        # Nothing to compare against; cosine_similarity would raise on this.
        return []

    all_embeddings = np.asarray(stored_embeddings)
    all_documents = results["documents"]
    all_metadata = results["metadatas"]

    # Generate the query embedding as a single-row 2-D array.
    query_embedding = np.asarray(embedder.encode(transcribed_text)).reshape(1, -1)

    # One similarity score per stored embedding.
    similarities = cosine_similarity(query_embedding, all_embeddings)[0]

    # argsort is ascending: take the last top_k indices, then reverse them so
    # the best match comes first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return [
        {
            "subtitle": all_documents[idx],
            "similarity": float(similarities[idx]),
            "metadata": all_metadata[idx],
        }
        for idx in top_indices
    ]
51
+
52
# Streamlit UI
uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mkv", "avi", "mov"])

if uploaded_file:
    # Save the upload to a uniquely named temporary file so that concurrent
    # sessions cannot overwrite each other's video, keeping the original
    # extension instead of always assuming .mp4.
    os.makedirs("temp_video", exist_ok=True)
    suffix = os.path.splitext(uploaded_file.name)[1].lower() or ".mp4"
    with tempfile.NamedTemporaryFile(
        dir="temp_video", suffix=suffix, delete=False
    ) as f:
        # getvalue() returns the full content regardless of the current
        # read position of the uploaded file object.
        f.write(uploaded_file.getvalue())
        temp_video_path = f.name

    try:
        # Transcribe video speech
        st.info("⏳ Transcribing video speech...")
        transcription = model.transcribe(temp_video_path)
        transcribed_text = transcription['text']

        st.success("✅ Transcription complete!")
        st.write("### Transcribed Speech:")
        st.write(transcribed_text)

        # Find matching subtitles
        st.info("🔍 Finding matching subtitles...")
        matching_subtitles = find_matching_subtitles(transcribed_text)

        st.write("### 🎯 Matching Subtitles:")
        if not matching_subtitles:
            st.warning("No matching subtitles found.")
        for match in matching_subtitles:
            st.write(f"**Subtitle:** {match['subtitle']}")
            st.write(f"**Similarity:** {match['similarity']:.4f}")
            st.write(f"**Metadata:** {match['metadata']}")
            st.write("---")
    finally:
        # Clean up the temporary video even when transcription or matching
        # fails, so failed runs do not leave files behind.
        os.remove(temp_video_path)

st.sidebar.write("🔎 Upload a video to extract and match subtitles using Cosine Similarity & Chroma DB.")