Mpavan45 committed
Commit 2ce54cc · verified · 1 Parent(s): e0e692f

Update app.py

Files changed (1)
  1. app.py +65 -81
app.py CHANGED
@@ -1,86 +1,70 @@
  import streamlit as st
- import whisper
- import numpy as np
- import chromadb
  from sentence_transformers import SentenceTransformer
- from sklearn.metrics.pairwise import cosine_similarity
- import os
-
- # Set up the title and description
- st.title("🎥 AI-Powered Video Subtitle Extractor with Cosine Similarity & Chroma DB")
- st.write("Upload a video to extract speech, convert it to text, and find matching subtitles using cosine similarity.")
-
- # 🔥 Load Chroma DB
- chroma_path = "./chroma.sqlite3"  # Path to your local Chroma DB file
- chroma_client = chromadb.PersistentClient(path=chroma_path)
- collection = chroma_client.get_collection(name="subtitle_chunk1")
-
- # Load embedding model
- embedder = SentenceTransformer('all-MiniLM-L6-v2')
-
- # Whisper model
- model = whisper.load_model("base")
-
- # Function to extract subtitles using cosine similarity
- def find_matching_subtitles(transcribed_text, top_k=5):
-     # Generate embedding for transcribed text
-     query_embedding = embedder.encode(transcribed_text).reshape(1, -1)
-
-     # Retrieve all stored subtitles from Chroma DB
-     results = collection.get()
-     all_embeddings = np.array(results['embeddings'])
-     all_documents = results['documents']
-     all_metadata = results['metadatas']
-
-     # Calculate cosine similarity
-     similarities = cosine_similarity(query_embedding, all_embeddings)[0]
-
-     # Get top K matches
-     top_indices = np.argsort(similarities)[-top_k:][::-1]
-
-     # Display matching subtitles
-     matching_subtitles = []
-     for idx in top_indices:
-         matching_subtitles.append({
-             "subtitle": all_documents[idx],
-             "similarity": similarities[idx],
-             "metadata": all_metadata[idx]
-         })
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import WordNetLemmatizer
+ import chromadb

-     return matching_subtitles
+ # Download NLTK resources if not available
+ nltk.download('punkt')
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+
+ # Load the SentenceTransformer model
+ @st.cache_resource
+ def load_model():
+     return SentenceTransformer('all-MiniLM-L6-v2')
+
+ model = load_model()
+
+ # Connect to ChromaDB client
+ @st.cache_resource
+ def get_chroma_collection():
+     try:
+         client = chromadb.PersistentClient(path="vectordb")
+         return client.get_collection("searchengine1")
+     except Exception as e:
+         st.error(f"Database connection failed: {e}")
+         return None
+
+ collection = get_chroma_collection()
+
+ # Function to clean and preprocess text
+ def clean_text(text):
+     text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
+     tokens = word_tokenize(text)
+     stop_words = set(stopwords.words('english'))
+     lemmatizer = WordNetLemmatizer()
+     clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
+     return ' '.join(clean_tokens).strip()

  # Streamlit UI
- uploaded_file = st.file_uploader("Upload Video", type=["mp4", "mkv", "avi", "mov"])
-
- if uploaded_file:
-     # Save uploaded video temporarily
-     temp_video_path = os.path.join("temp_video", "uploaded_video.mp4")
-     os.makedirs("temp_video", exist_ok=True)
-
-     with open(temp_video_path, "wb") as f:
-         f.write(uploaded_file.read())
-
-     # Transcribe video speech
-     st.info("⏳ Transcribing video speech...")
-     transcription = model.transcribe(temp_video_path)
-     transcribed_text = transcription['text']
-
-     st.success("✅ Transcription complete!")
-     st.write("### Transcribed Speech:")
-     st.write(transcribed_text)
-
-     # Find matching subtitles
-     st.info("🔍 Finding matching subtitles...")
-     matching_subtitles = find_matching_subtitles(transcribed_text)
-
-     st.write("### 🎯 Matching Subtitles:")
-     for match in matching_subtitles:
-         st.write(f"**Subtitle:** {match['subtitle']}")
-         st.write(f"**Similarity:** {match['similarity']:.4f}")
-         st.write(f"**Metadata:** {match['metadata']}")
-         st.write("---")
-
-     # Clean up temporary video
-     os.remove(temp_video_path)
-
- st.sidebar.write("🔎 Upload a video to extract and match subtitles using Cosine Similarity & Chroma DB.")
+ st.title("🔍 Semantic Search Engine")
+ st.write("Enter your query below to search relevant documents.")
+
+ query = st.text_input("Search Query:", "")
+
+ if query and collection:
+     with st.spinner("Searching..."):
+         cleaned_query = clean_text(query)
+         query_embedding = model.encode([cleaned_query])
+
+         # Perform the search query
+         results = collection.query(
+             query_embeddings=query_embedding,
+             n_results=5,
+             include=['documents']
+         )
+
+         documents = results.get('documents', [])
+
+         # Display results
+         if documents:
+             st.subheader("🔹 Search Results:")
+             for i, query_documents in enumerate(documents):
+                 for j, document in enumerate(query_documents):
+                     st.markdown(f"**{j+1}.** {document}")
+         else:
+             st.warning("No results found. Try a different query.")
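
The updated app.py only queries an existing collection: it assumes a vectordb directory containing a populated searchengine1 collection ships with the Space. As context, a collection like that could be populated along the following lines; this is a hypothetical sketch, and the subtitles.txt input file and the doc-id scheme are illustrative assumptions, not part of this commit.

# Hypothetical ingestion sketch -- not part of this commit.
# Assumes one document per line in subtitles.txt (illustrative file name).
import chromadb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
client = chromadb.PersistentClient(path="vectordb")
collection = client.get_or_create_collection("searchengine1")

with open("subtitles.txt", encoding="utf-8") as f:
    docs = [line.strip() for line in f if line.strip()]

embeddings = model.encode(docs)  # one embedding vector per document
collection.add(
    ids=[f"doc-{i}" for i in range(len(docs))],  # ChromaDB requires unique string ids
    documents=docs,
    embeddings=embeddings.tolist(),
)
print(f"Stored {collection.count()} documents in 'searchengine1'")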
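The retrieval path the new app.py uses (encode the query, then collection.query with query_embeddings) can also be exercised outside Streamlit for a quick sanity check. This is a minimal sketch under the same assumptions as the app (the all-MiniLM-L6-v2 model and the searchengine1 collection under vectordb); the simplified regex cleaning stands in for the app's NLTK-based clean_text, and the sample query string is made up.

# Minimal headless sketch of the app's query path (sample query is illustrative).
import re
import chromadb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
client = chromadb.PersistentClient(path="vectordb")
collection = client.get_collection("searchengine1")

def search(query, n_results=5):
    # Loose stand-in for the app's clean_text(): lowercase and drop punctuation.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', query.lower())
    embedding = model.encode([cleaned])  # shape (1, dim)
    results = collection.query(
        query_embeddings=embedding.tolist(),
        n_results=n_results,
        include=['documents'],
    )
    # results['documents'] holds one list of documents per query embedding.
    return results['documents'][0]

for rank, doc in enumerate(search("space adventure movie"), start=1):
    print(f"{rank}. {doc}")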