Update app.py
app.py CHANGED
@@ -1,86 +1,70 @@
 import streamlit as st
-import whisper
-import numpy as np
-import chromadb
 from sentence_transformers import SentenceTransformer
-
-import
-
-
-
-
-
-# 🔥 Load Chroma DB
-chroma_path = "./chroma.sqlite3" # Path to your local Chroma DB file
-chroma_client = chromadb.PersistentClient(path=chroma_path)
-collection = chroma_client.get_collection(name="subtitle_chunk1")
-
-# Load embedding model
-embedder = SentenceTransformer('all-MiniLM-L6-v2')
-
-# Whisper model
-model = whisper.load_model("base")
-
-# Function to extract subtitles using cosine similarity
-def find_matching_subtitles(transcribed_text, top_k=5):
-    # Generate embedding for transcribed text
-    query_embedding = embedder.encode(transcribed_text).reshape(1, -1)
-
-    # Retrieve all stored subtitles from Chroma DB
-    results = collection.get()
-    all_embeddings = np.array(results['embeddings'])
-    all_documents = results['documents']
-    all_metadata = results['metadatas']
-
-    # Calculate cosine similarity
-    similarities = cosine_similarity(query_embedding, all_embeddings)[0]
-
-    # Get top K matches
-    top_indices = np.argsort(similarities)[-top_k:][::-1]
-
-    # Display matching subtitles
-    matching_subtitles = []
-    for idx in top_indices:
-        matching_subtitles.append({
-            "subtitle": all_documents[idx],
-            "similarity": similarities[idx],
-            "metadata": all_metadata[idx]
-        })
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import chromadb
 
-
+# Download NLTK resources if not available
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+
+# Load the SentenceTransformer model
+@st.cache_resource
+def load_model():
+    return SentenceTransformer('all-MiniLM-L6-v2')
+
+model = load_model()
+
+# Connect to ChromaDB client
+@st.cache_resource
+def get_chroma_collection():
+    try:
+        client = chromadb.PersistentClient(path="vectordb")
+        return client.get_collection("searchengine1")
+    except Exception as e:
+        st.error(f"Database connection failed: {e}")
+        return None
+
+collection = get_chroma_collection()
+
+# Function to clean and preprocess text
+def clean_text(text):
+    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
+    tokens = word_tokenize(text)
+    stop_words = set(stopwords.words('english'))
+    lemmatizer = WordNetLemmatizer()
+    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
+    return ' '.join(clean_tokens).strip()
 
 # Streamlit UI
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        st.write(f"**Metadata:** {match['metadata']}")
-        st.write("---")
-
-    # Clean up temporary video
-    os.remove(temp_video_path)
-
-st.sidebar.write("🔍 Upload a video to extract and match subtitles using Cosine Similarity & Chroma DB.")
+st.title("🔍 Semantic Search Engine")
+st.write("Enter your query below to search relevant documents.")
+
+query = st.text_input("Search Query:", "")
+
+if query and collection:
+    with st.spinner("Searching..."):
+        cleaned_query = clean_text(query)
+        query_embedding = model.encode([cleaned_query])
+
+        # Perform the search query
+        results = collection.query(
+            query_embeddings=query_embedding,
+            n_results=5,
+            include=['documents']
+        )
+
+        documents = results.get('documents', [])
+
+        # Display results
+        if documents:
+            st.subheader("🔹 Search Results:")
+            for i, query_documents in enumerate(documents):
+                for j, document in enumerate(query_documents):
+                    st.markdown(f"**{j+1}.** {document}")
+        else:
+            st.warning("No results found. Try a different query.")
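
Note on the removed implementation: it embeds the query, pulls every stored vector back with collection.get(), and ranks them with cosine similarity in NumPy. Its import block did not survive extraction (old line 7 is a truncated import statement, and old lines 53-79 are blank in the rendered diff), and as written the code would need at least import os plus a cosine_similarity import to run; depending on the Chroma version, collection.get() may also need include=['embeddings', 'documents', 'metadatas'] for the embeddings to come back non-None. A minimal self-contained sketch of that ranking step, assuming scikit-learn was the intended source of cosine_similarity:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def rank_by_cosine(query_embedding, all_embeddings, top_k=5):
    # query_embedding: shape (1, d); all_embeddings: shape (n, d)
    similarities = cosine_similarity(query_embedding, all_embeddings)[0]
    # argsort is ascending, so take the last top_k indices and reverse to descending
    return np.argsort(similarities)[-top_k:][::-1], similarities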
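On the added setup code: nltk.download() re-runs on every app start, which is slow and noisy in the logs. A hedged variant (quiet=True is a standard nltk.download flag; the punkt_tab resource is only needed by word_tokenize on newer NLTK releases):

nltk.download('punkt', quiet=True)       # tokenizer models
nltk.download('punkt_tab', quiet=True)   # required by word_tokenize on recent NLTK versions
nltk.download('stopwords', quiet=True)   # English stopword list
nltk.download('wordnet', quiet=True)     # data for WordNetLemmatizer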
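For reference, clean_text lowercases, strips punctuation, drops English stopwords, and lemmatizes, so the embedded text is a reduced form of the raw query. A quick check of the expected behaviour (the output comment is what the code above should produce):

print(clean_text("The Cats are running!"))  # -> "cat running" ("the"/"are" are stopwords; "cats" lemmatizes to "cat")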
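Finally, collection.query returns each included field nested one list per query embedding, which is why the display code loops twice. Note that with zero hits, results['documents'] comes back as [[]] (truthy), so the warning branch may never fire; checking results['documents'][0] would be the more accurate emptiness test. A short sketch of inspecting a result, assuming the same populated collection:

results = collection.query(
    query_embeddings=query_embedding,
    n_results=5,
    include=['documents', 'distances']
)
# one inner list per query embedding; this app sends a single query
for doc, dist in zip(results['documents'][0], results['distances'][0]):
    print(f"{dist:.3f}  {doc}")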