# File-viewer header residue: Mpavan45 · "Update app.py" · 8995acb (verified) · 2.42 kB
import os
import tempfile

import chromadb
import pandas as pd
import streamlit as st
import whisper
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Initialize Chroma DB.
# Streamlit re-executes this script from the top on every widget interaction,
# so the collection has to be fetched when it already exists: a plain
# create_collection() raises once "subtitles" has been created in this process.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="subtitles")
# Sidebar for subtitle upload
st.sidebar.header("📂 Upload Subtitle CSV")
subtitle_file = st.sidebar.file_uploader("Upload subtitle dataset (CSV)", type=["csv"])
if subtitle_file:
    # The uploaded file stays attached to the widget across reruns. Remember
    # which upload was already ingested so the CSV is not parsed and written
    # to the collection again on every interaction.
    upload_key = (subtitle_file.name, subtitle_file.size)
    if st.session_state.get("ingested_subtitle_file") != upload_key:
        # Read CSV and store in Chroma DB
        subtitle_df = pd.read_csv(subtitle_file)
        required_columns = ["cleaned_subtitle", "subtitle"]
        missing_columns = [
            column for column in required_columns if column not in subtitle_df.columns
        ]
        if missing_columns:
            st.sidebar.error(
                f"CSV is missing required column(s): {', '.join(missing_columns)}"
            )
        else:
            # Rows with an empty cell cannot be stored as a document/metadata pair.
            subtitle_df = subtitle_df.dropna(subset=required_columns)
            ids = [f"subtitle_{i}" for i in subtitle_df.index]
            documents = subtitle_df["cleaned_subtitle"].astype(str).tolist()
            metadatas = [
                {"subtitle": text} for text in subtitle_df["subtitle"].astype(str)
            ]
            if not ids:
                st.sidebar.warning("No usable subtitle rows found in the CSV.")
            else:
                # upsert (rather than add) keeps re-uploads of the same rows
                # from failing on already-present ids; writing in batches
                # avoids one round trip per row.
                batch_size = 1000
                for start in range(0, len(ids), batch_size):
                    stop = start + batch_size
                    collection.upsert(
                        ids=ids[start:stop],
                        documents=documents[start:stop],
                        metadatas=metadatas[start:stop],
                    )
                st.session_state["ingested_subtitle_file"] = upload_key
    if st.session_state.get("ingested_subtitle_file") == upload_key:
        st.sidebar.success("✅ Subtitles stored in Chroma DB permanently.")
# Function to transcribe video using Whisper
@st.cache_resource
def _load_whisper_model(model_name):
    """Load the named Whisper model once and reuse it across script reruns."""
    return whisper.load_model(model_name)


def video_to_text(video_path, model_name="small"):
    """Transcribe the audio track of a media file with Whisper.

    Args:
        video_path: Path of the media file handed to ``model.transcribe``.
        model_name: Whisper model to use; defaults to ``"small"``.

    Returns:
        The ``'text'`` entry of Whisper's transcription result.
    """
    # Loading the model is by far the slowest step, so it is cached instead of
    # being repeated for every uploaded video.
    model = _load_whisper_model(model_name)
    result = model.transcribe(video_path)
    return result['text']
# Function to match transcribed text with subtitles
def match_with_chroma(transcription, collection):
    """Return the stored subtitle that is most similar to ``transcription``.

    Every document in ``collection`` is compared with the transcription using
    TF-IDF vectors and cosine similarity; the ``'subtitle'`` metadata of the
    highest-scoring document is returned.

    Args:
        transcription: Text to look up.
        collection: Object whose ``get()`` returns a mapping with parallel
            ``'documents'`` and ``'metadatas'`` lists.

    Returns:
        The ``'subtitle'`` metadata value of the best-scoring document, or
        ``None`` when the collection holds no documents.
    """
    # Retrieve all subtitle texts from Chroma DB
    docs = collection.get()
    # 'documents' is already a flat list of strings, index-aligned with
    # 'metadatas'; a missing document is treated as empty text so the two
    # lists stay aligned.
    subtitles = [text or "" for text in (docs.get('documents') or [])]
    if not subtitles:
        # Nothing to compare against; the vectorizer/similarity step would fail.
        return None

    # Vectorization with TF-IDF (row 0 is the transcription)
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([transcription] + subtitles)

    # Cosine similarity of the transcription against every subtitle
    similarity_scores = cosine_similarity(vectors[0:1], vectors[1:])

    # Best match
    best_match_index = int(similarity_scores.argmax())
    return docs['metadatas'][best_match_index]['subtitle']
# Streamlit UI
st.title("🎥 Video Subtitle Extractor")

# Upload video
video_file = st.file_uploader("Upload a video", type=["mp4", "mkv", "avi"])
if video_file:
    if collection.count() == 0:
        # Without stored subtitles there is nothing to match against, so skip
        # the expensive transcription entirely.
        st.warning("Upload a subtitle CSV in the sidebar before processing a video.")
    else:
        with st.spinner("Processing video..."):
            # Save uploaded video temporarily. A unique temp file keeps
            # concurrent sessions from overwriting each other, and the original
            # extension is preserved so the container type is not mislabeled.
            suffix = os.path.splitext(video_file.name)[1] or ".mp4"
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
                # getvalue() returns the full upload regardless of the current
                # read position of the buffer.
                tmp.write(video_file.getvalue())
                video_path = tmp.name
            try:
                # Transcribe video
                transcription = video_to_text(video_path)

                # Find best-matching subtitle
                best_match = match_with_chroma(transcription, collection)
            finally:
                # Clean up temporary video file, even when processing fails
                os.remove(video_path)

        # Display results
        st.subheader("🔍 Extracted Subtitle")
        st.write(best_match)