import streamlit as st
import pandas as pd
import chromadb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import whisper
import os

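# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py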

# Use a persistent client so stored subtitles survive app restarts;
# get_or_create_collection avoids a "collection already exists" error
# when Streamlit reruns the script on each interaction.
chroma_client = chromadb.PersistentClient(path="chroma_db")
collection = chroma_client.get_or_create_collection(name="subtitles")

st.sidebar.header("📁 Upload Subtitle CSV")
subtitle_file = st.sidebar.file_uploader("Upload subtitle dataset (CSV)", type=["csv"])

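# The CSV is expected to provide a raw "subtitle" column plus a
# preprocessed "cleaned_subtitle" column (the column names used below).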
if subtitle_file:
    subtitle_df = pd.read_csv(subtitle_file)

    # Insert all rows in one batched call; upsert() (rather than add())
    # makes re-uploading the same CSV idempotent instead of raising on
    # duplicate IDs.
    collection.upsert(
        documents=subtitle_df["cleaned_subtitle"].tolist(),
        metadatas=[{"subtitle": s} for s in subtitle_df["subtitle"]],
        ids=[f"subtitle_{i}" for i in subtitle_df.index],
    )
    st.sidebar.success("✅ Subtitles stored persistently in Chroma DB.")

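# Cache the Whisper model with st.cache_resource so Streamlit loads it
# once per session instead of on every transcription/rerun. The "small"
# model size is kept from the original code.
@st.cache_resource
def load_whisper_model():
    return whisper.load_model("small")
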

def video_to_text(video_path):
    # Whisper extracts the audio track (via ffmpeg) and transcribes it.
    model = load_whisper_model()
    result = model.transcribe(video_path)
    return result["text"]

def match_with_chroma(transcription, collection):
    # collection.get() returns parallel lists: "documents" is already a
    # list of strings, not a list of dicts.
    docs = collection.get()
    subtitles = docs["documents"]
    if not subtitles:
        return None  # nothing stored yet

    # Vectorize the transcription together with every stored subtitle.
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([transcription] + subtitles)

    # Compare the transcription (row 0) against all subtitles (rows 1..n).
    similarity_scores = cosine_similarity(vectors[0:1], vectors[1:])

    # Metadata order matches document order, so the argmax index lines up.
    best_match_index = similarity_scores.argmax()
    best_match = docs["metadatas"][best_match_index]["subtitle"]

    return best_match
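
# Note: Chroma can also rank matches with its own embeddings, e.g.
# collection.query(query_texts=[transcription], n_results=1); the manual
# TF-IDF + cosine-similarity ranking above is kept from the original code.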

st.title("🎥 Video Subtitle Extractor")

video_file = st.file_uploader("Upload a video", type=["mp4", "mkv", "avi"])

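# Pipeline: save the upload to disk, transcribe it with Whisper, then
# match the transcription against the stored subtitles.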
if video_file:
    with st.spinner("Processing video..."):
        # Whisper reads from a file path, so save the upload to disk first.
        video_path = "uploaded_video.mp4"
        with open(video_path, "wb") as f:
            f.write(video_file.read())

        transcription = video_to_text(video_path)

        best_match = match_with_chroma(transcription, collection)

        st.subheader("📜 Extracted Subtitle")
        if best_match is None:
            st.warning("No subtitles stored yet - upload the subtitle CSV first.")
        else:
            st.write(best_match)

        # Remove the temporary video file.
        os.remove(video_path)