Mpavan45 committed on
Commit
8789469
·
verified ·
1 Parent(s): d806510

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -163
app.py CHANGED
@@ -1,167 +1,160 @@
1
  import streamlit as st
2
- import ffmpeg
3
- import numpy as np
4
- import pandas as pd
5
- from sentence_transformers import SentenceTransformer
6
- from sklearn.metrics.pairwise import cosine_similarity
7
- from transformers import pipeline
8
- import os
9
  import tempfile
 
10
  import chromadb
11
-
12
- # βœ… Chunked Upload Config (50 MB chunks to bypass 200 MB limit)
13
- MAX_CHUNK_SIZE = 50 * 1024 * 1024 # 50 MB
14
-
15
- def upload_file_in_chunks(file):
16
- """Split file into 50 MB chunks to bypass the 200 MB limit."""
17
- chunk_list = []
18
- bytes_data = file.read()
19
- total_size = len(bytes_data)
20
- num_chunks = (total_size // MAX_CHUNK_SIZE) + 1
21
-
22
- st.write(f"πŸ”Ή Splitting into {num_chunks} chunks...")
23
-
24
- for i in range(num_chunks):
25
- start = i * MAX_CHUNK_SIZE
26
- end = min((i + 1) * MAX_CHUNK_SIZE, total_size)
27
- chunk_list.append(bytes_data[start:end])
28
-
29
- # Combine all chunks into a single CSV
30
- full_csv_data = b"".join(chunk_list)
31
- return full_csv_data
32
-
33
- # Initialize Chroma DB
34
- chroma_client = chromadb.Client()
35
- collection_name = "subtitle_chunks"
36
-
37
- # Hugging Face Whisper Model
38
- whisper_model = pipeline("automatic-speech-recognition", model="openai/whisper-base")
39
-
40
- # Function to extract audio from video
41
- def extract_audio(video_file, audio_file):
42
- input_file = ffmpeg.input(video_file)
43
- output_file = ffmpeg.output(input_file, audio_file, **{'vn': None, 'ar': 16000, 'ac': 1, 'f': 'wav'})
44
- ffmpeg.run(output_file)
45
-
46
- # Function to transcribe audio using Whisper
47
- def transcribe_audio(audio_file):
48
- result = whisper_model(audio_file)
49
- return result['text']
50
-
51
- # Function to generate embeddings
52
- def generate_embedding(text):
53
- model = SentenceTransformer('all-MiniLM-L6-v2')
54
- return model.encode(text).tolist()
55
-
56
- # Function to split large CSV into smaller chunks
57
- def split_csv(csv_path, chunk_size=50000):
58
- chunks = []
59
- for i, chunk in enumerate(pd.read_csv(csv_path, chunksize=chunk_size)):
60
- chunk_file = f"chunk_{i}.csv"
61
- chunk.to_csv(chunk_file, index=False)
62
- chunks.append(chunk_file)
63
- return chunks
64
-
65
- # Store subtitle chunks in Chroma DB
66
- def store_chunks_in_chroma(chunk_files):
67
- collection = chroma_client.create_collection(name=collection_name)
68
-
69
- for chunk_file in chunk_files:
70
- chunk = pd.read_csv(chunk_file)
71
-
72
- for idx, row in chunk.iterrows():
73
- text = row['text']
74
- embedding = generate_embedding(text)
75
-
76
- collection.add(
77
- documents=[text],
78
- metadatas=[{"source": chunk_file}],
79
- ids=[f"{chunk_file}_{idx}"]
80
- )
81
-
82
- # Search subtitles in Chroma DB
83
- def search_in_chroma(transcribed_text, top_k=10):
84
- query_embedding = generate_embedding(transcribed_text)
85
- collection = chroma_client.get_collection(name=collection_name)
86
-
87
- results = collection.query(
88
- query_embeddings=[query_embedding],
89
- n_results=top_k
90
- )
91
-
92
- matches = []
93
- for doc, meta in zip(results['documents'][0], results['metadatas'][0]):
94
- matches.append({"text": doc, "source": meta['source']})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- return matches
97
-
98
- # Streamlit UI
99
- st.title("πŸŽ₯ Video Subtitle Search with Hugging Face Whisper + Chroma DB")
100
-
101
- # Sidebar for CSV Upload with Chunking
102
- st.sidebar.header("πŸ“ Upload Subtitle Database")
103
- csv_file = st.sidebar.file_uploader("Upload Subtitle CSV (200 MB+ supported)", type=["csv"])
104
-
105
- # Upload video
106
- uploaded_video = st.file_uploader("Upload a video", type=["mp4", "avi", "mov", "mkv"])
107
-
108
- # If both CSV and video are uploaded
109
- if csv_file and uploaded_video:
110
- # Handle large CSV upload using 50 MB chunks
111
- st.info("Uploading CSV in 50 MB chunks...")
112
- full_csv_data = upload_file_in_chunks(csv_file)
113
-
114
- # Save CSV locally
115
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_csv:
116
- temp_csv.write(full_csv_data)
117
- csv_path = temp_csv.name
118
-
119
- # Save video locally
120
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
121
- temp_video.write(uploaded_video.getbuffer())
122
- video_path = temp_video.name
123
-
124
- audio_path = "temp_audio.wav"
125
-
126
- # Extract audio
127
- st.info("Extracting audio...")
128
- extract_audio(video_path, audio_path)
129
-
130
- # Transcribe audio using Hugging Face Whisper
131
- st.info("Transcribing audio...")
132
- transcribed_text = transcribe_audio(audio_path)
133
- st.text_area("Transcribed Text", transcribed_text, height=150)
134
-
135
- # Split the CSV database into smaller chunks
136
- st.info("Splitting CSV into smaller chunks...")
137
- chunk_files = split_csv(csv_path, chunk_size=50000)
138
-
139
- # Store chunks in Chroma DB
140
- st.info("Storing subtitle chunks in Chroma DB...")
141
- store_chunks_in_chroma(chunk_files)
142
-
143
- # Search subtitles in Chroma DB
144
- st.info("Searching subtitles in Chroma DB...")
145
- matching_subtitles = search_in_chroma(transcribed_text)
146
-
147
- # Display video
148
- st.video(video_path)
149
-
150
- # Display matching subtitles
151
- st.subheader("πŸ“œ Matching Subtitles (Chroma DB + Cosine Similarity)")
152
- for sub in matching_subtitles:
153
- st.write(f"**Subtitle:** {sub['text']}")
154
- st.write(f"**Source:** {sub['source']}")
155
- st.write("---")
156
-
157
- # Cleanup
158
- os.remove(video_path)
159
- os.remove(audio_path)
160
- os.remove(csv_path)
161
-
162
- # Remove chunk files
163
- for chunk_file in chunk_files:
164
- os.remove(chunk_file)
165
-
166
- else:
167
- st.warning("Please upload both a video file and a CSV file to proceed.")
 
1
# Standard library
import os
import pickle
import tempfile

# Third-party
import chromadb
import numpy as np
import streamlit as st
import whisper
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer

# πŸ’‘ Set Page Configuration (must be the first Streamlit call in the script)
st.set_page_config(page_title="Audio Subtitle Tool", layout="wide")

# 🌐 Initialize ChromaDB
# Persistent on-disk client so subtitle embeddings survive app restarts.
db_client = chromadb.PersistentClient(path="./chroma_data")
sub_db = db_client.get_or_create_collection(name="audio_subtitles")
17
+
18
+ # βš™οΈ Load Whisper Model
19
+ @st.cache_resource()
20
+ def load_transcriber():
21
+ return whisper.load_model("base")
22
+
23
+ transcriber = load_transcriber()
24
+
25
+ # βš™οΈ Load Sentence Transformer
26
+ @st.cache_resource()
27
+ def load_encoder():
28
+ return SentenceTransformer("paraphrase-MiniLM-L6-v2")
29
+
30
+ semantic_encoder = load_encoder()
31
+
32
# πŸ”₯ Load Subtitle Data from Pickle
@st.cache_resource()
def load_subtitle_store():
    """Load the pre-built subtitle store from disk, cached for the session.

    Returns:
        The object deserialized from ``subtitle_data.pkl``.

    Raises:
        FileNotFoundError: if ``subtitle_data.pkl`` is not present.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file --
    # only ship a subtitle_data.pkl you generated yourself.
    with open("subtitle_data.pkl", "rb") as file:
        subtitle_store = pickle.load(file)
    # Bug fix: the original assigned the loaded data to a local and fell off
    # the end, so every call returned None. Return the store to callers.
    return subtitle_store
37
+
38
# ⏱️ Format Time into SRT format
def time_formatter(sec):
    """Render a duration in seconds as an SRT timestamp: HH:MM:SS,mmm."""
    whole_seconds = int(sec)
    millis = int((sec % 1) * 1000)
    minutes, secs = divmod(whole_seconds, 60)
    hr, mins = divmod(minutes, 60)
    return f"{hr:02}:{mins:02}:{secs:02},{millis:03}"
45
+
46
+ # πŸŽ™οΈ Transcribe Audio and Store in ChromaDB
47
+ @st.cache_resource()
48
+ def process_audio(file_path):
49
+ result = transcriber.transcribe(file_path, beam_size=1)
50
+ segments = result['segments']
51
+ srt_subtitles = []
52
+ plain_text_subs = []
53
+
54
+ for idx, seg in enumerate(segments):
55
+ start = time_formatter(seg['start'])
56
+ end = time_formatter(seg['end'])
57
+ content = seg['text']
58
+
59
+ # Create SRT format
60
+ srt_subtitles.append(f"{idx + 1}\n{start} --> {end}\n{content}\n")
61
+ plain_text_subs.append(content)
62
+
63
+ # Store in ChromaDB
64
+ sub_db.upsert(
65
+ documents=[content],
66
+ metadatas=[{"start_time": start, "end_time": end}],
67
+ ids=[f"subtitle_{idx}"]
68
+ )
69
+
70
+ return "\n".join(srt_subtitles), " ".join(plain_text_subs)
71
+
72
+ # πŸ” Subtitle Search Function
73
+ def search_content(query):
74
+ query_vec = semantic_encoder.encode(query).tolist()
75
+ matches = sub_db.query(query_embeddings=[query_vec], n_results=5)
76
+
77
+ if 'documents' in matches and matches['documents']:
78
+ results = []
79
+ for idx, doc in enumerate(matches['documents'][0]):
80
+ meta = matches['metadatas'][0][idx]
81
+ results.append(f"πŸ“Œ {doc} (From: {meta['start_time']} β†’ To: {meta['end_time']})")
82
+ return results
83
+ return []
84
+
85
# πŸ”₯ Sidebar Navigation
st.sidebar.title("🎯 Navigation")
page = st.sidebar.radio("Choose Action", ["Generate Subtitles", "Search Subtitles"])

# πŸŽ›οΈ Sidebar Styling -- injected CSS for the sidebar panel.
_SIDEBAR_CSS = """
    <style>
    .sidebar .sidebar-content {
        background-color: #f0f0f5;
        color: #333;
    }
    </style>
    """
st.sidebar.markdown(_SIDEBAR_CSS, unsafe_allow_html=True)

# 🎬 Header Section -- page title and tagline rendered as raw HTML.
_HEADER_HTML = """
    <h1 style='text-align: center; color: #673AB7;'>🎀 Audio Subtitle Generator & Search</h1>
    <p style='text-align: center; font-size: 18px; color: #666;'>Generate subtitles from audio files and search through them.</p>
    """
st.markdown(_HEADER_HTML, unsafe_allow_html=True)

# πŸš€ Load Subtitle Data (cached; return value unused here)
load_subtitle_store()

# 🎡 File Upload Section
st.markdown("### 🎧 Upload Your Audio File")
audio_file = st.file_uploader("Supported formats: MP3, WAV", type=["mp3", "wav"])
117
+
118
# 🎯 Page Logic
if page == "Generate Subtitles":
    if audio_file:
        # Bug fix: the original hard-coded format='audio/mp3' even for WAV
        # uploads; pick the MIME type that matches the file.
        is_mp3 = audio_file.name.lower().endswith(".mp3")
        st.audio(audio_file, format='audio/mp3' if is_mp3 else 'audio/wav')

        with st.spinner("πŸ”§ Processing audio... Please wait."):
            # Normalize the upload to a 16 kHz WAV file for Whisper.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
                # Bug fix: extension checks are now case-insensitive, so
                # 'SONG.MP3' / 'X.WAV' no longer leave `audio` unbound;
                # anything else falls back to pydub's generic loader.
                if is_mp3:
                    audio = AudioSegment.from_mp3(audio_file)
                elif audio_file.name.lower().endswith(".wav"):
                    audio = AudioSegment.from_wav(audio_file)
                else:
                    audio = AudioSegment.from_file(audio_file)
                audio = audio.set_frame_rate(16000)
                audio.export(tmp_audio.name, format="wav")

            try:
                # Transcribe and store in ChromaDB
                subtitle_output, plain_output = process_audio(tmp_audio.name)
                st.success("βœ… Transcription Completed!")

                # Display Subtitles
                st.markdown("### πŸ“œ Generated Subtitles")
                st.text_area("Subtitles (SRT Format)", subtitle_output, height=300)

                # Download Options
                st.download_button("⬇️ Download SRT File", subtitle_output, file_name="subtitles.srt", mime="text/plain")
                st.download_button("⬇️ Download Plain Text", plain_output, file_name="subtitles.txt", mime="text/plain")
            finally:
                # Robustness: remove the temp WAV even if transcription fails;
                # the original leaked it on any exception.
                os.remove(tmp_audio.name)

elif page == "Search Subtitles":
    st.subheader("πŸ”Ž Search Subtitles")
    query_input = st.text_input("Enter text to search")

    if query_input:
        with st.spinner("πŸ” Searching..."):
            search_matches = search_content(query_input)

        if search_matches:
            st.success("βœ… Results Found:")
            for match in search_matches:
                st.write(match)
        else:
            st.warning("⚠️ No matching results. Try a different query.")