Mpavan45 committed on
Commit
fe10fa8
·
verified ·
1 Parent(s): 4b63f9b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -26
app.py CHANGED
@@ -94,46 +94,58 @@ from chromadb.config import Settings
94
  def extract_audio(uploaded_file):
95
  """Extracts audio from video or handles audio file directly."""
96
  audio_path = "temp_audio.wav"
97
-
98
- # Write uploaded file to a temporary file
99
  temp_file = f"temp_{uploaded_file.name}"
100
  with open(temp_file, "wb") as f:
101
  f.write(uploaded_file.getvalue())
102
 
103
- # Extract audio from video files or use directly for audio files
104
  if uploaded_file.name.endswith(('.mp4', '.mkv')):
105
  ffmpeg.input(temp_file).output(audio_path).run(overwrite_output=True)
106
  else:
107
  audio_path = temp_file
108
-
109
- return audio_path
110
 
111
  def transcribe_audio(audio_path):
112
- """Transcribes audio to text using Whisper with model download handling."""
113
  try:
114
  model = whisper.load_model("base")
115
  except Exception:
116
  st.warning("Downloading Whisper model. This may take a while...")
117
  model = whisper.load_model("base")
 
118
  result = model.transcribe(audio_path)
119
  return result['text']
120
 
121
  def load_embeddings():
122
  """Loads subtitle embeddings from pkl file."""
123
- with open('subtitle_embeddings.pkl', 'rb') as f:
124
- embeddings = pickle.load(f)
125
- return embeddings
 
 
 
 
126
 
127
  def save_to_chroma(embeddings):
128
  """Stores embeddings in Chroma DB."""
129
  client = Client(Settings())
130
- collection = client.create_collection(name="subtitles")
 
 
 
 
 
 
131
  for idx, row in embeddings.iterrows():
132
  collection.add(
133
  documents=[row['subtitle']],
134
  ids=[str(idx)],
135
  embeddings=[row['embedding']]
136
  )
 
137
  return collection
138
 
139
  def search_subtitles(query, collection):
@@ -149,11 +161,11 @@ def main():
149
  uploaded_file = st.file_uploader("Upload Video/Audio", type=["mp4", "mkv", "mp3", "wav"])
150
  query = st.text_input("Search Subtitles")
151
  download_btn = st.button("Download Subtitles")
152
-
153
  if uploaded_file:
154
  with st.spinner("Extracting audio..."):
155
- audio_path = extract_audio(uploaded_file.name)
156
-
157
  with st.spinner("Generating subtitles..."):
158
  subtitles = transcribe_audio(audio_path)
159
  st.success("Subtitles Generated!")
@@ -163,23 +175,34 @@ def main():
163
  st.video(uploaded_file)
164
  else:
165
  st.audio(uploaded_file)
166
-
167
  st.text_area("Generated Subtitles", subtitles, height=300)
168
-
169
  # Load and search embeddings
170
  embeddings = load_embeddings()
171
- collection = save_to_chroma(embeddings)
172
-
173
- if query:
174
- results = search_subtitles(query, collection)
175
- st.write("### Matching Subtitles:")
176
- for sub in results:
177
- st.write(f"- {sub}")
178
-
 
 
179
  if download_btn:
180
- with open("generated_subtitles.srt", "w") as f:
181
- f.write(subtitles)
182
- st.download_button("Download SRT", "generated_subtitles.srt")
 
 
 
 
 
 
 
 
 
183
 
184
  if __name__ == '__main__':
185
  main()
 
def extract_audio(uploaded_file):
    """Save an uploaded media file to disk and return the audio to transcribe.

    Args:
        uploaded_file: Upload object exposing a ``name`` attribute and a
            ``getvalue()`` method that returns the file content as bytes.

    Returns:
        A ``(audio_path, temp_file)`` tuple. ``temp_file`` is the on-disk copy
        of the upload. For ``.mp4``/``.mkv`` uploads ``audio_path`` is
        ``"temp_audio.wav"`` (written by ffmpeg); for every other upload
        ``audio_path`` is the same path as ``temp_file``, so callers that
        clean up must delete that file only once.
    """
    import os  # local import so the block does not rely on a file-level import

    # Keep only the final path component: a crafted name such as
    # "../other.wav" must not be able to write outside the working directory.
    safe_name = os.path.basename(uploaded_file.name)

    # Save uploaded file temporarily
    temp_file = f"temp_{safe_name}"
    with open(temp_file, "wb") as f:
        f.write(uploaded_file.getvalue())

    # Extract audio for video files, keep as-is for audio. The extension is
    # compared case-insensitively so "CLIP.MP4" is still recognised as video.
    if safe_name.lower().endswith(('.mp4', '.mkv')):
        audio_path = "temp_audio.wav"
        ffmpeg.input(temp_file).output(audio_path).run(overwrite_output=True)
    else:
        audio_path = temp_file

    return audio_path, temp_file
110
 
def transcribe_audio(audio_path):
    """Run the Whisper "base" model on an audio file and return its text.

    Loading the model is attempted once; if that raises, a Streamlit warning
    is shown and the load is attempted a second time before transcribing.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.

    Returns:
        The ``'text'`` entry of the transcription result.
    """
    try:
        speech_model = whisper.load_model("base")
    except Exception:
        # The first load failed: tell the user and try the same load again.
        st.warning("Downloading Whisper model. This may take a while...")
        speech_model = whisper.load_model("base")

    transcription = speech_model.transcribe(audio_path)
    return transcription['text']
121
 
def load_embeddings():
    """Load the subtitle embeddings stored in ``subtitle_embeddings.pkl``.

    Returns:
        The unpickled object when the file exists. When the file is missing,
        an error is shown in the Streamlit UI and an empty ``pd.DataFrame``
        is returned so callers can test ``.empty``.
    """
    # SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
    # Only ship a subtitle_embeddings.pkl produced by a trusted pipeline.
    try:
        # Open directly instead of checking os.path.exists first: the file
        # could disappear between the check and the open.
        with open('subtitle_embeddings.pkl', 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        st.error("No embeddings file found.")
        return pd.DataFrame()
131
 
def save_to_chroma(embeddings, batch_size=1000):
    """Store subtitle embeddings in the Chroma ``"subtitles"`` collection.

    Args:
        embeddings: Table with a ``'subtitle'`` column (document text) and an
            ``'embedding'`` column (one vector per row); the stringified row
            index is used as the record id.
        batch_size: Maximum number of rows sent to Chroma per call.

    Returns:
        The Chroma collection holding the records.
    """
    client = Client(Settings())

    # get_or_create_collection replaces the former try/bare-except, which
    # also swallowed unrelated failures (and even KeyboardInterrupt).
    collection = client.get_or_create_collection(name="subtitles")

    ids = [str(idx) for idx in embeddings.index]
    if not ids:
        # Nothing to store; Chroma rejects empty batches.
        return collection

    documents = embeddings['subtitle'].tolist()
    vectors = embeddings['embedding'].tolist()

    # Upsert in chunks: re-running against an existing collection stays
    # idempotent (same ids are overwritten, not duplicated) and each call
    # stays below Chroma's per-call batch limit.
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.upsert(
            documents=documents[start:stop],
            ids=ids[start:stop],
            embeddings=vectors[start:stop],
        )

    return collection
150
 
151
  def search_subtitles(query, collection):
 
161
  uploaded_file = st.file_uploader("Upload Video/Audio", type=["mp4", "mkv", "mp3", "wav"])
162
  query = st.text_input("Search Subtitles")
163
  download_btn = st.button("Download Subtitles")
164
+
165
  if uploaded_file:
166
  with st.spinner("Extracting audio..."):
167
+ audio_path, temp_file = extract_audio(uploaded_file)
168
+
169
  with st.spinner("Generating subtitles..."):
170
  subtitles = transcribe_audio(audio_path)
171
  st.success("Subtitles Generated!")
 
175
  st.video(uploaded_file)
176
  else:
177
  st.audio(uploaded_file)
178
+
179
  st.text_area("Generated Subtitles", subtitles, height=300)
180
+
181
  # Load and search embeddings
182
  embeddings = load_embeddings()
183
+ if not embeddings.empty:
184
+ collection = save_to_chroma(embeddings)
185
+
186
+ if query:
187
+ results = search_subtitles(query, collection)
188
+ st.write("### Matching Subtitles:")
189
+ for sub in results:
190
+ st.write(f"- {sub}")
191
+
192
+ # Subtitle download option
193
  if download_btn:
194
+ srt_content = f"1\n00:00:00,000 --> 00:00:10,000\n{subtitles}\n"
195
+
196
+ st.download_button(
197
+ label="Download SRT",
198
+ data=srt_content.encode('utf-8'),
199
+ file_name="generated_subtitles.srt",
200
+ mime="text/plain"
201
+ )
202
+
203
+ # Cleanup temporary files
204
+ os.remove(audio_path)
205
+ os.remove(temp_file)
206
 
207
  if __name__ == '__main__':
208
  main()