clementBE committed on
Commit
1e2831b
·
verified ·
1 Parent(s): eee08a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +181 -28
app.py CHANGED
@@ -1,3 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
  import subprocess
@@ -9,20 +69,36 @@ import uuid
9
  import base64
10
  import torch
11
  import shutil
12
- from docx import Document # for DOCX export
13
 
14
- # Auto-select device: GPU if available
 
 
15
  device = "cuda" if torch.cuda.is_available() else "cpu"
16
  model = whisper.load_model("base", device=device)
17
 
 
 
 
 
18
  def format_timestamp(seconds):
 
 
 
19
  h = int(seconds // 3600)
20
  m = int((seconds % 3600) // 60)
21
  s = int(seconds % 60)
22
  ms = int((seconds - int(seconds)) * 1000)
23
  return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
24
 
 
 
 
 
25
  def write_vtt(segments, filepath):
 
 
 
26
  with open(filepath, "w", encoding="utf-8") as f:
27
  f.write("WEBVTT\n\n")
28
  for i, seg in enumerate(segments, start=1):
@@ -31,18 +107,33 @@ def write_vtt(segments, filepath):
31
  text = seg['text'].strip()
32
  f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
33
 
 
 
 
 
34
  def write_docx(entries, filepath):
 
 
 
35
  doc = Document()
36
  doc.add_heading("Transcript", level=1)
37
- full_text = " ".join([text for _, text in entries]) # concatenate all segments
38
- doc.add_paragraph(full_text) # single paragraph with all text
39
  doc.save(filepath)
40
  return filepath
41
 
 
 
 
 
42
  def parse_vtt(filepath):
 
 
 
43
  entries = []
44
  with open(filepath, "r", encoding="utf-8") as f:
45
  lines = f.readlines()
 
46
  idx = 0
47
  while idx < len(lines):
48
  line = lines[idx].strip()
@@ -58,19 +149,40 @@ def parse_vtt(filepath):
58
  idx += 1
59
  return entries
60
 
 
 
 
 
61
  def parse_timestamp(ts_str):
 
 
 
62
  h, m, rest = ts_str.split(":")
63
  s, ms = rest.split(".")
64
  return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000
65
 
 
 
 
 
66
  def capture_screenshot(video_path, time_sec, out_path):
 
 
 
67
  cmd = [
68
  "ffmpeg", "-ss", str(time_sec), "-i", video_path,
69
  "-frames:v", "1", "-q:v", "2", out_path, "-y"
70
  ]
71
  subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
72
 
 
 
 
 
73
  def save_voice_plot(times, db, start_sec, out_path):
 
 
 
74
  plt.figure(figsize=(8, 3))
75
  plt.plot(times, db, color="purple")
76
  plt.axvline(x=start_sec, color="red", linestyle="--")
@@ -82,7 +194,14 @@ def save_voice_plot(times, db, start_sec, out_path):
82
  plt.savefig(out_path)
83
  plt.close()
84
 
 
 
 
 
85
  def file_to_base64(filepath):
 
 
 
86
  with open(filepath, "rb") as f:
87
  data = f.read()
88
  ext = os.path.splitext(filepath)[1].lower().replace('.', '')
@@ -90,14 +209,33 @@ def file_to_base64(filepath):
90
  b64 = base64.b64encode(data).decode('utf-8')
91
  return f"data:{mime};base64,{b64}"
92
 
 
 
 
 
93
  def extract_audio(video_path, output_dir):
 
 
 
94
  audio_path = os.path.join(output_dir, "audio.mp3")
95
  subprocess.run([
96
- "ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path
 
97
  ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
98
  return audio_path
99
 
 
 
 
 
100
  def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
 
 
 
 
 
 
 
101
  html = f"""<!DOCTYPE html>
102
  <html lang="en">
103
  <head>
@@ -113,19 +251,11 @@ def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, outpu
113
  }}
114
  .segment {{
115
  display: flex;
116
- align-items: center;
117
  gap: 20px;
118
  margin-bottom: 40px;
119
  }}
120
- .text {{
121
- flex: 2;
122
- }}
123
- .media {{
124
- flex: 3;
125
- display: flex;
126
- flex-direction: column;
127
- gap: 10px;
128
- }}
129
  </style>
130
  </head>
131
  <body>
@@ -136,6 +266,7 @@ def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, outpu
136
  for time_range, text in entries:
137
  start = time_range.split(" --> ")[0]
138
  start_sec = int(parse_timestamp(start))
 
139
  screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
140
  plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")
141
 
@@ -159,9 +290,25 @@ def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, outpu
159
 
160
  with open(output_html_path, "w", encoding="utf-8") as f:
161
  f.write(html)
 
162
  return output_html_path
163
 
 
 
 
 
164
  def process(video_file):
 
 
 
 
 
 
 
 
 
 
 
165
  session_id = str(uuid.uuid4())
166
  base_dir = os.path.join("session_data", session_id)
167
  os.makedirs(base_dir, exist_ok=True)
@@ -174,20 +321,20 @@ def process(video_file):
174
  video_path = video_file.name
175
  video_id = os.path.splitext(os.path.basename(video_path))[0]
176
 
177
- # Extract audio
178
  audio_path = extract_audio(video_path, base_dir)
179
 
180
- # Transcription
181
  result = model.transcribe(audio_path)
182
  vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
183
  write_vtt(result["segments"], vtt_path)
184
  entries = parse_vtt(vtt_path)
185
 
186
- # Create DOCX transcript
187
  docx_path = os.path.join(base_dir, f"{video_id}.docx")
188
  write_docx(entries, docx_path)
189
 
190
- # Voice intensity curve
191
  y, sr = librosa.load(audio_path, sr=None)
192
  S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
193
  freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
@@ -196,31 +343,37 @@ def process(video_file):
196
  voice_db = 20 * np.log10(voice_energy + 1e-6)
197
  times = librosa.frames_to_time(np.arange(len(voice_db)), sr=sr, hop_length=512)
198
 
199
- # Generate screenshots + plots
200
  for time_range, _ in entries:
201
  start = time_range.split(" --> ")[0]
202
  start_sec = parse_timestamp(start)
203
- screenshot_out = os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg")
204
- plot_out = os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png")
205
- capture_screenshot(video_path, start_sec, screenshot_out)
206
- save_voice_plot(times, voice_db, start_sec, plot_out)
207
 
208
- # HTML output
209
  html_output_path = os.path.join(base_dir, f"{video_id}.html")
210
- final_html = generate_html(entries, video_id, video_path, screenshots_dir, plots_dir, html_output_path)
 
 
 
 
211
 
212
- # Create ZIP of screenshots
213
  zip_path = os.path.join(base_dir, f"{video_id}_screenshots.zip")
214
  shutil.make_archive(zip_path.replace(".zip", ""), "zip", screenshots_dir)
215
 
216
- # Return files + HTML preview
217
  with open(final_html, "r", encoding="utf-8") as f:
218
  html_content = f.read()
219
 
220
  return docx_path, final_html, zip_path, html_content
221
 
222
 
 
223
  # Gradio UI
 
224
  demo = gr.Interface(
225
  fn=process,
226
  inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"])],
 
1
+ """
2
+ ===========================================================
3
+ Video Annotated Transcript Generator
4
+ ===========================================================
5
+
6
+ This Gradio application processes a video file and produces:
7
+
8
+ 1. A full transcript (DOCX format)
9
+ 2. A WEBVTT subtitle file
10
+ 3. Screenshots at each transcript timestamp (ZIP)
11
+ 4. Voice intensity plots synchronized with the transcript
12
+ 5. An interactive HTML file showing:
13
+ - Screenshot
14
+ - Sound intensity plot
15
+ - Editable text of each segment
16
+
17
+ The pipeline:
18
+ -------------
19
+ UPLOAD VIDEO
20
+ → Extract audio (ffmpeg)
21
+ → Transcribe speech using Whisper
22
+ → Produce VTT + DOCX
23
+ → Analyze sound intensity using Librosa
24
+ → Capture screenshots at segment timestamps
25
+ → Generate annotated HTML page
26
+ → Return all outputs to the user
27
+
28
+ -----------------------------------------------------------
29
+ HOW TO GET VIDEOS USING “VIDEO DOWNLOADHELPER”
30
+ -----------------------------------------------------------
31
+
32
+ Video DownloadHelper is a browser extension (Firefox / Chrome)
33
+ that allows you to save video files locally.
34
+
35
+ Steps:
36
+ 1. Install the extension:
37
+ https://www.downloadhelper.net/
38
+
39
+ 2. Go to the video you want to download
40
+ (YouTube, Vimeo, news websites, etc.)
41
+
42
+ 3. Click the DownloadHelper icon in your browser.
43
+
44
+ 4. Choose a file format such as:
45
+ • MP4
46
+ • WebM (convert to MP4 or MKV before uploading: the upload widget accepts only .mp4, .mov and .mkv)
47
+ • MKV
48
+
49
+ 5. Save the file to your computer.
50
+
51
+ 6. Upload the saved file into this Gradio app.
52
+
53
+ Note:
54
+ - The extension cannot download YouTube videos with DRM.
55
+ - If a website blocks downloading, try the “Companion App”
56
+ recommended by Video DownloadHelper.
57
+
58
+ ===========================================================
59
+ """
60
+
61
  import gradio as gr
62
  import os
63
  import subprocess
 
69
  import base64
70
  import torch
71
  import shutil
72
+ from docx import Document # DOCX export
73
 
74
+ # ----------------------------------------------------------
75
+ # Auto-select GPU if available for Whisper
76
+ # ----------------------------------------------------------
77
  device = "cuda" if torch.cuda.is_available() else "cpu"
78
  model = whisper.load_model("base", device=device)
79
 
80
+
81
+ # ----------------------------------------------------------
82
+ # Utility: Convert seconds → WebVTT timestamp format
83
+ # ----------------------------------------------------------
84
  def format_timestamp(seconds):
85
+ """
86
+ Convert time in seconds to WebVTT format HH:MM:SS.MS
87
+ """
88
  h = int(seconds // 3600)
89
  m = int((seconds % 3600) // 60)
90
  s = int(seconds % 60)
91
  ms = int((seconds - int(seconds)) * 1000)
92
  return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"
93
 
94
+
95
+ # ----------------------------------------------------------
96
+ # Write segments to a .vtt subtitle file
97
+ # ----------------------------------------------------------
98
  def write_vtt(segments, filepath):
99
+ """
100
+ Save Whisper segments to a .vtt (WebVTT subtitle) file.
101
+ """
102
  with open(filepath, "w", encoding="utf-8") as f:
103
  f.write("WEBVTT\n\n")
104
  for i, seg in enumerate(segments, start=1):
 
107
  text = seg['text'].strip()
108
  f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
109
 
110
+
111
+ # ----------------------------------------------------------
112
+ # Export transcript to DOCX
113
+ # ----------------------------------------------------------
114
  def write_docx(entries, filepath):
115
+ """
116
+ Export transcript text into a single DOCX document.
117
+ """
118
  doc = Document()
119
  doc.add_heading("Transcript", level=1)
120
+ full_text = " ".join([text for _, text in entries])
121
+ doc.add_paragraph(full_text)
122
  doc.save(filepath)
123
  return filepath
124
 
125
+
126
+ # ----------------------------------------------------------
127
+ # Read a .vtt file and return list of (timerange, text)
128
+ # ----------------------------------------------------------
129
  def parse_vtt(filepath):
130
+ """
131
+ Basic VTT parser: returns a list of (timestamp, text)
132
+ """
133
  entries = []
134
  with open(filepath, "r", encoding="utf-8") as f:
135
  lines = f.readlines()
136
+
137
  idx = 0
138
  while idx < len(lines):
139
  line = lines[idx].strip()
 
149
  idx += 1
150
  return entries
151
 
152
+
153
+ # ----------------------------------------------------------
154
+ # Parse a VTT timestamp "HH:MM:SS.mmm" (mmm = milliseconds)
155
+ # ----------------------------------------------------------
156
  def parse_timestamp(ts_str):
157
+ """
158
+ Convert WebVTT timestamp to seconds.
159
+ """
160
  h, m, rest = ts_str.split(":")
161
  s, ms = rest.split(".")
162
  return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000
163
 
164
+
165
+ # ----------------------------------------------------------
166
+ # Capture screenshot using ffmpeg
167
+ # ----------------------------------------------------------
168
  def capture_screenshot(video_path, time_sec, out_path):
169
+ """
170
+ Extract a frame at a specific time using ffmpeg.
171
+ """
172
  cmd = [
173
  "ffmpeg", "-ss", str(time_sec), "-i", video_path,
174
  "-frames:v", "1", "-q:v", "2", out_path, "-y"
175
  ]
176
  subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
177
 
178
+
179
+ # ----------------------------------------------------------
180
+ # Save a voice intensity plot around the timestamp
181
+ # ----------------------------------------------------------
182
  def save_voice_plot(times, db, start_sec, out_path):
183
+ """
184
+ Plot voice-band intensity (300–3000 Hz) and mark the timestamp.
185
+ """
186
  plt.figure(figsize=(8, 3))
187
  plt.plot(times, db, color="purple")
188
  plt.axvline(x=start_sec, color="red", linestyle="--")
 
194
  plt.savefig(out_path)
195
  plt.close()
196
 
197
+
198
+ # ----------------------------------------------------------
199
+ # Convert image → base64 to embed in HTML
200
+ # ----------------------------------------------------------
201
  def file_to_base64(filepath):
202
+ """
203
+ Convert a file to a base64 string for HTML embedding.
204
+ """
205
  with open(filepath, "rb") as f:
206
  data = f.read()
207
  ext = os.path.splitext(filepath)[1].lower().replace('.', '')
 
209
  b64 = base64.b64encode(data).decode('utf-8')
210
  return f"data:{mime};base64,{b64}"
211
 
212
+
213
+ # ----------------------------------------------------------
214
+ # Extract audio track from video
215
+ # ----------------------------------------------------------
216
  def extract_audio(video_path, output_dir):
217
+ """
218
+ Extract audio as MP3 using ffmpeg.
219
+ """
220
  audio_path = os.path.join(output_dir, "audio.mp3")
221
  subprocess.run([
222
+ "ffmpeg", "-y", "-i", video_path, "-vn",
223
+ "-acodec", "libmp3lame", audio_path
224
  ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
225
  return audio_path
226
 
227
+
228
+ # ----------------------------------------------------------
229
+ # Generate the annotated HTML transcript
230
+ # ----------------------------------------------------------
231
  def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
232
+ """
233
+ Create a complete HTML page showing:
234
+ - text
235
+ - screenshot
236
+ - voice plot
237
+ for each segment.
238
+ """
239
  html = f"""<!DOCTYPE html>
240
  <html lang="en">
241
  <head>
 
251
  }}
252
  .segment {{
253
  display: flex;
 
254
  gap: 20px;
255
  margin-bottom: 40px;
256
  }}
257
+ .text {{ flex: 2; }}
258
+ .media {{ flex: 3; display: flex; flex-direction: column; gap: 10px; }}
 
 
 
 
 
 
 
259
  </style>
260
  </head>
261
  <body>
 
266
  for time_range, text in entries:
267
  start = time_range.split(" --> ")[0]
268
  start_sec = int(parse_timestamp(start))
269
+
270
  screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
271
  plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")
272
 
 
290
 
291
  with open(output_html_path, "w", encoding="utf-8") as f:
292
  f.write(html)
293
+
294
  return output_html_path
295
 
296
+
297
+ # ----------------------------------------------------------
298
+ # The main processing pipeline executed by Gradio
299
+ # ----------------------------------------------------------
300
  def process(video_file):
301
+ """
302
+ Main function:
303
+ - Creates session folder
304
+ - Extracts audio
305
+ - Runs Whisper transcription
306
+ - Generates VTT + DOCX
307
+ - Computes sound intensity
308
+ - Captures screenshots
309
+ - Builds annotated HTML
310
+ """
311
+ # Create isolated session
312
  session_id = str(uuid.uuid4())
313
  base_dir = os.path.join("session_data", session_id)
314
  os.makedirs(base_dir, exist_ok=True)
 
321
  video_path = video_file.name
322
  video_id = os.path.splitext(os.path.basename(video_path))[0]
323
 
324
+ # 1. Extract audio
325
  audio_path = extract_audio(video_path, base_dir)
326
 
327
+ # 2. Transcription using Whisper
328
  result = model.transcribe(audio_path)
329
  vtt_path = os.path.join(base_dir, f"{video_id}.vtt")
330
  write_vtt(result["segments"], vtt_path)
331
  entries = parse_vtt(vtt_path)
332
 
333
+ # 3. DOCX transcript
334
  docx_path = os.path.join(base_dir, f"{video_id}.docx")
335
  write_docx(entries, docx_path)
336
 
337
+ # 4. Voice intensity curve
338
  y, sr = librosa.load(audio_path, sr=None)
339
  S = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
340
  freqs = librosa.fft_frequencies(sr=sr, n_fft=2048)
 
343
  voice_db = 20 * np.log10(voice_energy + 1e-6)
344
  times = librosa.frames_to_time(np.arange(len(voice_db)), sr=sr, hop_length=512)
345
 
346
+ # 5. Screenshots + plots for each segment
347
  for time_range, _ in entries:
348
  start = time_range.split(" --> ")[0]
349
  start_sec = parse_timestamp(start)
350
+ capture_screenshot(video_path, start_sec,
351
+ os.path.join(screenshots_dir, f"{video_id}_{int(start_sec)}.jpg"))
352
+ save_voice_plot(times, voice_db, start_sec,
353
+ os.path.join(plots_dir, f"{video_id}_{int(start_sec)}_sound.png"))
354
 
355
+ # 6. HTML output
356
  html_output_path = os.path.join(base_dir, f"{video_id}.html")
357
+ final_html = generate_html(
358
+ entries, video_id, video_path,
359
+ screenshots_dir, plots_dir,
360
+ html_output_path
361
+ )
362
 
363
+ # 7. ZIP screenshots
364
  zip_path = os.path.join(base_dir, f"{video_id}_screenshots.zip")
365
  shutil.make_archive(zip_path.replace(".zip", ""), "zip", screenshots_dir)
366
 
367
+ # 8. HTML preview as text
368
  with open(final_html, "r", encoding="utf-8") as f:
369
  html_content = f.read()
370
 
371
  return docx_path, final_html, zip_path, html_content
372
 
373
 
374
+ # ----------------------------------------------------------
375
  # Gradio UI
376
+ # ----------------------------------------------------------
377
  demo = gr.Interface(
378
  fn=process,
379
  inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"])],