clementBE committed on
Commit
cf494c7
·
verified ·
1 Parent(s): c37ed20

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -0
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import subprocess
4
+ import whisper
5
+ import librosa
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import uuid
9
+ import base64
10
+
11
# Load the Whisper "base" checkpoint once at module import so every
# request reuses the same model instead of reloading it per call.
model = whisper.load_model("base")
12
+
13
def format_timestamp(seconds):
    """Render a non-negative *seconds* value as an HH:MM:SS.mmm WebVTT timestamp."""
    whole = int(seconds)
    hours, remainder = divmod(whole, 3600)
    minutes, secs = divmod(remainder, 60)
    # Milliseconds come from the fractional part, truncated (not rounded),
    # matching the original arithmetic.
    millis = int((seconds - whole) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
19
+
20
def write_vtt(segments, filepath):
    """Write Whisper *segments* (dicts with 'start', 'end', 'text') to
    *filepath* as a numbered WebVTT file."""
    with open(filepath, "w", encoding="utf-8") as vtt:
        vtt.write("WEBVTT\n\n")
        cue_number = 1
        for seg in segments:
            cue = (
                f"{cue_number}\n"
                f"{format_timestamp(seg['start'])} --> {format_timestamp(seg['end'])}\n"
                f"{seg['text'].strip()}\n\n"
            )
            vtt.write(cue)
            cue_number += 1
28
+
29
def parse_vtt(filepath):
    """Parse a WebVTT file into a list of (time_range, text) tuples.

    A cue is detected by the "-->" marker in its timing line; every
    non-blank line after it is folded into a single space-joined string.
    Cue numbers and the WEBVTT header are skipped implicitly.
    """
    cues = []
    with open(filepath, "r", encoding="utf-8") as src:
        raw = src.readlines()
    pos, total = 0, len(raw)
    while pos < total:
        current = raw[pos].strip()
        pos += 1
        if "-->" not in current:
            continue
        pieces = []
        while pos < total and raw[pos].strip():
            pieces.append(raw[pos].strip())
            pos += 1
        cues.append((current, " ".join(pieces)))
    return cues
47
+
48
def parse_timestamp(ts_str):
    """Convert an "HH:MM:SS.mmm" timestamp string to seconds (float)."""
    hours, minutes, sec_part = ts_str.split(":")
    whole_secs, millis = sec_part.split(".")
    return 3600 * int(hours) + 60 * int(minutes) + int(whole_secs) + int(millis) / 1000
52
+
53
def capture_screenshot(video_path, time_sec, out_path):
    """Grab a single frame of *video_path* at *time_sec* seconds into *out_path*.

    `-ss` before `-i` makes ffmpeg seek before decoding (fast seek).
    Failures are tolerated deliberately: callers test os.path.exists() on
    out_path, so a failed grab simply yields no screenshot.
    """
    cmd = [
        "ffmpeg", "-ss", str(time_sec), "-i", video_path,
        "-frames:v", "1", "-q:v", "2", out_path, "-y",
    ]
    # Output is discarded; DEVNULL avoids buffering stdout/stderr in memory
    # (the previous PIPE captured output that was never read).
    subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
59
+
60
def save_voice_plot(times, db, start_sec, out_path):
    """Plot the voice-band dB curve, mark *start_sec* on it, save to *out_path*."""
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.plot(times, db, color="purple")
    # Vertical dashed marker plus a dot at the interpolated dB value.
    ax.axvline(x=start_sec, color="red", linestyle="--")
    ax.scatter([start_sec], [np.interp(start_sec, times, db)], color="red")
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Voice band dB")
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
71
+
72
def file_to_base64(filepath):
    """Return the file at *filepath* inlined as a base64 "data:" URI.

    The MIME type is derived from the extension (".jpg" maps to image/jpeg);
    only image files are expected here.
    """
    with open(filepath, "rb") as src:
        payload = src.read()
    suffix = os.path.splitext(filepath)[1].lower().replace(".", "")
    mime_type = "image/jpeg" if suffix == "jpg" else f"image/{suffix}"
    encoded = base64.b64encode(payload).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"
79
+
80
def extract_audio(video_path, output_dir):
    """Extract the audio track of *video_path* to <output_dir>/audio.mp3.

    Returns the path of the written MP3 (the path is returned even if
    ffmpeg failed; downstream code will then fail loading it).
    """
    audio_path = os.path.join(output_dir, "audio.mp3")
    # Output is discarded; DEVNULL avoids buffering stdout/stderr in memory
    # (the previous PIPE captured output that was never read).
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame", audio_path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return audio_path
86
+
87
def generate_html(entries, video_id, video_path, screenshot_dir, plot_dir, output_html_path):
    """Build a self-contained annotated-transcript HTML page for *video_id*.

    Each (time_range, text) entry from the VTT becomes one flex "segment"
    row: editable transcript text on the left, the matching screenshot and
    voice-energy plot on the right (base64-inlined, so the page has no
    external assets).  Missing images degrade to an empty src attribute.

    Returns output_html_path, the path of the written file.
    """
    import html as html_mod  # stdlib; escapes untrusted text for HTML embedding

    # Collect fragments and join once at the end: O(n) total work instead of
    # the quadratic behaviour of repeated `html += ...` concatenation.
    parts = [f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"><title>{html_mod.escape(video_id)}</title>
<style>
body {{ font-family: Arial; font-size: 18px; margin: 20px; }}
.media img {{
width: 480px;
height: auto;
border: 1px solid #ccc;
border-radius: 6px;
box-shadow: 2px 2px 6px rgba(0,0,0,0.1);
}}
.segment {{
display: flex;
align-items: center;
gap: 20px;
margin-bottom: 40px;
}}
.text {{
flex: 2;
}}
.media {{
flex: 3;
display: flex;
flex-direction: column;
gap: 10px;
}}
</style>
</head>
<body>
<h1>Annotated Transcript for {html_mod.escape(video_id)}</h1>
<p>Uploaded video file: {html_mod.escape(os.path.basename(video_path))}</p>
"""]

    for time_range, text in entries:
        start = time_range.split(" --> ")[0]
        start_sec = int(parse_timestamp(start))
        screenshot_path = os.path.join(screenshot_dir, f"{video_id}_{start_sec}.jpg")
        plot_path = os.path.join(plot_dir, f"{video_id}_{start_sec}_sound.png")

        # Images are optional: screenshot capture or plotting may have failed
        # for an individual cue, so fall back to an empty src.
        screenshot_b64 = file_to_base64(screenshot_path) if os.path.exists(screenshot_path) else ""
        plot_b64 = file_to_base64(plot_path) if os.path.exists(plot_path) else ""

        # Escape transcript text: it is ASR output and may contain <, > or &,
        # which would otherwise be interpreted as markup (HTML injection).
        parts.append(f"""
<div class="segment">
<div class="text">
<h3>{time_range}</h3>
<p contenteditable="true">{html_mod.escape(text)}</p>
</div>
<div class="media">
<img src="{screenshot_b64}" alt="Screenshot at {start_sec}s">
<img src="{plot_b64}" alt="Voice energy plot at {start_sec}s">
</div>
</div>
""")

    parts.append("</body></html>")

    with open(output_html_path, "w", encoding="utf-8") as f:
        f.write("".join(parts))
    return output_html_path
150
+
151
def process(video_file):
    """End-to-end pipeline: uploaded video -> annotated, editable HTML transcript.

    Steps: create a per-session work directory, extract the audio with
    ffmpeg, transcribe it with Whisper into a VTT, compute a per-frame
    voice-band (300-3000 Hz) dB curve, then grab a screenshot and a
    sound plot for every cue and render everything into one HTML file.

    Returns the path of the generated HTML (a gr.File-compatible output).
    """
    session_id = str(uuid.uuid4())
    work_dir = os.path.join("session_data", session_id)
    shots_dir = os.path.join(work_dir, "screenshots")
    curves_dir = os.path.join(work_dir, "plots")
    for directory in (work_dir, shots_dir, curves_dir):
        os.makedirs(directory, exist_ok=True)

    video_path = video_file.name
    video_id = os.path.splitext(os.path.basename(video_path))[0]

    # Audio extraction + transcription round-trips through a VTT file.
    audio_path = extract_audio(video_path, work_dir)
    result = model.transcribe(audio_path)
    vtt_path = os.path.join(work_dir, f"{video_id}.vtt")
    write_vtt(result["segments"], vtt_path)
    entries = parse_vtt(vtt_path)

    # Voice intensity: mean STFT magnitude in the 300-3000 Hz band per
    # frame, converted to dB (epsilon guards log10 of zero).
    samples, rate = librosa.load(audio_path, sr=None)
    spectrum = np.abs(librosa.stft(samples, n_fft=2048, hop_length=512))
    bin_freqs = librosa.fft_frequencies(sr=rate, n_fft=2048)
    in_band = (bin_freqs >= 300) & (bin_freqs <= 3000)
    band_db = 20 * np.log10(spectrum[in_band, :].mean(axis=0) + 1e-6)
    frame_times = librosa.frames_to_time(np.arange(len(band_db)), sr=rate, hop_length=512)

    # One screenshot + one sound plot per cue, keyed by whole-second start.
    for time_range, _ in entries:
        cue_start = parse_timestamp(time_range.split(" --> ")[0])
        stem = f"{video_id}_{int(cue_start)}"
        capture_screenshot(video_path, cue_start, os.path.join(shots_dir, f"{stem}.jpg"))
        save_voice_plot(frame_times, band_db, cue_start, os.path.join(curves_dir, f"{stem}_sound.png"))

    html_output_path = os.path.join(work_dir, f"{video_id}.html")
    return generate_html(entries, video_id, video_path, shots_dir, curves_dir, html_output_path)
196
+
197
# Gradio UI: a single video-file input wired to `process`, returning the
# generated HTML file for download.
demo = gr.Interface(
    fn=process,
    inputs=[gr.File(label="Upload Video", file_types=[".mp4", ".mov", ".mkv"])],
    outputs=gr.File(label="Download Annotated HTML"),
    title="Video Annotated Transcript",
    description="🎥 Upload a video file (mp4/mov/mkv). The tool will transcribe speech, capture screenshots, analyze sound intensity, and generate an editable HTML transcript."
)
204
+
205
if __name__ == "__main__":
    # Launch the Gradio app only when run as a script (not on import).
    demo.launch()