lochn committed on
Commit
fc8db39
Β·
verified Β·
1 Parent(s): 7b1ae93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +369 -120
app.py CHANGED
@@ -1,24 +1,34 @@
1
  import os
2
  import subprocess
3
  import time
 
 
4
  from pathlib import Path
 
5
 
6
  import spacy
7
  import gradio as gr
8
- from dotenv import load_dotenv
9
- from huggingface_hub import login
10
  from transformers import pipeline
11
 
12
- # β€”β€”β€” Load environment variables β€”β€”β€”
13
- load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # β€”β€”β€” spaCy setup β€”β€”β€”
16
- try:
17
- nlp = spacy.load("en_core_web_sm")
18
- except OSError:
19
- from spacy.cli import download as spacy_download
20
- spacy_download("en_core_web_sm")
21
- nlp = spacy.load("en_core_web_sm")
22
 
23
 
24
  def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
@@ -28,136 +38,375 @@ def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
28
  try:
29
  return func(*args, **kwargs)
30
  except Exception as e:
31
- if attempt < max_retries - 1:
32
- print(f"Rate limit or error, retrying in {delay}s…")
33
- time.sleep(delay)
34
- delay *= backoff
 
 
 
 
35
  else:
36
- print("Maximum retries reached; aborting.")
37
  raise
38
  return wrapper
39
 
40
 
41
- def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = "chunks") -> list[Path]:
42
- Path(output_dir).mkdir(exist_ok=True)
43
- output_pattern = os.path.join(output_dir, "chunk_%03d.mp4")
44
- cmd = [
45
- "ffmpeg", "-y", "-i", input_path,
46
- "-f", "segment", "-segment_time", str(chunk_length),
47
- "-reset_timestamps", "1", output_pattern
48
- ]
49
- subprocess.run(cmd, check=True)
50
- return sorted(Path(output_dir).glob("chunk_*.mp4"))
51
 
52
 
53
- def extract_audio(video_path: str, audio_path: str) -> None:
54
- cmd = [
55
- "ffmpeg", "-y", "-i", video_path,
56
- "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
57
- audio_path
58
- ]
59
- subprocess.run(cmd, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
- def segment_text(segments: list[dict]) -> list[str]:
63
- full = "\n\n".join(seg.get("text", "") for seg in segments)
64
- return [b.strip() for b in full.split("\n\n") if b.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
- def extract_key_phrases(text: str, top_n=5) -> list[str]:
68
- doc = nlp(text)
69
- phrases = [chunk.text for chunk in doc.noun_chunks]
70
- return list(dict.fromkeys(phrases))[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
- def extract_frame(video_path: str, timestamp: str, output_path: str) -> None:
74
- cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", output_path]
75
- subprocess.run(cmd, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
 
78
  @retry_on_rate_limit
79
- def transcribe_audio(asr_pipeline, audio_path: str) -> list[dict]:
80
- result = asr_pipeline(audio_path)
81
- return result.get("chunks", [{"text": result["text"], "timestamp": (0.0, 0.0)}])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
  @retry_on_rate_limit
85
  def summarize_text(summarizer_pipeline, text: str) -> str:
86
- out = summarizer_pipeline(
87
- text,
88
- max_length=200,
89
- min_length=30,
90
- do_sample=False
91
- )
92
- return out[0]["summary_text"].strip()
93
-
94
-
95
- def run_pipeline(video_file: str) -> list[dict]:
96
- hf_token = os.getenv("HF_TOKEN")
97
- if not hf_token:
98
- raise EnvironmentError("HF_TOKEN environment variable is not set.")
99
- login(token=hf_token)
100
-
101
- asr = pipeline(
102
- "automatic-speech-recognition",
103
- model="openai/whisper-large-v2",
104
- chunk_length_s=30,
105
- stride_length_s=(5, 5),
106
- return_timestamps="sentence",
107
- token=hf_token
108
- )
109
- summarizer = pipeline(
110
- "summarization",
111
- model="facebook/bart-large-cnn",
112
- token=hf_token
113
- )
114
-
115
- chunks = chunk_video(video_file)
116
-
117
- segments = []
118
- for chunk in chunks:
119
- wav = str(chunk).replace(".mp4", ".wav")
120
- extract_audio(str(chunk), wav)
121
- for c in transcribe_audio(asr, wav):
122
- segments.append({
123
- "text": c["text"],
124
- "start": f"{int(c['timestamp'][0]//60):02d}:{c['timestamp'][0]%60:06.3f}",
125
- "end": f"{int(c['timestamp'][1]//60):02d}:{c['timestamp'][1]%60:06.3f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- blocks = segment_text(segments)
129
- summaries = [summarize_text(summarizer, b) for b in blocks]
130
- phrases = [extract_key_phrases(b) for b in blocks]
131
-
132
- Path("frames").mkdir(exist_ok=True)
133
- frames = []
134
- for seg in segments:
135
- ts_clean = seg["start"].replace(":", "-")
136
- out = f"frames/frame_{ts_clean}.jpg"
137
- extract_frame(video_file, seg["start"], out)
138
- frames.append(out)
139
-
140
- timeline = []
141
- for seg, sumry, ph, fr in zip(segments, summaries, phrases, frames):
142
- timeline.append({
143
- "start_time": seg["start"],
144
- "end_time": seg["end"],
145
- "summary": sumry,
146
- "key_phrases": ph,
147
- "frame": fr
148
- })
149
-
150
- return timeline
151
-
152
-
153
- # β€”β€”β€” Gradio UI β€”β€”β€”
154
- demo = gr.Blocks()
155
- with demo:
156
- gr.Markdown("# Lecture Capture AI Pipeline (HF-powered)")
157
- vid = gr.Video(label="Lecture Video")
158
- btn = gr.Button("Process")
159
- out = gr.JSON(label="Timeline")
160
- btn.click(fn=run_pipeline, inputs=[vid], outputs=out)
161
 
162
  if __name__ == "__main__":
163
- demo.launch()
 
 
1
  import os
2
  import subprocess
3
  import time
4
+ import tempfile
5
+ import shutil
6
  from pathlib import Path
7
+ from typing import List, Dict, Optional
8
 
9
  import spacy
10
  import gradio as gr
 
 
11
  from transformers import pipeline
12
 
13
# β€”β€”β€” spaCy setup for HF Spaces β€”β€”β€”
def setup_spacy():
    """Load the spaCy English model, downloading it on first use.

    Returns the loaded pipeline, or None when neither loading nor
    downloading succeeds, so callers can degrade gracefully.
    """
    model_name = "en_core_web_sm"
    try:
        return spacy.load(model_name)
    except OSError:
        # Model not installed yet — fall through to the download path.
        pass
    print("Downloading spaCy model...")
    try:
        from spacy.cli import download as spacy_download
        spacy_download(model_name)
        return spacy.load(model_name)
    except Exception as e:
        print(f"Failed to download spaCy model: {e}")
        return None

# Module-level pipeline handle; None when spaCy could not be set up.
nlp = setup_spacy()
 
 
 
 
 
 
32
 
33
 
34
  def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
 
38
  try:
39
  return func(*args, **kwargs)
40
  except Exception as e:
41
+ if "rate limit" in str(e).lower() or "429" in str(e):
42
+ if attempt < max_retries - 1:
43
+ print(f"Rate limit detected, retrying in {delay}s...")
44
+ time.sleep(delay)
45
+ delay *= backoff
46
+ else:
47
+ print("Maximum retries reached for rate limit.")
48
+ raise
49
  else:
50
+ # For non-rate-limit errors, raise immediately
51
  raise
52
  return wrapper
53
 
54
 
55
def check_ffmpeg():
    """Return True when the `ffmpeg` binary can be invoked, else False."""
    probe_cmd = ["ffmpeg", "-version"]
    try:
        subprocess.run(probe_cmd, capture_output=True, check=True)
    except FileNotFoundError:
        # Binary not on PATH at all.
        return False
    except subprocess.CalledProcessError:
        # Binary exists but exited non-zero.
        return False
    return True
 
 
 
62
 
63
 
64
def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = None,
                timeout: int = 300) -> List[Path]:
    """Split a video into fixed-length segments without re-encoding.

    Parameters:
        input_path: path of the source video.
        chunk_length: target segment duration in seconds.
        output_dir: destination for chunk files; a fresh temp dir is
            created when None.
        timeout: seconds to wait for ffmpeg before giving up (new
            parameter; defaults to the previous hard-coded 300).

    Returns the sorted list of chunk paths, or [] on any failure
    (ffmpeg error, timeout, unreadable input) so callers can bail out.
    """
    if output_dir is None:
        output_dir = tempfile.mkdtemp(prefix="chunks_")

    # BUGFIX: parents=True so a nested, not-yet-existing output_dir does not
    # raise FileNotFoundError before ffmpeg even runs (mkdir was previously
    # outside the try block, so that error escaped uncaught).
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_pattern = os.path.join(output_dir, "chunk_%03d.mp4")

    try:
        cmd = [
            "ffmpeg", "-y", "-i", input_path,
            "-f", "segment", "-segment_time", str(chunk_length),
            "-reset_timestamps", "1", "-c", "copy",  # stream copy: no re-encoding
            output_pattern
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

        if result.returncode != 0:
            print(f"FFmpeg error: {result.stderr}")
            return []

        return sorted(Path(output_dir).glob("chunk_*.mp4"))
    except subprocess.TimeoutExpired:
        print("Video chunking timed out")
        return []
    except Exception as e:
        print(f"Error chunking video: {str(e)}")
        return []
92
 
93
 
94
def extract_audio(video_path: str, audio_path: str) -> bool:
    """Extract the audio track to a 16 kHz mono PCM WAV file.

    Returns True on success, False on ffmpeg failure, timeout, or any
    other error (all failures are logged, never raised).
    """
    cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
        audio_path,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        print("Audio extraction timed out")
        return False
    except Exception as e:
        print(f"Error extracting audio: {str(e)}")
        return False
    if proc.returncode == 0:
        return True
    print(f"Audio extraction error: {proc.stderr}")
    return False
114
 
115
 
116
def extract_key_phrases(text: str, top_n: int = 5) -> List[str]:
    """Extract up to top_n key phrases, with a fallback when spaCy is absent."""
    if nlp is None:
        # spaCy unavailable: approximate key phrases with longer alphabetic words,
        # deduplicated in first-seen order.
        candidates = [w for w in text.split() if len(w) > 4 and w.isalpha()]
        return list(dict.fromkeys(candidates))[:top_n]

    try:
        parsed = nlp(text)
        unique_phrases = []
        seen_lower = set()
        # Collect noun chunks longer than 2 chars, case-insensitively deduped,
        # preserving first-occurrence order.
        for noun_chunk in parsed.noun_chunks:
            phrase = noun_chunk.text.strip()
            if len(phrase) <= 2:
                continue
            key = phrase.lower()
            if key in seen_lower:
                continue
            seen_lower.add(key)
            unique_phrases.append(phrase)
        return unique_phrases[:top_n]
    except Exception as e:
        print(f"Error extracting key phrases: {str(e)}")
        return []
135
 
136
 
137
def extract_frame(video_path: str, timestamp: str, output_path: str) -> bool:
    """Grab a single still frame at `timestamp` into `output_path`.

    Returns True on success; logs and returns False on ffmpeg failure,
    timeout, or any other error.
    """
    grab_cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-ss", timestamp, "-frames:v", "1", "-q:v", "2",
        output_path,
    ]
    try:
        proc = subprocess.run(grab_cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        print("Frame extraction timed out")
        return False
    except Exception as e:
        print(f"Error extracting frame: {str(e)}")
        return False
    if proc.returncode != 0:
        print(f"Frame extraction error: {proc.stderr}")
        return False
    return True
153
 
154
 
155
@retry_on_rate_limit
def transcribe_audio(asr_pipeline, audio_path: str) -> List[Dict]:
    """Transcribe an audio file, normalizing output to segment dicts.

    Returns a list of {"text": str, "timestamp": (start, end)} dicts.
    Rate-limit errors are re-raised so @retry_on_rate_limit can retry;
    every other failure degrades to a single placeholder segment.
    """
    try:
        result = asr_pipeline(audio_path)

        if isinstance(result, dict):
            if "chunks" in result:
                return result["chunks"]
            else:
                return [{"text": result.get("text", ""), "timestamp": (0.0, 0.0)}]
        elif isinstance(result, str):
            return [{"text": result, "timestamp": (0.0, 0.0)}]
        else:
            return [{"text": str(result), "timestamp": (0.0, 0.0)}]
    except Exception as e:
        # BUGFIX: previously every exception was swallowed here, which made
        # the @retry_on_rate_limit decorator dead code. Re-raise rate-limit
        # errors (same heuristic the decorator uses) so retries can happen.
        if "rate limit" in str(e).lower() or "429" in str(e):
            raise
        print(f"Transcription error: {str(e)}")
        return [{"text": "Transcription failed", "timestamp": (0.0, 0.0)}]
173
 
174
 
175
@retry_on_rate_limit
def summarize_text(summarizer_pipeline, text: str) -> str:
    """Summarize text with length constraints for HF Spaces.

    Returns a short summary string; falls back to explanatory messages on
    empty input or summarizer failure. Rate-limit errors are re-raised so
    @retry_on_rate_limit can retry them.
    """
    if not text.strip():
        return "No content to summarize."

    # Truncate text if too long for the model.
    # NOTE(review): this limit is in characters while BART's input limit is
    # measured in tokens (~1024); character truncation is a conservative
    # approximation — confirm acceptable quality loss.
    max_length = 1024
    if len(text) > max_length:
        text = text[:max_length]

    try:
        # Scale generation bounds to the input size so short inputs don't
        # request summaries longer than the text itself.
        word_count = len(text.split())
        min_len = min(30, word_count // 4)
        max_len = min(200, word_count // 2)

        # BUGFIX: for a one-word input the old re-clamp produced
        # min_length=10 > max_length=0; keep a small floor on max_len and
        # always leave min_len strictly below it.
        max_len = max(max_len, 5)
        if min_len >= max_len:
            min_len = max(1, max_len - 1)

        result = summarizer_pipeline(
            text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False
        )

        if isinstance(result, list) and len(result) > 0:
            return result[0]["summary_text"].strip()
        return "Failed to generate summary."
    except Exception as e:
        # BUGFIX: re-raise rate-limit errors so the retry decorator is not
        # dead code (it previously never saw any exception from here).
        if "rate limit" in str(e).lower() or "429" in str(e):
            raise
        print(f"Summarization error: {str(e)}")
        return f"Summary generation failed: {str(e)}"
207
+
208
+
209
def format_timestamp(seconds: float) -> str:
    """Format seconds into MM:SS.mmm format (minutes are not capped at 59)."""
    minutes, secs = divmod(seconds, 60)
    return f"{int(minutes):02d}:{secs:06.3f}"
214
+
215
+
216
def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
    """Main pipeline function optimized for HF Spaces.

    Orchestrates: ffmpeg check -> model loading -> video chunking ->
    per-chunk audio extraction + transcription -> summarization and key
    phrase extraction -> timeline assembly. All failures are returned as
    [{"error": ...}] dicts rather than raised, so the Gradio JSON output
    always renders something.
    """
    if not video_file:
        return [{"error": "No video file provided"}]

    # Check if ffmpeg is available
    if not check_ffmpeg():
        return [{"error": "FFmpeg is not available in this environment"}]

    progress(0.1, desc="Initializing models...")

    # Initialize models with error handling
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",  # Use smaller model for HF Spaces
            chunk_length_s=30,
            stride_length_s=(4, 2),
            return_timestamps="word"
        )
        progress(0.2, desc="ASR model loaded...")

        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn"
        )
        progress(0.3, desc="Summarization model loaded...")

    except Exception as e:
        return [{"error": f"Failed to load models: {str(e)}"}]

    # Create temporary directories; everything under temp_dir is deleted in
    # the finally block below.
    temp_dir = tempfile.mkdtemp(prefix="lecture_capture_")
    chunks_dir = os.path.join(temp_dir, "chunks")
    frames_dir = os.path.join(temp_dir, "frames")

    try:
        Path(chunks_dir).mkdir(exist_ok=True)
        Path(frames_dir).mkdir(exist_ok=True)

        progress(0.4, desc="Processing video chunks...")

        # Process video - use shorter chunks for HF Spaces
        chunks = chunk_video(video_file, chunk_length=120, output_dir=chunks_dir)
        if not chunks:
            return [{"error": "No video chunks were created. Video may be corrupted or unsupported format."}]

        progress(0.5, desc=f"Processing {len(chunks)} chunks...")

        # Process each chunk
        all_segments = []
        for i, chunk in enumerate(chunks):
            progress(0.5 + (0.3 * i / len(chunks)), desc=f"Processing chunk {i+1}/{len(chunks)}...")

            # Derive the .wav path from chunk_video's "chunk_NNN.mp4" names.
            wav_path = str(chunk).replace(".mp4", ".wav")

            # Extract audio; skip the chunk entirely on failure.
            if not extract_audio(str(chunk), wav_path):
                continue

            # Transcribe
            try:
                chunk_segments = transcribe_audio(asr, wav_path)

                # Calculate absolute timestamps.
                # Keep this constant in sync with chunk_length=120 passed to
                # chunk_video above — the offset math depends on it.
                chunk_start_time = i * 120  # 120 seconds per chunk

                for seg in chunk_segments:
                    if isinstance(seg.get("timestamp"), tuple) and len(seg["timestamp"]) == 2:
                        start_time = chunk_start_time + seg["timestamp"][0]
                        end_time = chunk_start_time + seg["timestamp"][1]
                    else:
                        # Missing/odd timestamp: attribute the segment to the
                        # whole chunk span.
                        start_time = chunk_start_time
                        end_time = chunk_start_time + 120

                    all_segments.append({
                        "text": seg.get("text", ""),
                        "start": format_timestamp(start_time),
                        "end": format_timestamp(end_time),
                        # Raw seconds kept alongside formatted strings for sorting.
                        "start_seconds": start_time,
                        "end_seconds": end_time
                    })
            except Exception as e:
                print(f"Error processing chunk {i}: {str(e)}")
                continue

        if not all_segments:
            return [{"error": "No segments were successfully processed"}]

        progress(0.8, desc="Generating summaries and extracting key phrases...")

        # Sort segments by start time
        all_segments.sort(key=lambda x: x["start_seconds"])

        # Generate timeline
        timeline = []
        for i, segment in enumerate(all_segments[:20]):  # Limit to 20 segments for HF Spaces
            segment_text = segment["text"]

            # Generate summary
            try:
                summary = summarize_text(summarizer, segment_text) if segment_text else "No content"
            except Exception as e:
                summary = f"Summary failed: {str(e)}"

            # Extract key phrases
            key_phrases = extract_key_phrases(segment_text) if segment_text else []

            # Extract frame (optional, may fail in HF Spaces).
            # NOTE(review): frames land in frames_dir under temp_dir, which is
            # removed in the finally block — so "frame_available" records only
            # that extraction succeeded, not that the file persists. Confirm
            # this is intended.
            frame_path = os.path.join(frames_dir, f"frame_{i:03d}.jpg")
            frame_extracted = extract_frame(video_file, segment["start"], frame_path)

            timeline.append({
                "start_time": segment["start"],
                "end_time": segment["end"],
                "text": segment_text,
                "summary": summary,
                "key_phrases": key_phrases,
                "frame_available": frame_extracted
            })

        progress(1.0, desc="Processing complete!")
        return timeline

    except Exception as e:
        import traceback
        return [{"error": f"Pipeline failed: {str(e)}", "details": traceback.format_exc()}]

    finally:
        # Clean up temporary files (chunks, wavs, frames) regardless of outcome.
        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            print(f"Failed to clean up temp directory: {str(e)}")
350
+
351
+
352
# β€”β€”β€” Gradio UI optimized for HF Spaces β€”β€”β€”
def create_interface():
    """Build and return the Gradio Blocks UI wired to run_pipeline."""
    with gr.Blocks(title="Lecture Capture AI Pipeline", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # πŸŽ“ Lecture Capture AI Pipeline

        Upload a lecture video to automatically generate:
        - πŸ“ Transcription with timestamps
        - πŸ“‹ Summaries for each segment
        - πŸ”‘ Key phrases extraction

        **Note**: This runs on Hugging Face Spaces with limited resources. Processing may take time for longer videos.
        """)

        with gr.Row():
            # Left column: upload control, trigger button, and usage tips.
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="πŸ“Ή Upload Lecture Video",
                    height=300
                )

                process_btn = gr.Button(
                    "πŸš€ Process Video",
                    variant="primary",
                    size="lg"
                )

                gr.Markdown("""
                ### πŸ’‘ Tips:
                - Shorter videos (< 10 minutes) work best
                - Clear audio improves transcription quality
                - Processing may take 2-5 minutes depending on video length
                """)

            # Right column: the pipeline's timeline output as raw JSON.
            with gr.Column(scale=2):
                output_json = gr.JSON(
                    label="πŸ“Š Generated Timeline",
                    height=600
                )

        # Wire the button to the processing pipeline.
        process_btn.click(
            fn=run_pipeline,
            inputs=[video_input],
            outputs=[output_json],
            show_progress=True
        )

        gr.Markdown("""
        ### πŸ”§ Technical Details:
        - Uses Whisper (base) for speech recognition
        - BART for text summarization
        - spaCy for key phrase extraction
        - Optimized for Hugging Face Spaces environment
        """)

    return demo
408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
# Script entry point (used directly by HF Spaces): build the UI and serve it.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()