lochn committed on
Commit
5b5fd29
·
verified ·
1 Parent(s): 386b12b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +324 -251
app.py CHANGED
@@ -4,21 +4,22 @@ import time
4
  import tempfile
5
  import shutil
6
  from pathlib import Path
7
- from typing import List, Dict, Optional
8
- import threading
9
  import json
 
 
 
10
 
11
  import gradio as gr
12
- import torch
13
  import numpy as np
14
 
15
  # Try to import optional dependencies
16
  try:
17
  import whisper
18
  WHISPER_AVAILABLE = True
 
19
  except ImportError:
20
  WHISPER_AVAILABLE = False
21
- print("Whisper not available, will use fallback")
22
 
23
  try:
24
  import spacy
@@ -26,67 +27,28 @@ try:
26
  try:
27
  nlp = spacy.load("en_core_web_sm")
28
  SPACY_AVAILABLE = True
 
29
  except OSError:
30
  SPACY_AVAILABLE = False
31
- print("spaCy model not available, using fallback")
32
  except ImportError:
33
  SPACY_AVAILABLE = False
34
- print("spaCy not available, using fallback")
35
 
36
  try:
37
  from transformers import pipeline
 
38
  TRANSFORMERS_AVAILABLE = True
 
39
  except ImportError:
40
  TRANSFORMERS_AVAILABLE = False
41
- print("Transformers not available, using fallback")
42
-
43
-
44
- # Global timeout handler
45
- class TimeoutError(Exception):
46
- pass
47
-
48
-
49
- def run_with_timeout(func, timeout_seconds, *args, **kwargs):
50
- """Run a function with a timeout"""
51
- result = [None]
52
- exception = [None]
53
-
54
- def target():
55
- try:
56
- result[0] = func(*args, **kwargs)
57
- except Exception as e:
58
- exception[0] = e
59
-
60
- thread = threading.Thread(target=target)
61
- thread.daemon = True
62
- thread.start()
63
- thread.join(timeout_seconds)
64
-
65
- if thread.is_alive():
66
- raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
67
-
68
- if exception[0]:
69
- raise exception[0]
70
-
71
- return result[0]
72
-
73
-
74
- def check_dependencies():
75
- """Check which dependencies are available"""
76
- deps = {
77
- 'ffmpeg': check_ffmpeg(),
78
- 'whisper': WHISPER_AVAILABLE,
79
- 'spacy': SPACY_AVAILABLE,
80
- 'transformers': TRANSFORMERS_AVAILABLE
81
- }
82
- print(f"Available dependencies: {deps}")
83
- return deps
84
 
85
 
86
  def check_ffmpeg():
87
  """Check if ffmpeg is available"""
88
  try:
89
- result = subprocess.run(["ffmpeg", "-version"], capture_output=True, timeout=10)
90
  return result.returncode == 0
91
  except:
92
  return False
@@ -99,7 +61,7 @@ def get_video_info(video_path: str) -> Dict:
99
  "ffprobe", "-v", "quiet", "-print_format", "json", "-show_format",
100
  "-show_streams", video_path
101
  ]
102
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
103
 
104
  if result.returncode == 0:
105
  info = json.loads(result.stdout)
@@ -141,12 +103,7 @@ def extract_audio_simple(video_path: str, audio_path: str, start_time: float = 0
141
 
142
  print(f"Extracting audio: {' '.join(cmd)}")
143
 
144
- result = subprocess.run(
145
- cmd,
146
- capture_output=True,
147
- text=True,
148
- timeout=60
149
- )
150
 
151
  if result.returncode == 0:
152
  if os.path.exists(audio_path) and os.path.getsize(audio_path) > 1000:
@@ -159,128 +116,123 @@ def extract_audio_simple(video_path: str, audio_path: str, start_time: float = 0
159
  print(f"FFmpeg error: {result.stderr}")
160
  return False
161
 
162
- except subprocess.TimeoutExpired:
163
- print("Audio extraction timed out")
164
- return False
165
  except Exception as e:
166
  print(f"Error extracting audio: {str(e)}")
167
  return False
168
 
169
 
170
- def transcribe_audio_whisper(audio_path: str) -> List[Dict]:
171
- """Transcribe using OpenAI Whisper"""
172
  try:
173
- print(f"Starting Whisper transcription of {audio_path}")
 
 
 
 
 
 
 
 
 
174
 
 
 
 
 
 
 
 
 
 
 
 
175
  if not WHISPER_AVAILABLE:
176
- return [{"text": "Whisper not available", "timestamp": (0, 30)}]
177
 
178
- def do_transcribe():
179
- model = whisper.load_model("tiny")
180
- result = model.transcribe(audio_path, language="en")
181
- return result
182
 
183
- result = run_with_timeout(do_transcribe, 120)
 
184
 
185
- if result and "segments" in result:
186
- segments = []
187
- for seg in result["segments"]:
188
- segments.append({
189
- "text": seg.get("text", "").strip(),
190
- "timestamp": (seg.get("start", 0), seg.get("end", 30))
191
- })
192
- return segments
193
- elif result and "text" in result:
194
- return [{"text": result["text"], "timestamp": (0, 30)}]
 
 
 
195
  else:
196
- return [{"text": "Transcription failed", "timestamp": (0, 30)}]
197
 
198
  except Exception as e:
199
  print(f"Whisper transcription error: {str(e)}")
200
- return [{"text": f"Transcription failed: {str(e)}", "timestamp": (0, 30)}]
201
 
202
 
203
- def transcribe_audio_transformers(audio_path: str) -> List[Dict]:
204
- """Transcribe using Transformers pipeline as fallback"""
205
  try:
206
- print(f"Starting Transformers transcription of {audio_path}")
207
-
208
  if not TRANSFORMERS_AVAILABLE:
209
- return [{"text": "Transformers not available", "timestamp": (0, 30)}]
210
 
211
- def do_transcribe():
212
- asr = pipeline(
213
- "automatic-speech-recognition",
214
- model="openai/whisper-tiny",
215
- device=0 if torch.cuda.is_available() else -1
216
- )
217
- result = asr(audio_path, return_timestamps=True)
218
- return result
219
 
220
- result = run_with_timeout(do_transcribe, 120)
 
 
 
 
 
221
 
222
- if isinstance(result, dict):
223
- if "chunks" in result:
224
- return result["chunks"]
225
- else:
226
- return [{"text": result.get("text", ""), "timestamp": (0, 30)}]
 
 
227
  else:
228
- return [{"text": str(result), "timestamp": (0, 30)}]
229
 
230
  except Exception as e:
231
  print(f"Transformers transcription error: {str(e)}")
232
- return [{"text": f"Transcription failed: {str(e)}", "timestamp": (0, 30)}]
233
-
234
-
235
- def transcribe_audio_fallback(audio_path: str) -> List[Dict]:
236
- """Fallback transcription method"""
237
- return [{"text": "Transcription not available - no speech recognition models loaded", "timestamp": (0, 30)}]
238
 
239
 
240
- def transcribe_audio(audio_path: str) -> List[Dict]:
241
- """Main transcription function with fallbacks"""
242
  # Try Whisper first
243
  if WHISPER_AVAILABLE:
244
  try:
245
- return transcribe_audio_whisper(audio_path)
246
  except Exception as e:
247
  print(f"Whisper failed: {e}")
248
 
249
  # Try Transformers as fallback
250
  if TRANSFORMERS_AVAILABLE:
251
  try:
252
- return transcribe_audio_transformers(audio_path)
253
  except Exception as e:
254
  print(f"Transformers failed: {e}")
255
 
256
  # Use fallback
257
- return transcribe_audio_fallback(audio_path)
258
-
259
-
260
- def extract_key_phrases_spacy(text: str, top_n: int = 5) -> List[str]:
261
- """Extract key phrases using spaCy"""
262
- if not SPACY_AVAILABLE or nlp is None:
263
- return extract_key_phrases_simple(text, top_n)
264
-
265
- try:
266
- doc = nlp(text)
267
- phrases = [chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2]
268
- seen = set()
269
- unique_phrases = [p for p in phrases if not (p.lower() in seen or seen.add(p.lower()))]
270
- return unique_phrases[:top_n]
271
- except Exception as e:
272
- print(f"spaCy key phrase extraction failed: {e}")
273
- return extract_key_phrases_simple(text, top_n)
274
 
275
 
276
  def extract_key_phrases_simple(text: str, top_n: int = 5) -> List[str]:
277
- """Simple key phrase extraction fallback"""
278
  if not text:
279
  return []
280
 
281
  words = text.split()
282
  key_words = [
283
- w.strip('.,!?";') for w in words
284
  if len(w) > 4 and w.isalpha() and w.lower() not in {
285
  'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
286
  'were', 'said', 'each', 'which', 'their', 'time', 'would', 'there'
@@ -293,48 +245,8 @@ def extract_key_phrases_simple(text: str, top_n: int = 5) -> List[str]:
293
  return unique_words[:top_n]
294
 
295
 
296
- def extract_key_phrases(text: str, top_n: int = 5) -> List[str]:
297
- """Main key phrase extraction with fallback"""
298
- if SPACY_AVAILABLE:
299
- return extract_key_phrases_spacy(text, top_n)
300
- else:
301
- return extract_key_phrases_simple(text, top_n)
302
-
303
-
304
- def summarize_text_transformers(text: str) -> str:
305
- """Summarize using transformers"""
306
- if not TRANSFORMERS_AVAILABLE or len(text.split()) < 10:
307
- return summarize_text_simple(text)
308
-
309
- try:
310
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
311
-
312
- words = text.split()
313
- if len(words) > 500:
314
- text = " ".join(words[:500])
315
-
316
- input_length = len(words)
317
- max_new_tokens = min(100, max(20, input_length // 3))
318
- min_length = min(15, max(5, input_length // 8))
319
-
320
- result = summarizer(
321
- text,
322
- max_new_tokens=max_new_tokens,
323
- min_length=min_length,
324
- do_sample=False
325
- )
326
-
327
- if isinstance(result, list) and len(result) > 0:
328
- return result[0]["summary_text"].strip()
329
- return text
330
-
331
- except Exception as e:
332
- print(f"Transformers summarization failed: {e}")
333
- return summarize_text_simple(text)
334
-
335
-
336
  def summarize_text_simple(text: str) -> str:
337
- """Simple text summarization fallback"""
338
  if not text or len(text.split()) < 10:
339
  return text
340
 
@@ -346,19 +258,12 @@ def summarize_text_simple(text: str) -> str:
346
  elif len(sentences) <= 5:
347
  return '. '.join(sentences[:2]) + '.'
348
  else:
 
349
  middle_idx = len(sentences) // 2
350
  summary_sentences = [sentences[0], sentences[middle_idx], sentences[-1]]
351
  return '. '.join(summary_sentences) + '.'
352
 
353
 
354
- def summarize_text(text: str) -> str:
355
- """Main summarization function with fallback"""
356
- if TRANSFORMERS_AVAILABLE:
357
- return summarize_text_transformers(text)
358
- else:
359
- return summarize_text_simple(text)
360
-
361
-
362
  def format_timestamp(seconds: float) -> str:
363
  """Format seconds into MM:SS format"""
364
  minutes = int(seconds // 60)
@@ -371,46 +276,64 @@ def process_video_segment(video_path: str, start_time: float, duration: float, s
371
  try:
372
  print(f"Processing segment {segment_id}: {start_time}s - {start_time + duration}s")
373
 
 
374
  audio_path = os.path.join(temp_dir, f"segment_{segment_id:03d}.wav")
 
375
 
 
376
  if not extract_audio_simple(video_path, audio_path, start_time, duration):
377
  return {
378
  "segment": segment_id,
379
  "start_time": format_timestamp(start_time),
380
  "end_time": format_timestamp(start_time + duration),
 
 
381
  "text": "Audio extraction failed",
382
  "summary": "Failed to process this segment",
383
- "key_phrases": []
 
384
  }
385
 
386
- segments = transcribe_audio(audio_path)
387
- full_text = " ".join([seg["text"] for seg in segments if seg["text"]])
 
388
 
 
 
 
 
389
  try:
390
  os.remove(audio_path)
391
  except:
392
  pass
393
 
394
- if not full_text.strip():
395
  return {
396
  "segment": segment_id,
397
  "start_time": format_timestamp(start_time),
398
  "end_time": format_timestamp(start_time + duration),
399
- "text": "No speech detected",
 
 
400
  "summary": "No content in this segment",
401
- "key_phrases": []
 
402
  }
403
 
404
- summary = summarize_text(full_text)
405
- key_phrases = extract_key_phrases(full_text)
 
406
 
407
  return {
408
  "segment": segment_id,
409
  "start_time": format_timestamp(start_time),
410
  "end_time": format_timestamp(start_time + duration),
411
- "text": full_text,
 
 
412
  "summary": summary,
413
- "key_phrases": key_phrases
 
414
  }
415
 
416
  except Exception as e:
@@ -419,47 +342,49 @@ def process_video_segment(video_path: str, start_time: float, duration: float, s
419
  "segment": segment_id,
420
  "start_time": format_timestamp(start_time),
421
  "end_time": format_timestamp(start_time + duration),
 
 
422
  "text": f"Processing failed: {str(e)}",
423
  "summary": "Error occurred during processing",
424
- "key_phrases": []
 
425
  }
426
 
427
 
428
  def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
429
  """Main pipeline function"""
430
  if not video_file:
431
- return [{"error": "No video file provided"}]
432
-
433
- # Check dependencies
434
- deps = check_dependencies()
435
 
436
- if not deps['ffmpeg']:
437
- return [{"error": "FFmpeg is not available in this environment"}]
438
-
439
- if not (deps['whisper'] or deps['transformers']):
440
- return [{"error": "No speech recognition models available. Please install whisper or transformers."}]
441
 
442
  print(f"Processing video: {video_file}")
443
  progress(0.1, desc="Analyzing video...")
444
 
 
445
  video_info = get_video_info(video_file)
446
  print(f"Video info: {video_info}")
447
 
448
  if not video_info['has_audio']:
449
- return [{"error": "Video has no audio track"}]
450
 
451
  duration = video_info['duration']
452
  if duration == 0:
453
- return [{"error": "Could not determine video duration"}]
454
 
 
455
  max_duration = min(duration, 600) # Max 10 minutes
456
  segment_length = 120 # 2 minutes per segment
457
 
458
  progress(0.2, desc=f"Video duration: {duration:.1f}s, processing {max_duration:.1f}s...")
459
 
 
460
  temp_dir = tempfile.mkdtemp(prefix="lecture_capture_")
461
 
462
  try:
 
463
  segments_to_process = []
464
  current_time = 0
465
  segment_id = 1
@@ -479,6 +404,7 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
479
 
480
  print(f"Will process {len(segments_to_process)} segments")
481
 
 
482
  timeline = []
483
  for i, seg_info in enumerate(segments_to_process):
484
  progress(
@@ -487,9 +413,7 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
487
  )
488
 
489
  try:
490
- result = run_with_timeout(
491
- process_video_segment,
492
- 300,
493
  video_file,
494
  seg_info['start_time'],
495
  seg_info['duration'],
@@ -498,66 +422,205 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
498
  )
499
  timeline.append(result)
500
 
501
- except TimeoutError:
502
- print(f"Segment {i+1} timed out")
503
- timeline.append({
504
- "segment": seg_info['segment_id'],
505
- "start_time": format_timestamp(seg_info['start_time']),
506
- "end_time": format_timestamp(seg_info['start_time'] + seg_info['duration']),
507
- "text": "Processing timed out",
508
- "summary": "Segment processing exceeded time limit",
509
- "key_phrases": []
510
- })
511
  except Exception as e:
512
  print(f"Error processing segment {i+1}: {str(e)}")
513
  timeline.append({
514
  "segment": seg_info['segment_id'],
515
  "start_time": format_timestamp(seg_info['start_time']),
516
  "end_time": format_timestamp(seg_info['start_time'] + seg_info['duration']),
 
 
517
  "text": f"Error: {str(e)}",
518
  "summary": "Processing failed",
519
- "key_phrases": []
 
520
  })
521
 
522
- progress(1.0, desc="Processing complete!")
523
 
524
  if not timeline:
525
- return [{"error": "No segments were successfully processed"}]
526
 
527
- # Add dependency info to the result
528
- timeline.insert(0, {
529
- "info": "Processing completed",
530
- "dependencies_used": {k: v for k, v in deps.items() if v},
531
- "total_segments": len(timeline) - 1
532
- })
 
 
533
 
534
- return timeline
535
 
536
  except Exception as e:
537
  import traceback
538
  print(f"Pipeline error: {str(e)}")
539
  print(traceback.format_exc())
540
- return [{"error": f"Pipeline failed: {str(e)}"}]
541
 
542
  finally:
543
- try:
544
- shutil.rmtree(temp_dir)
545
- print("Cleaned up temporary files")
546
- except Exception as e:
547
- print(f"Failed to clean up temp directory: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
548
 
549
 
550
  def create_interface():
551
  with gr.Blocks(title="Lecture Capture AI Pipeline", theme=gr.themes.Soft()) as demo:
552
  gr.Markdown("""
553
- # πŸŽ“ Lecture Capture AI Pipeline (Robust Version)
554
 
555
  Upload a lecture video to automatically generate:
556
  - πŸ“ Transcription with timestamps
557
  - πŸ“‹ Summaries for each segment
558
  - πŸ”‘ Key phrases extraction
559
-
560
- **Features**: Automatic fallbacks, dependency checking, enhanced error handling
561
  """)
562
 
563
  with gr.Row():
@@ -573,46 +636,56 @@ def create_interface():
573
  size="lg"
574
  )
575
 
 
 
 
 
 
 
576
  gr.Markdown("""
577
  ### πŸ’‘ Tips:
578
  - Videos up to 10 minutes work best
579
  - Clear audio improves results
580
- - Processing takes 2-5 minutes
581
- - Automatic fallbacks if models unavailable
582
  """)
583
 
584
  with gr.Column(scale=2):
585
- output_json = gr.JSON(
586
- label="πŸ“Š Generated Timeline",
587
- height=600
588
- )
 
 
 
 
 
 
 
589
 
590
  process_btn.click(
591
  fn=run_pipeline,
592
  inputs=[video_input],
593
- outputs=[output_json],
594
  show_progress=True
595
  )
596
 
597
  gr.Markdown("""
598
  ### πŸ”§ Technical Details:
599
- - Primary: OpenAI Whisper for transcription
600
- - Fallback: Transformers pipeline
601
- - Text processing: spaCy + simple fallbacks
602
- - Automatic dependency detection
603
  """)
604
 
605
  return demo
606
 
607
 
608
  if __name__ == "__main__":
609
- # Check dependencies on startup
610
- deps = check_dependencies()
611
- print("=== Dependency Check ===")
612
- for dep, available in deps.items():
613
- status = "βœ…" if available else "❌"
614
- print(f"{status} {dep}: {available}")
615
- print("========================")
616
 
617
  demo = create_interface()
618
  demo.launch(debug=True)
 
4
  import tempfile
5
  import shutil
6
  from pathlib import Path
 
 
7
  import json
8
+ import datetime
9
+ import threading
10
+ from typing import List, Dict, Optional
11
 
12
  import gradio as gr
 
13
  import numpy as np
14
 
15
  # Try to import optional dependencies
16
  try:
17
  import whisper
18
  WHISPER_AVAILABLE = True
19
+ print("βœ… Whisper available")
20
  except ImportError:
21
  WHISPER_AVAILABLE = False
22
+ print("❌ Whisper not available")
23
 
24
  try:
25
  import spacy
 
27
  try:
28
  nlp = spacy.load("en_core_web_sm")
29
  SPACY_AVAILABLE = True
30
+ print("βœ… spaCy model available")
31
  except OSError:
32
  SPACY_AVAILABLE = False
33
+ print("❌ spaCy model not available")
34
  except ImportError:
35
  SPACY_AVAILABLE = False
36
+ print("❌ spaCy not available")
37
 
38
  try:
39
  from transformers import pipeline
40
+ import torch
41
  TRANSFORMERS_AVAILABLE = True
42
+ print("βœ… Transformers available")
43
  except ImportError:
44
  TRANSFORMERS_AVAILABLE = False
45
+ print("❌ Transformers not available")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
def check_ffmpeg():
    """Check if ffmpeg is available.

    Runs ``ffmpeg -version`` and reports whether it exited with status 0.

    Returns:
        bool: True when the command ran and returned 0; False when it
        returned a non-zero status or could not be launched at all.
    """
    try:
        result = subprocess.run(["ffmpeg", "-version"], capture_output=True)
    except (OSError, subprocess.SubprocessError):
        # Binary missing / not executable. Deliberately narrower than a bare
        # ``except`` so KeyboardInterrupt and SystemExit still propagate.
        return False
    return result.returncode == 0
 
61
  "ffprobe", "-v", "quiet", "-print_format", "json", "-show_format",
62
  "-show_streams", video_path
63
  ]
64
+ result = subprocess.run(cmd, capture_output=True, text=True)
65
 
66
  if result.returncode == 0:
67
  info = json.loads(result.stdout)
 
103
 
104
  print(f"Extracting audio: {' '.join(cmd)}")
105
 
106
+ result = subprocess.run(cmd, capture_output=True, text=True)
 
 
 
 
 
107
 
108
  if result.returncode == 0:
109
  if os.path.exists(audio_path) and os.path.getsize(audio_path) > 1000:
 
116
  print(f"FFmpeg error: {result.stderr}")
117
  return False
118
 
 
 
 
119
  except Exception as e:
120
  print(f"Error extracting audio: {str(e)}")
121
  return False
122
 
123
 
def extract_frame(video_path: str, timestamp: float, output_path: str) -> bool:
    """Save one frame of ``video_path`` taken at ``timestamp`` to ``output_path``.

    Invokes ffmpeg with ``-ss <timestamp>``, ``-vframes 1`` and ``-q:v 2``,
    overwriting any existing output (``-y``).

    Returns:
        True only when ffmpeg exits with status 0 and ``output_path`` exists
        afterwards. Any exception is printed and reported as False.
    """
    try:
        ffmpeg_args = ["ffmpeg", "-y"]
        ffmpeg_args.extend(["-ss", str(timestamp)])
        ffmpeg_args.extend(["-i", video_path])
        ffmpeg_args.extend(["-vframes", "1", "-q:v", "2"])
        ffmpeg_args.append(output_path)

        completed = subprocess.run(ffmpeg_args, capture_output=True, text=True)

        succeeded = completed.returncode == 0 and os.path.exists(output_path)
        return True if succeeded else False
    except Exception as err:
        print(f"Error extracting frame: {err}")
        return False
144
+
145
+
def transcribe_audio_whisper_simple(audio_path: str) -> str:
    """Transcribe ``audio_path`` with the Whisper "tiny" model and return plain text.

    The model is loaded on the first call and cached on the function object,
    so later calls (one per video segment) do not reload it.

    Returns:
        The stripped transcript on success; ``"Whisper not available"`` when
        ``WHISPER_AVAILABLE`` is false; ``"Transcription failed"`` when the
        result carries no ``"text"`` entry; ``"Transcription error: <msg>"``
        when any exception is raised (the exception is printed, not re-raised).
    """
    if not WHISPER_AVAILABLE:
        return "Whisper not available"

    try:
        print(f"Starting Whisper transcription of {audio_path}")

        # Reuse the already-loaded model instead of reloading it per call.
        model = getattr(transcribe_audio_whisper_simple, "_model", None)
        if model is None:
            model = whisper.load_model("tiny")
            transcribe_audio_whisper_simple._model = model

        # Cheapest decoding settings: fixed language, no beam search, and
        # full precision (fp16 disabled).
        options = {
            "language": "en",
            "task": "transcribe",
            "fp16": False,
            "beam_size": 1,
        }

        result = model.transcribe(audio_path, **options)

        if result and "text" in result:
            return result["text"].strip()
        return "Transcription failed"

    except Exception as e:
        print(f"Whisper transcription error: {str(e)}")
        return f"Transcription error: {str(e)}"
176
 
177
 
def transcribe_audio_transformers_simple(audio_path: str) -> str:
    """Transcribe ``audio_path`` with the Transformers ASR pipeline and return plain text.

    Uses ``openai/whisper-tiny`` on CPU. The pipeline is built on the first
    call and cached on the function object. Audio is processed in 30-second
    chunks so inputs longer than Whisper's 30-second window are transcribed
    in full instead of being truncated or rejected.

    Returns:
        The stripped transcript on success; ``"Transformers not available"``
        when ``TRANSFORMERS_AVAILABLE`` is false; ``"Transcription error:
        <msg>"`` when any exception is raised (printed, not re-raised).
    """
    if not TRANSFORMERS_AVAILABLE:
        return "Transformers not available"

    try:
        print(f"Starting Transformers transcription of {audio_path}")

        # Build the pipeline once; constructing it loads the model weights.
        asr = getattr(transcribe_audio_transformers_simple, "_asr", None)
        if asr is None:
            asr = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-tiny",
                device=-1,  # Force CPU
            )
            transcribe_audio_transformers_simple._asr = asr

        # chunk_length_s enables chunked long-form transcription.
        result = asr(audio_path, chunk_length_s=30)

        if isinstance(result, dict) and "text" in result:
            return result["text"].strip()
        if isinstance(result, str):
            return result.strip()
        return str(result)

    except Exception as e:
        print(f"Transformers transcription error: {str(e)}")
        return f"Transcription error: {str(e)}"
 
 
 
 
 
206
 
207
 
def transcribe_audio_simple(audio_path: str) -> str:
    """Transcribe ``audio_path`` to plain text, trying Whisper then Transformers.

    The backend helpers report failure by returning ``"Transcription failed"``
    or a string starting with ``"Transcription error"`` rather than raising,
    so those results are treated as failures here and the next backend is
    tried. Exceptions raised by a backend are handled the same way.

    Returns:
        The first successful transcript. If every available backend failed,
        the last failure message. If no backend is available,
        ``"Transcription not available - no speech recognition models loaded"``.
    """
    backends = []
    if WHISPER_AVAILABLE:
        backends.append(("Whisper", transcribe_audio_whisper_simple))
    if TRANSFORMERS_AVAILABLE:
        backends.append(("Transformers", transcribe_audio_transformers_simple))

    last_failure = None
    for label, backend in backends:
        try:
            text = backend(audio_path)
        except Exception as e:
            print(f"{label} failed: {e}")
            last_failure = f"Transcription error: {e}"
            continue

        # Failure sentinels returned (not raised) by the backend helpers.
        if text == "Transcription failed" or text.startswith("Transcription error"):
            print(f"{label} failed: {text}")
            last_failure = text
            continue

        return text

    if last_failure is not None:
        return last_failure

    # Use fallback
    return "Transcription not available - no speech recognition models loaded"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
 
228
  def extract_key_phrases_simple(text: str, top_n: int = 5) -> List[str]:
229
+ """Simple key phrase extraction"""
230
  if not text:
231
  return []
232
 
233
  words = text.split()
234
  key_words = [
235
+ w.strip('.,!?";:()') for w in words
236
  if len(w) > 4 and w.isalpha() and w.lower() not in {
237
  'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
238
  'were', 'said', 'each', 'which', 'their', 'time', 'would', 'there'
 
245
  return unique_words[:top_n]
246
 
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  def summarize_text_simple(text: str) -> str:
249
+ """Simple text summarization"""
250
  if not text or len(text.split()) < 10:
251
  return text
252
 
 
258
  elif len(sentences) <= 5:
259
  return '. '.join(sentences[:2]) + '.'
260
  else:
261
+ # Take first, middle, and last sentences
262
  middle_idx = len(sentences) // 2
263
  summary_sentences = [sentences[0], sentences[middle_idx], sentences[-1]]
264
  return '. '.join(summary_sentences) + '.'
265
 
266
 
 
 
 
 
 
 
 
 
267
  def format_timestamp(seconds: float) -> str:
268
  """Format seconds into MM:SS format"""
269
  minutes = int(seconds // 60)
 
276
  try:
277
  print(f"Processing segment {segment_id}: {start_time}s - {start_time + duration}s")
278
 
279
+ # Create paths
280
  audio_path = os.path.join(temp_dir, f"segment_{segment_id:03d}.wav")
281
+ frame_path = os.path.join(temp_dir, f"frame_{segment_id:03d}.jpg")
282
 
283
+ # Extract audio for this segment
284
  if not extract_audio_simple(video_path, audio_path, start_time, duration):
285
  return {
286
  "segment": segment_id,
287
  "start_time": format_timestamp(start_time),
288
  "end_time": format_timestamp(start_time + duration),
289
+ "start_seconds": start_time,
290
+ "end_seconds": start_time + duration,
291
  "text": "Audio extraction failed",
292
  "summary": "Failed to process this segment",
293
+ "key_phrases": [],
294
+ "frame": None
295
  }
296
 
297
+ # Extract a frame from the middle of the segment
298
+ frame_time = start_time + (duration / 2)
299
+ frame_extracted = extract_frame(video_path, frame_time, frame_path)
300
 
301
+ # Transcribe audio
302
+ text = transcribe_audio_simple(audio_path)
303
+
304
+ # Clean up audio file
305
  try:
306
  os.remove(audio_path)
307
  except:
308
  pass
309
 
310
+ if not text or text.startswith("Transcription"):
311
  return {
312
  "segment": segment_id,
313
  "start_time": format_timestamp(start_time),
314
  "end_time": format_timestamp(start_time + duration),
315
+ "start_seconds": start_time,
316
+ "end_seconds": start_time + duration,
317
+ "text": text or "No speech detected",
318
  "summary": "No content in this segment",
319
+ "key_phrases": [],
320
+ "frame": frame_path if frame_extracted else None
321
  }
322
 
323
+ # Generate summary and key phrases
324
+ summary = summarize_text_simple(text)
325
+ key_phrases = extract_key_phrases_simple(text)
326
 
327
  return {
328
  "segment": segment_id,
329
  "start_time": format_timestamp(start_time),
330
  "end_time": format_timestamp(start_time + duration),
331
+ "start_seconds": start_time,
332
+ "end_seconds": start_time + duration,
333
+ "text": text,
334
  "summary": summary,
335
+ "key_phrases": key_phrases,
336
+ "frame": frame_path if frame_extracted else None
337
  }
338
 
339
  except Exception as e:
 
342
  "segment": segment_id,
343
  "start_time": format_timestamp(start_time),
344
  "end_time": format_timestamp(start_time + duration),
345
+ "start_seconds": start_time,
346
+ "end_seconds": start_time + duration,
347
  "text": f"Processing failed: {str(e)}",
348
  "summary": "Error occurred during processing",
349
+ "key_phrases": [],
350
+ "frame": None
351
  }
352
 
353
 
354
  def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
355
  """Main pipeline function"""
356
  if not video_file:
357
+ return [], "No video file provided", None
 
 
 
358
 
359
+ # Check if ffmpeg is available
360
+ if not check_ffmpeg():
361
+ return [], "FFmpeg is not available in this environment", None
 
 
362
 
363
  print(f"Processing video: {video_file}")
364
  progress(0.1, desc="Analyzing video...")
365
 
366
+ # Get video information
367
  video_info = get_video_info(video_file)
368
  print(f"Video info: {video_info}")
369
 
370
  if not video_info['has_audio']:
371
+ return [], "Video has no audio track", None
372
 
373
  duration = video_info['duration']
374
  if duration == 0:
375
+ return [], "Could not determine video duration", None
376
 
377
+ # Limit processing time
378
  max_duration = min(duration, 600) # Max 10 minutes
379
  segment_length = 120 # 2 minutes per segment
380
 
381
  progress(0.2, desc=f"Video duration: {duration:.1f}s, processing {max_duration:.1f}s...")
382
 
383
+ # Create temporary directory
384
  temp_dir = tempfile.mkdtemp(prefix="lecture_capture_")
385
 
386
  try:
387
+ # Calculate segments
388
  segments_to_process = []
389
  current_time = 0
390
  segment_id = 1
 
404
 
405
  print(f"Will process {len(segments_to_process)} segments")
406
 
407
+ # Process each segment
408
  timeline = []
409
  for i, seg_info in enumerate(segments_to_process):
410
  progress(
 
413
  )
414
 
415
  try:
416
+ result = process_video_segment(
 
 
417
  video_file,
418
  seg_info['start_time'],
419
  seg_info['duration'],
 
422
  )
423
  timeline.append(result)
424
 
 
 
 
 
 
 
 
 
 
 
425
  except Exception as e:
426
  print(f"Error processing segment {i+1}: {str(e)}")
427
  timeline.append({
428
  "segment": seg_info['segment_id'],
429
  "start_time": format_timestamp(seg_info['start_time']),
430
  "end_time": format_timestamp(seg_info['start_time'] + seg_info['duration']),
431
+ "start_seconds": seg_info['start_time'],
432
+ "end_seconds": seg_info['start_time'] + seg_info['duration'],
433
  "text": f"Error: {str(e)}",
434
  "summary": "Processing failed",
435
+ "key_phrases": [],
436
+ "frame": None
437
  })
438
 
439
+ progress(0.9, desc="Generating visual timeline...")
440
 
441
  if not timeline:
442
+ return [], "No segments were successfully processed", None
443
 
444
+ # Generate HTML for visual timeline
445
+ html_timeline = generate_visual_timeline(timeline, video_file)
446
+
447
+ # Generate summary of the entire video
448
+ all_text = " ".join([segment["text"] for segment in timeline if not segment["text"].startswith("Error") and not segment["text"].startswith("Processing")])
449
+ video_summary = summarize_text_simple(all_text) if all_text else "No valid transcription available"
450
+
451
+ progress(1.0, desc="Processing complete!")
452
 
453
+ return timeline, html_timeline, video_summary
454
 
455
  except Exception as e:
456
  import traceback
457
  print(f"Pipeline error: {str(e)}")
458
  print(traceback.format_exc())
459
+ return [], f"Pipeline failed: {str(e)}", None
460
 
461
  finally:
462
+ # Don't delete temp_dir as we need the frames for display
463
+ # We'll clean it up at the end of the session
464
+ pass
465
+
466
+
def generate_visual_timeline(timeline: List[Dict], video_path: str) -> str:
    """Generate HTML for the visual timeline.

    Each timeline entry becomes one card with a thumbnail (embedded as a
    base64 JPEG data URI when its ``"frame"`` file can be read, otherwise a
    placeholder), the segment time range, and either an error message or the
    summary, transcript and key-phrase tags.

    All segment-derived text is HTML-escaped before interpolation, so markup
    characters in a transcript are shown literally instead of being parsed.

    Args:
        timeline: Segment dicts; the keys read are ``segment``,
            ``start_time``, ``end_time``, ``text``, ``summary``,
            ``key_phrases`` and ``frame``. Entries containing an ``"info"``
            key are skipped.
        video_path: Unused; kept so existing callers keep working.

    Returns:
        An HTML fragment (inline ``<style>`` plus container ``<div>``), or a
        short ``<p>`` message when ``timeline`` is empty.
    """
    import base64
    from html import escape

    if not timeline:
        return "<p>No timeline data available</p>"

    placeholder = (
        '<div class="timeline-placeholder" '
        'style="width:160px;height:90px;">No thumbnail</div>'
    )
    # Text prefixes the pipeline uses to mark a failed segment.
    error_prefixes = ("Error", "Processing failed", "Transcription error")

    parts = ["""
    <style>
    .timeline-container {
        font-family: Arial, sans-serif;
        max-width: 100%;
        margin: 0 auto;
    }
    .timeline-segment {
        display: flex;
        margin-bottom: 20px;
        padding: 15px;
        border-radius: 8px;
        background-color: #f9f9f9;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .timeline-segment:nth-child(odd) {
        background-color: #f0f7ff;
    }
    .timeline-thumbnail {
        flex: 0 0 160px;
        margin-right: 15px;
    }
    .timeline-thumbnail img {
        width: 160px;
        height: 90px;
        object-fit: cover;
        border-radius: 4px;
    }
    .timeline-content {
        flex: 1;
    }
    .timeline-header {
        display: flex;
        justify-content: space-between;
        margin-bottom: 8px;
    }
    .timeline-timestamp {
        font-weight: bold;
        color: #555;
    }
    .timeline-summary {
        font-weight: bold;
        margin-bottom: 8px;
    }
    .timeline-text {
        margin-bottom: 8px;
        color: #333;
    }
    .timeline-tags {
        display: flex;
        flex-wrap: wrap;
        gap: 5px;
    }
    .timeline-tag {
        background-color: #e1ecf4;
        color: #39739d;
        padding: 2px 8px;
        border-radius: 12px;
        font-size: 12px;
    }
    .timeline-placeholder {
        background-color: #ddd;
        display: flex;
        align-items: center;
        justify-content: center;
        color: #666;
        font-size: 12px;
    }
    .timeline-error {
        color: #d32f2f;
        font-style: italic;
    }
    </style>
    <div class="timeline-container">
    """]

    for segment in timeline:
        # Skip if this is the info segment
        if "info" in segment:
            continue

        # Coerce to str and tolerate explicit None values before escaping.
        segment_id = escape(str(segment.get("segment", "")))
        start_time = escape(str(segment.get("start_time", "")))
        end_time = escape(str(segment.get("end_time", "")))
        raw_text = str(segment.get("text") or "")
        text = escape(raw_text)
        summary = escape(str(segment.get("summary") or ""))
        key_phrases = segment.get("key_phrases") or []
        frame_path = segment.get("frame")

        # Error detection runs on the raw text, before escaping.
        has_error = raw_text.startswith(error_prefixes)

        thumbnail = placeholder
        if frame_path and os.path.exists(frame_path):
            try:
                with open(frame_path, "rb") as img_file:
                    img_data = base64.b64encode(img_file.read()).decode("ascii")
            except OSError:
                # Unreadable frame: keep the placeholder.
                pass
            else:
                thumbnail = (
                    f'<img src="data:image/jpeg;base64,{img_data}" '
                    f'alt="Frame at {start_time}">'
                )

        parts.append('<div class="timeline-segment">')
        parts.append(f'<div class="timeline-thumbnail">{thumbnail}</div>')
        parts.append('<div class="timeline-content">')
        parts.append(
            '<div class="timeline-header">'
            f'<div class="timeline-timestamp">Segment {segment_id}: '
            f'{start_time} - {end_time}</div>'
            '</div>'
        )

        if has_error:
            parts.append(f'<div class="timeline-error">{text}</div>')
        else:
            parts.append(f'<div class="timeline-summary">{summary}</div>')
            parts.append(f'<div class="timeline-text">{text}</div>')

            if key_phrases:
                tags = "".join(
                    f'<span class="timeline-tag">{escape(str(phrase))}</span>'
                    for phrase in key_phrases
                )
                parts.append(f'<div class="timeline-tags">{tags}</div>')

        parts.append('</div>')  # .timeline-content
        parts.append('</div>')  # .timeline-segment

    parts.append("</div>")
    return "\n".join(parts)
612
 
613
 
614
  def create_interface():
615
  with gr.Blocks(title="Lecture Capture AI Pipeline", theme=gr.themes.Soft()) as demo:
616
  gr.Markdown("""
617
+ # πŸŽ“ Lecture Capture AI Pipeline (Visual Timeline)
618
 
619
  Upload a lecture video to automatically generate:
620
  - πŸ“ Transcription with timestamps
621
  - πŸ“‹ Summaries for each segment
622
  - πŸ”‘ Key phrases extraction
623
+ - πŸ–ΌοΈ Visual timeline with thumbnails
 
624
  """)
625
 
626
  with gr.Row():
 
636
  size="lg"
637
  )
638
 
639
+ video_summary = gr.Textbox(
640
+ label="πŸ“‹ Video Summary",
641
+ placeholder="Video summary will appear here after processing",
642
+ lines=4
643
+ )
644
+
645
  gr.Markdown("""
646
  ### πŸ’‘ Tips:
647
  - Videos up to 10 minutes work best
648
  - Clear audio improves results
649
+ - Processing may take several minutes
 
650
  """)
651
 
652
  with gr.Column(scale=2):
653
+ with gr.Tabs():
654
+ with gr.TabItem("Visual Timeline"):
655
+ timeline_html = gr.HTML(
656
+ label="Visual Timeline",
657
+ value="<p>Timeline will appear here after processing</p>"
658
+ )
659
+
660
+ with gr.TabItem("Raw Data"):
661
+ timeline_json = gr.JSON(
662
+ label="Timeline Data"
663
+ )
664
 
665
  process_btn.click(
666
  fn=run_pipeline,
667
  inputs=[video_input],
668
+ outputs=[timeline_json, timeline_html, video_summary],
669
  show_progress=True
670
  )
671
 
672
  gr.Markdown("""
673
  ### πŸ”§ Technical Details:
674
+ - Uses OpenAI Whisper for transcription
675
+ - Simplified processing for better compatibility
676
+ - Visual timeline with thumbnails
677
+ - No timeouts to ensure processing completes
678
  """)
679
 
680
  return demo
681
 
682
 
if __name__ == "__main__":
    # Report FFmpeg availability at startup; the app is launched either way.
    if check_ffmpeg():
        print("✅ FFmpeg available")
    else:
        print("❌ FFmpeg not available")

    demo = create_interface()
    demo.launch(debug=True)