lochn committed on
Commit
7fd53c2
Β·
verified Β·
1 Parent(s): fc8db39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -75
app.py CHANGED
@@ -9,6 +9,7 @@ from typing import List, Dict, Optional
9
  import spacy
10
  import gradio as gr
11
  from transformers import pipeline
 
12
 
13
  # β€”β€”β€” spaCy setup for HF Spaces β€”β€”β€”
14
  def setup_spacy():
@@ -25,13 +26,12 @@ def setup_spacy():
25
  return nlp
26
  except Exception as e:
27
  print(f"Failed to download spaCy model: {e}")
28
- # Return None if spaCy fails - we'll handle this gracefully
29
  return None
30
 
31
  nlp = setup_spacy()
32
 
33
 
34
- def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
35
  def wrapper(*args, **kwargs):
36
  delay = initial_delay
37
  for attempt in range(max_retries):
@@ -61,7 +61,7 @@ def check_ffmpeg():
61
  return False
62
 
63
 
64
- def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = None) -> List[Path]:
65
  """Chunk video with temporary directory handling for HF Spaces"""
66
  if output_dir is None:
67
  output_dir = tempfile.mkdtemp(prefix="chunks_")
@@ -73,7 +73,7 @@ def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = None
73
  cmd = [
74
  "ffmpeg", "-y", "-i", input_path,
75
  "-f", "segment", "-segment_time", str(chunk_length),
76
- "-reset_timestamps", "1", "-c", "copy", # Use copy to avoid re-encoding
77
  output_pattern
78
  ]
79
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
@@ -97,6 +97,7 @@ def extract_audio(video_path: str, audio_path: str) -> bool:
97
  cmd = [
98
  "ffmpeg", "-y", "-i", video_path,
99
  "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
 
100
  audio_path
101
  ]
102
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
@@ -118,14 +119,12 @@ def extract_key_phrases(text: str, top_n: int = 5) -> List[str]:
118
  if nlp is None:
119
  # Fallback: simple word extraction
120
  words = text.split()
121
- # Get longer words as "key phrases"
122
  key_words = [w for w in words if len(w) > 4 and w.isalpha()]
123
  return list(dict.fromkeys(key_words))[:top_n]
124
 
125
  try:
126
  doc = nlp(text)
127
  phrases = [chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2]
128
- # Remove duplicates while preserving order
129
  seen = set()
130
  unique_phrases = [p for p in phrases if not (p.lower() in seen or seen.add(p.lower()))]
131
  return unique_phrases[:top_n]
@@ -138,79 +137,103 @@ def extract_frame(video_path: str, timestamp: str, output_path: str) -> bool:
138
  """Extract frame with timeout for HF Spaces"""
139
  try:
140
  cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", "-q:v", "2", output_path]
141
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
142
 
143
  if result.returncode != 0:
144
- print(f"Frame extraction error: {result.stderr}")
145
  return False
146
  return True
147
- except subprocess.TimeoutExpired:
148
- print("Frame extraction timed out")
149
- return False
150
- except Exception as e:
151
- print(f"Error extracting frame: {str(e)}")
152
  return False
153
 
154
 
155
  @retry_on_rate_limit
156
  def transcribe_audio(asr_pipeline, audio_path: str) -> List[Dict]:
157
- """Transcribe audio with better error handling"""
158
  try:
159
- result = asr_pipeline(audio_path)
 
 
 
 
 
 
160
 
161
  if isinstance(result, dict):
162
  if "chunks" in result:
163
  return result["chunks"]
164
  else:
165
- return [{"text": result.get("text", ""), "timestamp": (0.0, 0.0)}]
166
- elif isinstance(result, str):
167
- return [{"text": result, "timestamp": (0.0, 0.0)}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  else:
169
- return [{"text": str(result), "timestamp": (0.0, 0.0)}]
 
170
  except Exception as e:
171
  print(f"Transcription error: {str(e)}")
172
- return [{"text": "Transcription failed", "timestamp": (0.0, 0.0)}]
173
 
174
 
175
  @retry_on_rate_limit
176
  def summarize_text(summarizer_pipeline, text: str) -> str:
177
- """Summarize text with length constraints for HF Spaces"""
178
  if not text.strip():
179
  return "No content to summarize."
180
 
181
- # Truncate text if too long for the model
182
- max_length = 1024 # BART's max input length
183
- if len(text) > max_length:
184
- text = text[:max_length]
 
 
 
 
 
 
 
185
 
186
  try:
187
- # Adjust parameters for shorter text
188
- min_len = min(30, len(text.split()) // 4)
189
- max_len = min(200, len(text.split()) // 2)
190
-
191
- if min_len >= max_len:
192
- min_len = max(10, max_len - 10)
193
 
194
  result = summarizer_pipeline(
195
  text,
196
- max_length=max_len,
197
- min_length=min_len,
198
- do_sample=False
 
199
  )
200
 
201
  if isinstance(result, list) and len(result) > 0:
202
- return result[0]["summary_text"].strip()
203
- return "Failed to generate summary."
 
 
204
  except Exception as e:
205
  print(f"Summarization error: {str(e)}")
206
- return f"Summary generation failed: {str(e)}"
207
 
208
 
209
  def format_timestamp(seconds: float) -> str:
210
- """Format seconds into MM:SS.mmm format"""
211
  minutes = int(seconds // 60)
212
- remaining_seconds = seconds % 60
213
- return f"{minutes:02d}:{remaining_seconds:06.3f}"
214
 
215
 
216
  def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
@@ -224,20 +247,24 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
224
 
225
  progress(0.1, desc="Initializing models...")
226
 
227
- # Initialize models with error handling
228
  try:
 
229
  asr = pipeline(
230
  "automatic-speech-recognition",
231
- model="openai/whisper-base", # Use smaller model for HF Spaces
232
- chunk_length_s=30,
233
- stride_length_s=(4, 2),
234
- return_timestamps="word"
 
235
  )
236
  progress(0.2, desc="ASR model loaded...")
237
 
 
238
  summarizer = pipeline(
239
  "summarization",
240
- model="facebook/bart-large-cnn"
 
241
  )
242
  progress(0.3, desc="Summarization model loaded...")
243
 
@@ -255,11 +282,14 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
255
 
256
  progress(0.4, desc="Processing video chunks...")
257
 
258
- # Process video - use shorter chunks for HF Spaces
259
- chunks = chunk_video(video_file, chunk_length=120, output_dir=chunks_dir)
260
  if not chunks:
261
  return [{"error": "No video chunks were created. Video may be corrupted or unsupported format."}]
262
 
 
 
 
263
  progress(0.5, desc=f"Processing {len(chunks)} chunks...")
264
 
265
  # Process each chunk
@@ -271,33 +301,44 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
271
 
272
  # Extract audio
273
  if not extract_audio(str(chunk), wav_path):
 
274
  continue
275
 
276
- # Transcribe
277
  try:
278
  chunk_segments = transcribe_audio(asr, wav_path)
279
 
280
  # Calculate absolute timestamps
281
- chunk_start_time = i * 120 # 120 seconds per chunk
282
 
283
  for seg in chunk_segments:
284
- if isinstance(seg.get("timestamp"), tuple) and len(seg["timestamp"]) == 2:
285
- start_time = chunk_start_time + seg["timestamp"][0]
286
- end_time = chunk_start_time + seg["timestamp"][1]
 
287
  else:
288
  start_time = chunk_start_time
289
- end_time = chunk_start_time + 120
290
 
291
- all_segments.append({
292
- "text": seg.get("text", ""),
293
- "start": format_timestamp(start_time),
294
- "end": format_timestamp(end_time),
295
- "start_seconds": start_time,
296
- "end_seconds": end_time
297
- })
 
 
 
298
  except Exception as e:
299
  print(f"Error processing chunk {i}: {str(e)}")
300
  continue
 
 
 
 
 
 
301
 
302
  if not all_segments:
303
  return [{"error": "No segments were successfully processed"}]
@@ -307,31 +348,27 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
307
  # Sort segments by start time
308
  all_segments.sort(key=lambda x: x["start_seconds"])
309
 
310
- # Generate timeline
311
  timeline = []
312
- for i, segment in enumerate(all_segments[:20]): # Limit to 20 segments for HF Spaces
313
  segment_text = segment["text"]
314
 
315
  # Generate summary
316
  try:
317
- summary = summarize_text(summarizer, segment_text) if segment_text else "No content"
318
  except Exception as e:
319
- summary = f"Summary failed: {str(e)}"
320
 
321
  # Extract key phrases
322
  key_phrases = extract_key_phrases(segment_text) if segment_text else []
323
 
324
- # Extract frame (optional, may fail in HF Spaces)
325
- frame_path = os.path.join(frames_dir, f"frame_{i:03d}.jpg")
326
- frame_extracted = extract_frame(video_file, segment["start"], frame_path)
327
-
328
  timeline.append({
 
329
  "start_time": segment["start"],
330
  "end_time": segment["end"],
331
  "text": segment_text,
332
  "summary": summary,
333
- "key_phrases": key_phrases,
334
- "frame_available": frame_extracted
335
  })
336
 
337
  progress(1.0, desc="Processing complete!")
@@ -360,7 +397,7 @@ def create_interface():
360
  - πŸ“‹ Summaries for each segment
361
  - πŸ”‘ Key phrases extraction
362
 
363
- **Note**: This runs on Hugging Face Spaces with limited resources. Processing may take time for longer videos.
364
  """)
365
 
366
  with gr.Row():
@@ -378,9 +415,10 @@ def create_interface():
378
 
379
  gr.Markdown("""
380
  ### πŸ’‘ Tips:
381
- - Shorter videos (< 10 minutes) work best
382
  - Clear audio improves transcription quality
383
- - Processing may take 2-5 minutes depending on video length
 
384
  """)
385
 
386
  with gr.Column(scale=2):
@@ -398,7 +436,7 @@ def create_interface():
398
 
399
  gr.Markdown("""
400
  ### πŸ”§ Technical Details:
401
- - Uses Whisper (base) for speech recognition
402
  - BART for text summarization
403
  - spaCy for key phrase extraction
404
  - Optimized for Hugging Face Spaces environment
 
9
  import spacy
10
  import gradio as gr
11
  from transformers import pipeline
12
+ import torch
13
 
14
  # β€”β€”β€” spaCy setup for HF Spaces β€”β€”β€”
15
  def setup_spacy():
 
26
  return nlp
27
  except Exception as e:
28
  print(f"Failed to download spaCy model: {e}")
 
29
  return None
30
 
31
  nlp = setup_spacy()
32
 
33
 
34
+ def retry_on_rate_limit(func, max_retries=2, initial_delay=3, backoff=1.5):
35
  def wrapper(*args, **kwargs):
36
  delay = initial_delay
37
  for attempt in range(max_retries):
 
61
  return False
62
 
63
 
64
+ def chunk_video(input_path: str, chunk_length: int = 180, output_dir: str = None) -> List[Path]:
65
  """Chunk video with temporary directory handling for HF Spaces"""
66
  if output_dir is None:
67
  output_dir = tempfile.mkdtemp(prefix="chunks_")
 
73
  cmd = [
74
  "ffmpeg", "-y", "-i", input_path,
75
  "-f", "segment", "-segment_time", str(chunk_length),
76
+ "-reset_timestamps", "1", "-c", "copy",
77
  output_pattern
78
  ]
79
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
 
97
  cmd = [
98
  "ffmpeg", "-y", "-i", video_path,
99
  "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
100
+ "-t", "180", # Limit to 3 minutes per chunk
101
  audio_path
102
  ]
103
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
 
119
  if nlp is None:
120
  # Fallback: simple word extraction
121
  words = text.split()
 
122
  key_words = [w for w in words if len(w) > 4 and w.isalpha()]
123
  return list(dict.fromkeys(key_words))[:top_n]
124
 
125
  try:
126
  doc = nlp(text)
127
  phrases = [chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2]
 
128
  seen = set()
129
  unique_phrases = [p for p in phrases if not (p.lower() in seen or seen.add(p.lower()))]
130
  return unique_phrases[:top_n]
 
137
  """Extract frame with timeout for HF Spaces"""
138
  try:
139
  cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", "-q:v", "2", output_path]
140
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
141
 
142
  if result.returncode != 0:
 
143
  return False
144
  return True
145
+ except (subprocess.TimeoutExpired, Exception):
 
 
 
 
146
  return False
147
 
148
 
149
  @retry_on_rate_limit
150
  def transcribe_audio(asr_pipeline, audio_path: str) -> List[Dict]:
151
+ """Transcribe audio with improved error handling"""
152
  try:
153
+ # Use the pipeline with proper parameters
154
+ result = asr_pipeline(
155
+ audio_path,
156
+ return_timestamps=True,
157
+ chunk_length_s=30,
158
+ stride_length_s=5
159
+ )
160
 
161
  if isinstance(result, dict):
162
  if "chunks" in result:
163
  return result["chunks"]
164
  else:
165
+ # Handle single result
166
+ text = result.get("text", "")
167
+ timestamps = result.get("timestamps", [(0.0, 30.0)])
168
+ if isinstance(timestamps, list) and len(timestamps) > 0:
169
+ return [{"text": text, "timestamp": timestamps[0]}]
170
+ else:
171
+ return [{"text": text, "timestamp": (0.0, 30.0)}]
172
+ elif isinstance(result, list):
173
+ # Handle list of results
174
+ segments = []
175
+ for i, item in enumerate(result):
176
+ if isinstance(item, dict):
177
+ segments.append({
178
+ "text": item.get("text", ""),
179
+ "timestamp": item.get("timestamp", (i*30, (i+1)*30))
180
+ })
181
+ return segments
182
  else:
183
+ return [{"text": str(result), "timestamp": (0.0, 30.0)}]
184
+
185
  except Exception as e:
186
  print(f"Transcription error: {str(e)}")
187
+ return [{"text": "Transcription failed", "timestamp": (0.0, 30.0)}]
188
 
189
 
190
  @retry_on_rate_limit
191
def summarize_text(summarizer_pipeline, text: str) -> str:
    """Condense *text* through the summarization pipeline.

    Blank input yields a placeholder message; texts under 10 words are
    returned unchanged; inputs over 500 words are truncated before being
    fed to the model.  If the pipeline raises or produces an empty
    summary, the (possibly truncated) input text is returned as-is.
    """
    if not text.strip():
        return "No content to summarize."

    text = text.strip()
    tokens = text.split()
    token_count = len(tokens)

    # Too little material to summarize meaningfully; hand it back verbatim.
    if token_count < 10:
        return text

    # Bound the model input.  NOTE(review): the length targets below are
    # derived from the pre-truncation word count — preserved deliberately,
    # and harmless since both targets are capped.
    if token_count > 500:
        text = " ".join(tokens[:500])

    try:
        new_token_budget = min(100, max(20, token_count // 3))
        floor_length = min(15, max(5, token_count // 8))

        response = summarizer_pipeline(
            text,
            max_new_tokens=new_token_budget,
            min_length=floor_length,
            do_sample=False,
            early_stopping=True
        )

        if isinstance(response, list) and len(response) > 0:
            candidate = response[0]["summary_text"].strip()
            if candidate:
                return candidate
        return text
    except Exception as e:
        print(f"Summarization error: {str(e)}")
        return text
230
 
231
 
232
  def format_timestamp(seconds: float) -> str:
233
+ """Format seconds into MM:SS format"""
234
  minutes = int(seconds // 60)
235
+ remaining_seconds = int(seconds % 60)
236
+ return f"{minutes:02d}:{remaining_seconds:02d}"
237
 
238
 
239
  def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
 
247
 
248
  progress(0.1, desc="Initializing models...")
249
 
250
+ # Initialize models with proper configuration
251
  try:
252
+ # Configure Whisper with proper settings
253
  asr = pipeline(
254
  "automatic-speech-recognition",
255
+ model="openai/whisper-tiny", # Use tiny model for better compatibility
256
+ device=0 if torch.cuda.is_available() else -1,
257
+ model_kwargs={
258
+ "attn_implementation": "eager" # Fix attention implementation warning
259
+ }
260
  )
261
  progress(0.2, desc="ASR model loaded...")
262
 
263
+ # Configure BART with proper settings
264
  summarizer = pipeline(
265
  "summarization",
266
+ model="facebook/bart-large-cnn",
267
+ device=0 if torch.cuda.is_available() else -1
268
  )
269
  progress(0.3, desc="Summarization model loaded...")
270
 
 
282
 
283
  progress(0.4, desc="Processing video chunks...")
284
 
285
+ # Process video with shorter chunks
286
+ chunks = chunk_video(video_file, chunk_length=180, output_dir=chunks_dir)
287
  if not chunks:
288
  return [{"error": "No video chunks were created. Video may be corrupted or unsupported format."}]
289
 
290
+ # Limit number of chunks for HF Spaces
291
+ chunks = chunks[:5] # Process max 5 chunks (15 minutes)
292
+
293
  progress(0.5, desc=f"Processing {len(chunks)} chunks...")
294
 
295
  # Process each chunk
 
301
 
302
  # Extract audio
303
  if not extract_audio(str(chunk), wav_path):
304
+ print(f"Failed to extract audio from chunk {i}")
305
  continue
306
 
307
+ # Transcribe with better error handling
308
  try:
309
  chunk_segments = transcribe_audio(asr, wav_path)
310
 
311
  # Calculate absolute timestamps
312
+ chunk_start_time = i * 180 # 180 seconds per chunk
313
 
314
  for seg in chunk_segments:
315
+ timestamp = seg.get("timestamp", (0.0, 30.0))
316
+ if isinstance(timestamp, tuple) and len(timestamp) == 2:
317
+ start_time = chunk_start_time + timestamp[0]
318
+ end_time = chunk_start_time + timestamp[1]
319
  else:
320
  start_time = chunk_start_time
321
+ end_time = chunk_start_time + 30
322
 
323
+ text = seg.get("text", "").strip()
324
+ if text: # Only add non-empty segments
325
+ all_segments.append({
326
+ "text": text,
327
+ "start": format_timestamp(start_time),
328
+ "end": format_timestamp(end_time),
329
+ "start_seconds": start_time,
330
+ "end_seconds": end_time
331
+ })
332
+
333
  except Exception as e:
334
  print(f"Error processing chunk {i}: {str(e)}")
335
  continue
336
+
337
+ # Clean up audio file immediately
338
+ try:
339
+ os.remove(wav_path)
340
+ except:
341
+ pass
342
 
343
  if not all_segments:
344
  return [{"error": "No segments were successfully processed"}]
 
348
  # Sort segments by start time
349
  all_segments.sort(key=lambda x: x["start_seconds"])
350
 
351
+ # Generate timeline (limit to 15 segments for HF Spaces)
352
  timeline = []
353
+ for i, segment in enumerate(all_segments[:15]):
354
  segment_text = segment["text"]
355
 
356
  # Generate summary
357
  try:
358
+ summary = summarize_text(summarizer, segment_text) if len(segment_text.split()) > 5 else segment_text
359
  except Exception as e:
360
+ summary = segment_text
361
 
362
  # Extract key phrases
363
  key_phrases = extract_key_phrases(segment_text) if segment_text else []
364
 
 
 
 
 
365
  timeline.append({
366
+ "segment": i + 1,
367
  "start_time": segment["start"],
368
  "end_time": segment["end"],
369
  "text": segment_text,
370
  "summary": summary,
371
+ "key_phrases": key_phrases
 
372
  })
373
 
374
  progress(1.0, desc="Processing complete!")
 
397
  - πŸ“‹ Summaries for each segment
398
  - πŸ”‘ Key phrases extraction
399
 
400
+ **Note**: Optimized for Hugging Face Spaces. Processing limited to 15 minutes of video.
401
  """)
402
 
403
  with gr.Row():
 
415
 
416
  gr.Markdown("""
417
  ### πŸ’‘ Tips:
418
+ - Videos up to 15 minutes work best
419
  - Clear audio improves transcription quality
420
+ - Processing takes 2-5 minutes
421
+ - Supported formats: MP4, AVI, MOV
422
  """)
423
 
424
  with gr.Column(scale=2):
 
436
 
437
  gr.Markdown("""
438
  ### πŸ”§ Technical Details:
439
+ - Uses Whisper (tiny) for speech recognition
440
  - BART for text summarization
441
  - spaCy for key phrase extraction
442
  - Optimized for Hugging Face Spaces environment