lochn committed on
Commit
7fd53c2
Β·
verified Β·
1 Parent(s): fc8db39

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -75
app.py CHANGED
@@ -9,6 +9,7 @@ from typing import List, Dict, Optional
9
  import spacy
10
  import gradio as gr
11
  from transformers import pipeline
 
12
 
13
  # β€”β€”β€” spaCy setup for HF Spaces β€”β€”β€”
14
  def setup_spacy():
@@ -25,13 +26,12 @@ def setup_spacy():
25
  return nlp
26
  except Exception as e:
27
  print(f"Failed to download spaCy model: {e}")
28
- # Return None if spaCy fails - we'll handle this gracefully
29
  return None
30
 
31
  nlp = setup_spacy()
32
 
33
 
34
- def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
35
  def wrapper(*args, **kwargs):
36
  delay = initial_delay
37
  for attempt in range(max_retries):
@@ -61,7 +61,7 @@ def check_ffmpeg():
61
  return False
62
 
63
 
64
- def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = None) -> List[Path]:
65
  """Chunk video with temporary directory handling for HF Spaces"""
66
  if output_dir is None:
67
  output_dir = tempfile.mkdtemp(prefix="chunks_")
@@ -73,7 +73,7 @@ def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = None
73
  cmd = [
74
  "ffmpeg", "-y", "-i", input_path,
75
  "-f", "segment", "-segment_time", str(chunk_length),
76
- "-reset_timestamps", "1", "-c", "copy", # Use copy to avoid re-encoding
77
  output_pattern
78
  ]
79
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
@@ -97,6 +97,7 @@ def extract_audio(video_path: str, audio_path: str) -> bool:
97
  cmd = [
98
  "ffmpeg", "-y", "-i", video_path,
99
  "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
 
100
  audio_path
101
  ]
102
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
@@ -118,14 +119,12 @@ def extract_key_phrases(text: str, top_n: int = 5) -> List[str]:
118
  if nlp is None:
119
  # Fallback: simple word extraction
120
  words = text.split()
121
- # Get longer words as "key phrases"
122
  key_words = [w for w in words if len(w) > 4 and w.isalpha()]
123
  return list(dict.fromkeys(key_words))[:top_n]
124
 
125
  try:
126
  doc = nlp(text)
127
  phrases = [chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2]
128
- # Remove duplicates while preserving order
129
  seen = set()
130
  unique_phrases = [p for p in phrases if not (p.lower() in seen or seen.add(p.lower()))]
131
  return unique_phrases[:top_n]
@@ -138,79 +137,103 @@ def extract_frame(video_path: str, timestamp: str, output_path: str) -> bool:
138
  """Extract frame with timeout for HF Spaces"""
139
  try:
140
  cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", "-q:v", "2", output_path]
141
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
142
 
143
  if result.returncode != 0:
144
- print(f"Frame extraction error: {result.stderr}")
145
  return False
146
  return True
147
- except subprocess.TimeoutExpired:
148
- print("Frame extraction timed out")
149
- return False
150
- except Exception as e:
151
- print(f"Error extracting frame: {str(e)}")
152
  return False
153
 
154
 
155
  @retry_on_rate_limit
156
  def transcribe_audio(asr_pipeline, audio_path: str) -> List[Dict]:
157
- """Transcribe audio with better error handling"""
158
  try:
159
- result = asr_pipeline(audio_path)
 
 
 
 
 
 
160
 
161
  if isinstance(result, dict):
162
  if "chunks" in result:
163
  return result["chunks"]
164
  else:
165
- return [{"text": result.get("text", ""), "timestamp": (0.0, 0.0)}]
166
- elif isinstance(result, str):
167
- return [{"text": result, "timestamp": (0.0, 0.0)}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  else:
169
- return [{"text": str(result), "timestamp": (0.0, 0.0)}]
 
170
  except Exception as e:
171
  print(f"Transcription error: {str(e)}")
172
- return [{"text": "Transcription failed", "timestamp": (0.0, 0.0)}]
173
 
174
 
175
  @retry_on_rate_limit
176
  def summarize_text(summarizer_pipeline, text: str) -> str:
177
- """Summarize text with length constraints for HF Spaces"""
178
  if not text.strip():
179
  return "No content to summarize."
180
 
181
- # Truncate text if too long for the model
182
- max_length = 1024 # BART's max input length
183
- if len(text) > max_length:
184
- text = text[:max_length]
 
 
 
 
 
 
 
185
 
186
  try:
187
- # Adjust parameters for shorter text
188
- min_len = min(30, len(text.split()) // 4)
189
- max_len = min(200, len(text.split()) // 2)
190
-
191
- if min_len >= max_len:
192
- min_len = max(10, max_len - 10)
193
 
194
  result = summarizer_pipeline(
195
  text,
196
- max_length=max_len,
197
- min_length=min_len,
198
- do_sample=False
 
199
  )
200
 
201
  if isinstance(result, list) and len(result) > 0:
202
- return result[0]["summary_text"].strip()
203
- return "Failed to generate summary."
 
 
204
  except Exception as e:
205
  print(f"Summarization error: {str(e)}")
206
- return f"Summary generation failed: {str(e)}"
207
 
208
 
209
  def format_timestamp(seconds: float) -> str:
210
- """Format seconds into MM:SS.mmm format"""
211
  minutes = int(seconds // 60)
212
- remaining_seconds = seconds % 60
213
- return f"{minutes:02d}:{remaining_seconds:06.3f}"
214
 
215
 
216
  def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
@@ -224,20 +247,24 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
224
 
225
  progress(0.1, desc="Initializing models...")
226
 
227
- # Initialize models with error handling
228
  try:
 
229
  asr = pipeline(
230
  "automatic-speech-recognition",
231
- model="openai/whisper-base", # Use smaller model for HF Spaces
232
- chunk_length_s=30,
233
- stride_length_s=(4, 2),
234
- return_timestamps="word"
 
235
  )
236
  progress(0.2, desc="ASR model loaded...")
237
 
 
238
  summarizer = pipeline(
239
  "summarization",
240
- model="facebook/bart-large-cnn"
 
241
  )
242
  progress(0.3, desc="Summarization model loaded...")
243
 
@@ -255,11 +282,14 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
255
 
256
  progress(0.4, desc="Processing video chunks...")
257
 
258
- # Process video - use shorter chunks for HF Spaces
259
- chunks = chunk_video(video_file, chunk_length=120, output_dir=chunks_dir)
260
  if not chunks:
261
  return [{"error": "No video chunks were created. Video may be corrupted or unsupported format."}]
262
 
 
 
 
263
  progress(0.5, desc=f"Processing {len(chunks)} chunks...")
264
 
265
  # Process each chunk
@@ -271,33 +301,44 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
271
 
272
  # Extract audio
273
  if not extract_audio(str(chunk), wav_path):
 
274
  continue
275
 
276
- # Transcribe
277
  try:
278
  chunk_segments = transcribe_audio(asr, wav_path)
279
 
280
  # Calculate absolute timestamps
281
- chunk_start_time = i * 120 # 120 seconds per chunk
282
 
283
  for seg in chunk_segments:
284
- if isinstance(seg.get("timestamp"), tuple) and len(seg["timestamp"]) == 2:
285
- start_time = chunk_start_time + seg["timestamp"][0]
286
- end_time = chunk_start_time + seg["timestamp"][1]
 
287
  else:
288
  start_time = chunk_start_time
289
- end_time = chunk_start_time + 120
290
 
291
- all_segments.append({
292
- "text": seg.get("text", ""),
293
- "start": format_timestamp(start_time),
294
- "end": format_timestamp(end_time),
295
- "start_seconds": start_time,
296
- "end_seconds": end_time
297
- })
 
 
 
298
  except Exception as e:
299
  print(f"Error processing chunk {i}: {str(e)}")
300
  continue
 
 
 
 
 
 
301
 
302
  if not all_segments:
303
  return [{"error": "No segments were successfully processed"}]
@@ -307,31 +348,27 @@ def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
307
  # Sort segments by start time
308
  all_segments.sort(key=lambda x: x["start_seconds"])
309
 
310
- # Generate timeline
311
  timeline = []
312
- for i, segment in enumerate(all_segments[:20]): # Limit to 20 segments for HF Spaces
313
  segment_text = segment["text"]
314
 
315
  # Generate summary
316
  try:
317
- summary = summarize_text(summarizer, segment_text) if segment_text else "No content"
318
  except Exception as e:
319
- summary = f"Summary failed: {str(e)}"
320
 
321
  # Extract key phrases
322
  key_phrases = extract_key_phrases(segment_text) if segment_text else []
323
 
324
- # Extract frame (optional, may fail in HF Spaces)
325
- frame_path = os.path.join(frames_dir, f"frame_{i:03d}.jpg")
326
- frame_extracted = extract_frame(video_file, segment["start"], frame_path)
327
-
328
  timeline.append({
 
329
  "start_time": segment["start"],
330
  "end_time": segment["end"],
331
  "text": segment_text,
332
  "summary": summary,
333
- "key_phrases": key_phrases,
334
- "frame_available": frame_extracted
335
  })
336
 
337
  progress(1.0, desc="Processing complete!")
@@ -360,7 +397,7 @@ def create_interface():
360
  - πŸ“‹ Summaries for each segment
361
  - πŸ”‘ Key phrases extraction
362
 
363
- **Note**: This runs on Hugging Face Spaces with limited resources. Processing may take time for longer videos.
364
  """)
365
 
366
  with gr.Row():
@@ -378,9 +415,10 @@ def create_interface():
378
 
379
  gr.Markdown("""
380
  ### πŸ’‘ Tips:
381
- - Shorter videos (< 10 minutes) work best
382
  - Clear audio improves transcription quality
383
- - Processing may take 2-5 minutes depending on video length
 
384
  """)
385
 
386
  with gr.Column(scale=2):
@@ -398,7 +436,7 @@ def create_interface():
398
 
399
  gr.Markdown("""
400
  ### πŸ”§ Technical Details:
401
- - Uses Whisper (base) for speech recognition
402
  - BART for text summarization
403
  - spaCy for key phrase extraction
404
  - Optimized for Hugging Face Spaces environment
 
9
  import spacy
10
  import gradio as gr
11
  from transformers import pipeline
12
+ import torch
13
 
14
  # β€”β€”β€” spaCy setup for HF Spaces β€”β€”β€”
15
  def setup_spacy():
 
26
  return nlp
27
  except Exception as e:
28
  print(f"Failed to download spaCy model: {e}")
 
29
  return None
30
 
31
  nlp = setup_spacy()
32
 
33
 
34
+ def retry_on_rate_limit(func, max_retries=2, initial_delay=3, backoff=1.5):
35
  def wrapper(*args, **kwargs):
36
  delay = initial_delay
37
  for attempt in range(max_retries):
 
61
  return False
62
 
63
 
64
+ def chunk_video(input_path: str, chunk_length: int = 180, output_dir: str = None) -> List[Path]:
65
  """Chunk video with temporary directory handling for HF Spaces"""
66
  if output_dir is None:
67
  output_dir = tempfile.mkdtemp(prefix="chunks_")
 
73
  cmd = [
74
  "ffmpeg", "-y", "-i", input_path,
75
  "-f", "segment", "-segment_time", str(chunk_length),
76
+ "-reset_timestamps", "1", "-c", "copy",
77
  output_pattern
78
  ]
79
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
 
97
  cmd = [
98
  "ffmpeg", "-y", "-i", video_path,
99
  "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
100
+ "-t", "180", # Limit to 3 minutes per chunk
101
  audio_path
102
  ]
103
  result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
 
119
  if nlp is None:
120
  # Fallback: simple word extraction
121
  words = text.split()
 
122
  key_words = [w for w in words if len(w) > 4 and w.isalpha()]
123
  return list(dict.fromkeys(key_words))[:top_n]
124
 
125
  try:
126
  doc = nlp(text)
127
  phrases = [chunk.text.strip() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 2]
 
128
  seen = set()
129
  unique_phrases = [p for p in phrases if not (p.lower() in seen or seen.add(p.lower()))]
130
  return unique_phrases[:top_n]
 
137
  """Extract frame with timeout for HF Spaces"""
138
  try:
139
  cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", "-q:v", "2", output_path]
140
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
141
 
142
  if result.returncode != 0:
 
143
  return False
144
  return True
145
+ except (subprocess.TimeoutExpired, Exception):
 
 
 
 
146
  return False
147
 
148
 
149
  @retry_on_rate_limit
150
  def transcribe_audio(asr_pipeline, audio_path: str) -> List[Dict]:
151
+ """Transcribe audio with improved error handling"""
152
  try:
153
+ # Use the pipeline with proper parameters
154
+ result = asr_pipeline(
155
+ audio_path,
156
+ return_timestamps=True,
157
+ chunk_length_s=30,
158
+ stride_length_s=5
159
+ )
160
 
161
  if isinstance(result, dict):
162
  if "chunks" in result:
163
  return result["chunks"]
164
  else:
165
+ # Handle single result
166
+ text = result.get("text", "")
167
+ timestamps = result.get("timestamps", [(0.0, 30.0)])
168
+ if isinstance(timestamps, list) and len(timestamps) > 0:
169
+ return [{"text": text, "timestamp": timestamps[0]}]
170
+ else:
171
+ return [{"text": text, "timestamp": (0.0, 30.0)}]
172
+ elif isinstance(result, list):
173
+ # Handle list of results
174
+ segments = []
175
+ for i, item in enumerate(result):
176
+ if isinstance(item, dict):
177
+ segments.append({
178
+ "text": item.get("text", ""),
179
+ "timestamp": item.get("timestamp", (i*30, (i+1)*30))
180
+ })
181
+ return segments
182
  else:
183
+ return [{"text": str(result), "timestamp": (0.0, 30.0)}]
184
+
185
  except Exception as e:
186
  print(f"Transcription error: {str(e)}")
187
+ return [{"text": "Transcription failed", "timestamp": (0.0, 30.0)}]
188
 
189
 
190
  @retry_on_rate_limit
191
def summarize_text(summarizer_pipeline, text: str) -> str:
    """Condense *text* through the summarization pipeline.

    Blank input yields a placeholder message; texts under 10 words are
    returned unchanged; inputs over 500 words are truncated before being
    fed to the model.  If the pipeline raises or produces an empty
    summary, the (possibly truncated) input text is returned as-is.
    """
    if not text.strip():
        return "No content to summarize."

    text = text.strip()
    tokens = text.split()
    token_count = len(tokens)

    # Too little material to summarize meaningfully; hand it back verbatim.
    if token_count < 10:
        return text

    # Bound the model input.  NOTE(review): the length targets below are
    # derived from the pre-truncation word count — preserved deliberately,
    # and harmless since both targets are capped.
    if token_count > 500:
        text = " ".join(tokens[:500])

    try:
        new_token_budget = min(100, max(20, token_count // 3))
        floor_length = min(15, max(5, token_count // 8))

        response = summarizer_pipeline(
            text,
            max_new_tokens=new_token_budget,
            min_length=floor_length,
            do_sample=False,
            early_stopping=True
        )

        if isinstance(response, list) and len(response) > 0:
            candidate = response[0]["summary_text"].strip()
            if candidate:
                return candidate
        return text
    except Exception as e:
        print(f"Summarization error: {str(e)}")
        return text
230
 
231
 
232
  def format_timestamp(seconds: float) -> str:
233
+ """Format seconds into MM:SS format"""
234
  minutes = int(seconds // 60)
235
+ remaining_seconds = int(seconds % 60)
236
+ return f"{minutes:02d}:{remaining_seconds:02d}"
237
 
238
 
239
  def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
 
247
 
248
  progress(0.1, desc="Initializing models...")
249
 
250
+ # Initialize models with proper configuration
251
  try:
252
+ # Configure Whisper with proper settings
253
  asr = pipeline(
254
  "automatic-speech-recognition",
255
+ model="openai/whisper-tiny", # Use tiny model for better compatibility
256
+ device=0 if torch.cuda.is_available() else -1,
257
+ model_kwargs={
258
+ "attn_implementation": "eager" # Fix attention implementation warning
259
+ }
260
  )
261
  progress(0.2, desc="ASR model loaded...")
262
 
263
+ # Configure BART with proper settings
264
  summarizer = pipeline(
265
  "summarization",
266
+ model="facebook/bart-large-cnn",
267
+ device=0 if torch.cuda.is_available() else -1
268
  )
269
  progress(0.3, desc="Summarization model loaded...")
270
 
 
282
 
283
  progress(0.4, desc="Processing video chunks...")
284
 
285
+ # Process video with shorter chunks
286
+ chunks = chunk_video(video_file, chunk_length=180, output_dir=chunks_dir)
287
  if not chunks:
288
  return [{"error": "No video chunks were created. Video may be corrupted or unsupported format."}]
289
 
290
+ # Limit number of chunks for HF Spaces
291
+ chunks = chunks[:5] # Process max 5 chunks (15 minutes)
292
+
293
  progress(0.5, desc=f"Processing {len(chunks)} chunks...")
294
 
295
  # Process each chunk
 
301
 
302
  # Extract audio
303
  if not extract_audio(str(chunk), wav_path):
304
+ print(f"Failed to extract audio from chunk {i}")
305
  continue
306
 
307
+ # Transcribe with better error handling
308
  try:
309
  chunk_segments = transcribe_audio(asr, wav_path)
310
 
311
  # Calculate absolute timestamps
312
+ chunk_start_time = i * 180 # 180 seconds per chunk
313
 
314
  for seg in chunk_segments:
315
+ timestamp = seg.get("timestamp", (0.0, 30.0))
316
+ if isinstance(timestamp, tuple) and len(timestamp) == 2:
317
+ start_time = chunk_start_time + timestamp[0]
318
+ end_time = chunk_start_time + timestamp[1]
319
  else:
320
  start_time = chunk_start_time
321
+ end_time = chunk_start_time + 30
322
 
323
+ text = seg.get("text", "").strip()
324
+ if text: # Only add non-empty segments
325
+ all_segments.append({
326
+ "text": text,
327
+ "start": format_timestamp(start_time),
328
+ "end": format_timestamp(end_time),
329
+ "start_seconds": start_time,
330
+ "end_seconds": end_time
331
+ })
332
+
333
  except Exception as e:
334
  print(f"Error processing chunk {i}: {str(e)}")
335
  continue
336
+
337
+ # Clean up audio file immediately
338
+ try:
339
+ os.remove(wav_path)
340
+ except:
341
+ pass
342
 
343
  if not all_segments:
344
  return [{"error": "No segments were successfully processed"}]
 
348
  # Sort segments by start time
349
  all_segments.sort(key=lambda x: x["start_seconds"])
350
 
351
+ # Generate timeline (limit to 15 segments for HF Spaces)
352
  timeline = []
353
+ for i, segment in enumerate(all_segments[:15]):
354
  segment_text = segment["text"]
355
 
356
  # Generate summary
357
  try:
358
+ summary = summarize_text(summarizer, segment_text) if len(segment_text.split()) > 5 else segment_text
359
  except Exception as e:
360
+ summary = segment_text
361
 
362
  # Extract key phrases
363
  key_phrases = extract_key_phrases(segment_text) if segment_text else []
364
 
 
 
 
 
365
  timeline.append({
366
+ "segment": i + 1,
367
  "start_time": segment["start"],
368
  "end_time": segment["end"],
369
  "text": segment_text,
370
  "summary": summary,
371
+ "key_phrases": key_phrases
 
372
  })
373
 
374
  progress(1.0, desc="Processing complete!")
 
397
  - πŸ“‹ Summaries for each segment
398
  - πŸ”‘ Key phrases extraction
399
 
400
+ **Note**: Optimized for Hugging Face Spaces. Processing limited to 15 minutes of video.
401
  """)
402
 
403
  with gr.Row():
 
415
 
416
  gr.Markdown("""
417
  ### πŸ’‘ Tips:
418
+ - Videos up to 15 minutes work best
419
  - Clear audio improves transcription quality
420
+ - Processing takes 2-5 minutes
421
+ - Supported formats: MP4, AVI, MOV
422
  """)
423
 
424
  with gr.Column(scale=2):
 
436
 
437
  gr.Markdown("""
438
  ### πŸ”§ Technical Details:
439
+ - Uses Whisper (tiny) for speech recognition
440
  - BART for text summarization
441
  - spaCy for key phrase extraction
442
  - Optimized for Hugging Face Spaces environment