lochn committed on
Commit
fc8db39
Β·
verified Β·
1 Parent(s): 7b1ae93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +369 -120
app.py CHANGED
@@ -1,24 +1,34 @@
1
  import os
2
  import subprocess
3
  import time
 
 
4
  from pathlib import Path
 
5
 
6
  import spacy
7
  import gradio as gr
8
- from dotenv import load_dotenv
9
- from huggingface_hub import login
10
  from transformers import pipeline
11
 
12
- # β€”β€”β€” Load environment variables β€”β€”β€”
13
- load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # β€”β€”β€” spaCy setup β€”β€”β€”
16
- try:
17
- nlp = spacy.load("en_core_web_sm")
18
- except OSError:
19
- from spacy.cli import download as spacy_download
20
- spacy_download("en_core_web_sm")
21
- nlp = spacy.load("en_core_web_sm")
22
 
23
 
24
  def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
@@ -28,136 +38,375 @@ def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
28
  try:
29
  return func(*args, **kwargs)
30
  except Exception as e:
31
- if attempt < max_retries - 1:
32
- print(f"Rate limit or error, retrying in {delay}s…")
33
- time.sleep(delay)
34
- delay *= backoff
 
 
 
 
35
  else:
36
- print("Maximum retries reached; aborting.")
37
  raise
38
  return wrapper
39
 
40
 
41
- def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = "chunks") -> list[Path]:
42
- Path(output_dir).mkdir(exist_ok=True)
43
- output_pattern = os.path.join(output_dir, "chunk_%03d.mp4")
44
- cmd = [
45
- "ffmpeg", "-y", "-i", input_path,
46
- "-f", "segment", "-segment_time", str(chunk_length),
47
- "-reset_timestamps", "1", output_pattern
48
- ]
49
- subprocess.run(cmd, check=True)
50
- return sorted(Path(output_dir).glob("chunk_*.mp4"))
51
 
52
 
53
- def extract_audio(video_path: str, audio_path: str) -> None:
54
- cmd = [
55
- "ffmpeg", "-y", "-i", video_path,
56
- "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
57
- audio_path
58
- ]
59
- subprocess.run(cmd, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
 
62
- def segment_text(segments: list[dict]) -> list[str]:
63
- full = "\n\n".join(seg.get("text", "") for seg in segments)
64
- return [b.strip() for b in full.split("\n\n") if b.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
- def extract_key_phrases(text: str, top_n=5) -> list[str]:
68
- doc = nlp(text)
69
- phrases = [chunk.text for chunk in doc.noun_chunks]
70
- return list(dict.fromkeys(phrases))[:top_n]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
- def extract_frame(video_path: str, timestamp: str, output_path: str) -> None:
74
- cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", output_path]
75
- subprocess.run(cmd, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
 
78
  @retry_on_rate_limit
79
- def transcribe_audio(asr_pipeline, audio_path: str) -> list[dict]:
80
- result = asr_pipeline(audio_path)
81
- return result.get("chunks", [{"text": result["text"], "timestamp": (0.0, 0.0)}])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
  @retry_on_rate_limit
85
  def summarize_text(summarizer_pipeline, text: str) -> str:
86
- out = summarizer_pipeline(
87
- text,
88
- max_length=200,
89
- min_length=30,
90
- do_sample=False
91
- )
92
- return out[0]["summary_text"].strip()
93
-
94
-
95
- def run_pipeline(video_file: str) -> list[dict]:
96
- hf_token = os.getenv("HF_TOKEN")
97
- if not hf_token:
98
- raise EnvironmentError("HF_TOKEN environment variable is not set.")
99
- login(token=hf_token)
100
-
101
- asr = pipeline(
102
- "automatic-speech-recognition",
103
- model="openai/whisper-large-v2",
104
- chunk_length_s=30,
105
- stride_length_s=(5, 5),
106
- return_timestamps="sentence",
107
- token=hf_token
108
- )
109
- summarizer = pipeline(
110
- "summarization",
111
- model="facebook/bart-large-cnn",
112
- token=hf_token
113
- )
114
-
115
- chunks = chunk_video(video_file)
116
-
117
- segments = []
118
- for chunk in chunks:
119
- wav = str(chunk).replace(".mp4", ".wav")
120
- extract_audio(str(chunk), wav)
121
- for c in transcribe_audio(asr, wav):
122
- segments.append({
123
- "text": c["text"],
124
- "start": f"{int(c['timestamp'][0]//60):02d}:{c['timestamp'][0]%60:06.3f}",
125
- "end": f"{int(c['timestamp'][1]//60):02d}:{c['timestamp'][1]%60:06.3f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- blocks = segment_text(segments)
129
- summaries = [summarize_text(summarizer, b) for b in blocks]
130
- phrases = [extract_key_phrases(b) for b in blocks]
131
-
132
- Path("frames").mkdir(exist_ok=True)
133
- frames = []
134
- for seg in segments:
135
- ts_clean = seg["start"].replace(":", "-")
136
- out = f"frames/frame_{ts_clean}.jpg"
137
- extract_frame(video_file, seg["start"], out)
138
- frames.append(out)
139
-
140
- timeline = []
141
- for seg, sumry, ph, fr in zip(segments, summaries, phrases, frames):
142
- timeline.append({
143
- "start_time": seg["start"],
144
- "end_time": seg["end"],
145
- "summary": sumry,
146
- "key_phrases": ph,
147
- "frame": fr
148
- })
149
-
150
- return timeline
151
-
152
-
153
- # β€”β€”β€” Gradio UI β€”β€”β€”
154
- demo = gr.Blocks()
155
- with demo:
156
- gr.Markdown("# Lecture Capture AI Pipeline (HF-powered)")
157
- vid = gr.Video(label="Lecture Video")
158
- btn = gr.Button("Process")
159
- out = gr.JSON(label="Timeline")
160
- btn.click(fn=run_pipeline, inputs=[vid], outputs=out)
161
 
162
  if __name__ == "__main__":
163
- demo.launch()
 
 
1
  import os
2
  import subprocess
3
  import time
4
+ import tempfile
5
+ import shutil
6
  from pathlib import Path
7
+ from typing import List, Dict, Optional
8
 
9
  import spacy
10
  import gradio as gr
 
 
11
  from transformers import pipeline
12
 
13
# β€”β€”β€” spaCy setup for HF Spaces β€”β€”β€”
def setup_spacy():
    """Load the spaCy English model, downloading it on first use.

    Returns the loaded pipeline, or None when neither loading nor
    downloading succeeds, so callers can degrade gracefully.
    """
    model_name = "en_core_web_sm"
    try:
        return spacy.load(model_name)
    except OSError:
        # Model not installed yet — fall through to the download path.
        pass
    print("Downloading spaCy model...")
    try:
        from spacy.cli import download as spacy_download
        spacy_download(model_name)
        return spacy.load(model_name)
    except Exception as e:
        print(f"Failed to download spaCy model: {e}")
        return None

# Module-level pipeline handle; None when spaCy could not be set up.
nlp = setup_spacy()
 
 
 
 
 
 
32
 
33
 
34
  def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
 
38
  try:
39
  return func(*args, **kwargs)
40
  except Exception as e:
41
+ if "rate limit" in str(e).lower() or "429" in str(e):
42
+ if attempt < max_retries - 1:
43
+ print(f"Rate limit detected, retrying in {delay}s...")
44
+ time.sleep(delay)
45
+ delay *= backoff
46
+ else:
47
+ print("Maximum retries reached for rate limit.")
48
+ raise
49
  else:
50
+ # For non-rate-limit errors, raise immediately
51
  raise
52
  return wrapper
53
 
54
 
55
def check_ffmpeg():
    """Return True when the `ffmpeg` binary can be invoked, else False."""
    probe_cmd = ["ffmpeg", "-version"]
    try:
        subprocess.run(probe_cmd, capture_output=True, check=True)
    except FileNotFoundError:
        # Binary not on PATH at all.
        return False
    except subprocess.CalledProcessError:
        # Binary exists but exited non-zero.
        return False
    return True
 
 
 
62
 
63
 
64
def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = None,
                timeout: int = 300) -> List[Path]:
    """Split a video into fixed-length segments without re-encoding.

    Parameters:
        input_path: path of the source video.
        chunk_length: target segment duration in seconds.
        output_dir: destination for chunk files; a fresh temp dir is
            created when None.
        timeout: seconds to wait for ffmpeg before giving up (new
            parameter; defaults to the previous hard-coded 300).

    Returns the sorted list of chunk paths, or [] on any failure
    (ffmpeg error, timeout, unreadable input) so callers can bail out.
    """
    if output_dir is None:
        output_dir = tempfile.mkdtemp(prefix="chunks_")

    # BUGFIX: parents=True so a nested, not-yet-existing output_dir does not
    # raise FileNotFoundError before ffmpeg even runs (mkdir was previously
    # outside the try block, so that error escaped uncaught).
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    output_pattern = os.path.join(output_dir, "chunk_%03d.mp4")

    try:
        cmd = [
            "ffmpeg", "-y", "-i", input_path,
            "-f", "segment", "-segment_time", str(chunk_length),
            "-reset_timestamps", "1", "-c", "copy",  # stream copy: no re-encoding
            output_pattern
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

        if result.returncode != 0:
            print(f"FFmpeg error: {result.stderr}")
            return []

        return sorted(Path(output_dir).glob("chunk_*.mp4"))
    except subprocess.TimeoutExpired:
        print("Video chunking timed out")
        return []
    except Exception as e:
        print(f"Error chunking video: {str(e)}")
        return []
92
 
93
 
94
def extract_audio(video_path: str, audio_path: str) -> bool:
    """Extract the audio track to a 16 kHz mono PCM WAV file.

    Returns True on success, False on ffmpeg failure, timeout, or any
    other error (all failures are logged, never raised).
    """
    cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
        audio_path,
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        print("Audio extraction timed out")
        return False
    except Exception as e:
        print(f"Error extracting audio: {str(e)}")
        return False
    if proc.returncode == 0:
        return True
    print(f"Audio extraction error: {proc.stderr}")
    return False
114
 
115
 
116
def extract_key_phrases(text: str, top_n: int = 5) -> List[str]:
    """Extract up to top_n key phrases, with a fallback when spaCy is absent."""
    if nlp is None:
        # spaCy unavailable: approximate key phrases with longer alphabetic words,
        # deduplicated in first-seen order.
        candidates = [w for w in text.split() if len(w) > 4 and w.isalpha()]
        return list(dict.fromkeys(candidates))[:top_n]

    try:
        parsed = nlp(text)
        unique_phrases = []
        seen_lower = set()
        # Collect noun chunks longer than 2 chars, case-insensitively deduped,
        # preserving first-occurrence order.
        for noun_chunk in parsed.noun_chunks:
            phrase = noun_chunk.text.strip()
            if len(phrase) <= 2:
                continue
            key = phrase.lower()
            if key in seen_lower:
                continue
            seen_lower.add(key)
            unique_phrases.append(phrase)
        return unique_phrases[:top_n]
    except Exception as e:
        print(f"Error extracting key phrases: {str(e)}")
        return []
135
 
136
 
137
def extract_frame(video_path: str, timestamp: str, output_path: str) -> bool:
    """Grab a single still frame at `timestamp` into `output_path`.

    Returns True on success; logs and returns False on ffmpeg failure,
    timeout, or any other error.
    """
    grab_cmd = [
        "ffmpeg", "-y", "-i", video_path,
        "-ss", timestamp, "-frames:v", "1", "-q:v", "2",
        output_path,
    ]
    try:
        proc = subprocess.run(grab_cmd, capture_output=True, text=True, timeout=30)
    except subprocess.TimeoutExpired:
        print("Frame extraction timed out")
        return False
    except Exception as e:
        print(f"Error extracting frame: {str(e)}")
        return False
    if proc.returncode != 0:
        print(f"Frame extraction error: {proc.stderr}")
        return False
    return True
153
 
154
 
155
@retry_on_rate_limit
def transcribe_audio(asr_pipeline, audio_path: str) -> List[Dict]:
    """Transcribe an audio file, normalizing output to segment dicts.

    Returns a list of {"text": str, "timestamp": (start, end)} dicts.
    Rate-limit errors are re-raised so @retry_on_rate_limit can retry;
    every other failure degrades to a single placeholder segment.
    """
    try:
        result = asr_pipeline(audio_path)

        if isinstance(result, dict):
            if "chunks" in result:
                return result["chunks"]
            else:
                return [{"text": result.get("text", ""), "timestamp": (0.0, 0.0)}]
        elif isinstance(result, str):
            return [{"text": result, "timestamp": (0.0, 0.0)}]
        else:
            return [{"text": str(result), "timestamp": (0.0, 0.0)}]
    except Exception as e:
        # BUGFIX: previously every exception was swallowed here, which made
        # the @retry_on_rate_limit decorator dead code. Re-raise rate-limit
        # errors (same heuristic the decorator uses) so retries can happen.
        if "rate limit" in str(e).lower() or "429" in str(e):
            raise
        print(f"Transcription error: {str(e)}")
        return [{"text": "Transcription failed", "timestamp": (0.0, 0.0)}]
173
 
174
 
175
@retry_on_rate_limit
def summarize_text(summarizer_pipeline, text: str) -> str:
    """Summarize text with length constraints for HF Spaces.

    Returns a short summary string; falls back to explanatory messages on
    empty input or summarizer failure. Rate-limit errors are re-raised so
    @retry_on_rate_limit can retry them.
    """
    if not text.strip():
        return "No content to summarize."

    # Truncate text if too long for the model.
    # NOTE(review): this limit is in characters while BART's input limit is
    # measured in tokens (~1024); character truncation is a conservative
    # approximation — confirm acceptable quality loss.
    max_length = 1024
    if len(text) > max_length:
        text = text[:max_length]

    try:
        # Scale generation bounds to the input size so short inputs don't
        # request summaries longer than the text itself.
        word_count = len(text.split())
        min_len = min(30, word_count // 4)
        max_len = min(200, word_count // 2)

        # BUGFIX: for a one-word input the old re-clamp produced
        # min_length=10 > max_length=0; keep a small floor on max_len and
        # always leave min_len strictly below it.
        max_len = max(max_len, 5)
        if min_len >= max_len:
            min_len = max(1, max_len - 1)

        result = summarizer_pipeline(
            text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False
        )

        if isinstance(result, list) and len(result) > 0:
            return result[0]["summary_text"].strip()
        return "Failed to generate summary."
    except Exception as e:
        # BUGFIX: re-raise rate-limit errors so the retry decorator is not
        # dead code (it previously never saw any exception from here).
        if "rate limit" in str(e).lower() or "429" in str(e):
            raise
        print(f"Summarization error: {str(e)}")
        return f"Summary generation failed: {str(e)}"
207
+
208
+
209
def format_timestamp(seconds: float) -> str:
    """Format seconds into MM:SS.mmm format (minutes are not capped at 59)."""
    minutes, secs = divmod(seconds, 60)
    return f"{int(minutes):02d}:{secs:06.3f}"
214
+
215
+
216
def run_pipeline(video_file: str, progress=gr.Progress()) -> List[Dict]:
    """Main pipeline function optimized for HF Spaces.

    Orchestrates: ffmpeg check -> model loading -> video chunking ->
    per-chunk audio extraction + transcription -> summarization and key
    phrase extraction -> timeline assembly. All failures are returned as
    [{"error": ...}] dicts rather than raised, so the Gradio JSON output
    always renders something.
    """
    if not video_file:
        return [{"error": "No video file provided"}]

    # Check if ffmpeg is available
    if not check_ffmpeg():
        return [{"error": "FFmpeg is not available in this environment"}]

    progress(0.1, desc="Initializing models...")

    # Initialize models with error handling
    try:
        asr = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",  # Use smaller model for HF Spaces
            chunk_length_s=30,
            stride_length_s=(4, 2),
            return_timestamps="word"
        )
        progress(0.2, desc="ASR model loaded...")

        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn"
        )
        progress(0.3, desc="Summarization model loaded...")

    except Exception as e:
        return [{"error": f"Failed to load models: {str(e)}"}]

    # Create temporary directories; everything under temp_dir is deleted in
    # the finally block below.
    temp_dir = tempfile.mkdtemp(prefix="lecture_capture_")
    chunks_dir = os.path.join(temp_dir, "chunks")
    frames_dir = os.path.join(temp_dir, "frames")

    try:
        Path(chunks_dir).mkdir(exist_ok=True)
        Path(frames_dir).mkdir(exist_ok=True)

        progress(0.4, desc="Processing video chunks...")

        # Process video - use shorter chunks for HF Spaces
        chunks = chunk_video(video_file, chunk_length=120, output_dir=chunks_dir)
        if not chunks:
            return [{"error": "No video chunks were created. Video may be corrupted or unsupported format."}]

        progress(0.5, desc=f"Processing {len(chunks)} chunks...")

        # Process each chunk
        all_segments = []
        for i, chunk in enumerate(chunks):
            progress(0.5 + (0.3 * i / len(chunks)), desc=f"Processing chunk {i+1}/{len(chunks)}...")

            # Derive the .wav path from chunk_video's "chunk_NNN.mp4" names.
            wav_path = str(chunk).replace(".mp4", ".wav")

            # Extract audio; skip the chunk entirely on failure.
            if not extract_audio(str(chunk), wav_path):
                continue

            # Transcribe
            try:
                chunk_segments = transcribe_audio(asr, wav_path)

                # Calculate absolute timestamps.
                # Keep this constant in sync with chunk_length=120 passed to
                # chunk_video above — the offset math depends on it.
                chunk_start_time = i * 120  # 120 seconds per chunk

                for seg in chunk_segments:
                    if isinstance(seg.get("timestamp"), tuple) and len(seg["timestamp"]) == 2:
                        start_time = chunk_start_time + seg["timestamp"][0]
                        end_time = chunk_start_time + seg["timestamp"][1]
                    else:
                        # Missing/odd timestamp: attribute the segment to the
                        # whole chunk span.
                        start_time = chunk_start_time
                        end_time = chunk_start_time + 120

                    all_segments.append({
                        "text": seg.get("text", ""),
                        "start": format_timestamp(start_time),
                        "end": format_timestamp(end_time),
                        # Raw seconds kept alongside formatted strings for sorting.
                        "start_seconds": start_time,
                        "end_seconds": end_time
                    })
            except Exception as e:
                print(f"Error processing chunk {i}: {str(e)}")
                continue

        if not all_segments:
            return [{"error": "No segments were successfully processed"}]

        progress(0.8, desc="Generating summaries and extracting key phrases...")

        # Sort segments by start time
        all_segments.sort(key=lambda x: x["start_seconds"])

        # Generate timeline
        timeline = []
        for i, segment in enumerate(all_segments[:20]):  # Limit to 20 segments for HF Spaces
            segment_text = segment["text"]

            # Generate summary
            try:
                summary = summarize_text(summarizer, segment_text) if segment_text else "No content"
            except Exception as e:
                summary = f"Summary failed: {str(e)}"

            # Extract key phrases
            key_phrases = extract_key_phrases(segment_text) if segment_text else []

            # Extract frame (optional, may fail in HF Spaces).
            # NOTE(review): frames land in frames_dir under temp_dir, which is
            # removed in the finally block — so "frame_available" records only
            # that extraction succeeded, not that the file persists. Confirm
            # this is intended.
            frame_path = os.path.join(frames_dir, f"frame_{i:03d}.jpg")
            frame_extracted = extract_frame(video_file, segment["start"], frame_path)

            timeline.append({
                "start_time": segment["start"],
                "end_time": segment["end"],
                "text": segment_text,
                "summary": summary,
                "key_phrases": key_phrases,
                "frame_available": frame_extracted
            })

        progress(1.0, desc="Processing complete!")
        return timeline

    except Exception as e:
        import traceback
        return [{"error": f"Pipeline failed: {str(e)}", "details": traceback.format_exc()}]

    finally:
        # Clean up temporary files (chunks, wavs, frames) regardless of outcome.
        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            print(f"Failed to clean up temp directory: {str(e)}")
350
+
351
+
352
# β€”β€”β€” Gradio UI optimized for HF Spaces β€”β€”β€”
def create_interface():
    """Build and return the Gradio Blocks UI wired to run_pipeline."""
    with gr.Blocks(title="Lecture Capture AI Pipeline", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # πŸŽ“ Lecture Capture AI Pipeline

        Upload a lecture video to automatically generate:
        - πŸ“ Transcription with timestamps
        - πŸ“‹ Summaries for each segment
        - πŸ”‘ Key phrases extraction

        **Note**: This runs on Hugging Face Spaces with limited resources. Processing may take time for longer videos.
        """)

        with gr.Row():
            # Left column: upload control, trigger button, and usage tips.
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="πŸ“Ή Upload Lecture Video",
                    height=300
                )

                process_btn = gr.Button(
                    "πŸš€ Process Video",
                    variant="primary",
                    size="lg"
                )

                gr.Markdown("""
                ### πŸ’‘ Tips:
                - Shorter videos (< 10 minutes) work best
                - Clear audio improves transcription quality
                - Processing may take 2-5 minutes depending on video length
                """)

            # Right column: the pipeline's timeline output as raw JSON.
            with gr.Column(scale=2):
                output_json = gr.JSON(
                    label="πŸ“Š Generated Timeline",
                    height=600
                )

        # Wire the button to the processing pipeline.
        process_btn.click(
            fn=run_pipeline,
            inputs=[video_input],
            outputs=[output_json],
            show_progress=True
        )

        gr.Markdown("""
        ### πŸ”§ Technical Details:
        - Uses Whisper (base) for speech recognition
        - BART for text summarization
        - spaCy for key phrase extraction
        - Optimized for Hugging Face Spaces environment
        """)

    return demo
408
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
 
410
# Script entry point (used directly by HF Spaces): build the UI and serve it.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()