Spaces:

lochn
/

audio

Sleeping

App Files Community

lochn committed on May 7, 2025

Commit

8ae984c

verified ·

1 Parent(s): e1a91ad

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -131

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import os
 import json
 import subprocess
 from pathlib import Path
 import openai
 import spacy
 import gradio as gr
 # Load spaCy model for key-phrase extraction, downloading if missing
 try:
@@ -16,79 +18,58 @@ except OSError:
     nlp = spacy.load("en_core_web_sm")
-def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = "chunks") -> list[Path]:
     """
-    Split input video into fixed-length chunks using ffmpeg CLI.
     """
     Path(output_dir).mkdir(exist_ok=True)
     output_pattern = os.path.join(output_dir, "chunk_%03d.mp4")
     cmd = [
-        "ffmpeg",
-        "-y",                       # overwrite existing files
-        "-i", input_path,
-        "-f", "segment",
-        "-segment_time", str(chunk_length),
-        "-reset_timestamps", "1",
         output_pattern
     ]
-    try:
-        subprocess.run(cmd, check=True, capture_output=True, text=True)
-    except subprocess.CalledProcessError as e:
-        print(f"Error during chunking: {e.stderr}")
-        raise
     return sorted(Path(output_dir).glob("chunk_*.mp4"))
 def extract_audio(video_path: str, audio_path: str) -> None:
-    """
-    Extract mono, 16kHz PCM audio from video using ffmpeg CLI.
-    """
     cmd = [
-        "ffmpeg",
-        "-y",                       # overwrite if exists
-        "-i", video_path,
-        "-vn",                      # disable video output
-        "-c:a", "pcm_s16le",      # audio codec
-        "-ar", "16000",           # sample rate
-        "-ac", "1",               # mono audio
         audio_path
     ]
-    try:
-        subprocess.run(cmd, check=True, capture_output=True, text=True)
-    except subprocess.CalledProcessError as e:
-        print(f"Error extracting audio: {e.stderr}")
-        raise
 def transcribe_audio(audio_path: str) -> list[dict]:
-    """
-    Transcribe audio using OpenAI Whisper API.
-    Returns list of segments with start, end, and text.
-    """
-    with open(audio_path, "rb") as audio_file:
-        transcript = openai.Audio.transcribe(
-            model="whisper-1",
-            file=audio_file,
-            response_format="verbose_json"
-        )
-    return transcript.get("segments", [])
-def segment_text(segments: list[dict]) -> list[str]:
-    """
-    Join segment texts and split into semantic blocks based on paragraph breaks.
-    """
-    full_text = "\n\n".join(seg.get("text", "") for seg in segments)
-    return [block.strip() for block in full_text.split("\n\n") if block.strip()]
 def summarize_text(text: str) -> str:
-    """
-    Summarize a chunk of transcript via GPT-4.
-    """
-    prompt = (
-        "Summarize the following lecture segment in 2-3 sentences:\n\n" + text
-    )
     response = openai.ChatCompletion.create(
         model="gpt-4o",
         messages=[{"role": "user", "content": prompt}],
@@ -97,100 +78,65 @@ def summarize_text(text: str) -> str:
     return response.choices[0].message.content.strip()
-def extract_key_phrases(text: str, top_n: int = 5) -> list[str]:
-    """
-    Extract noun chunks as key phrases from text.
-    """
     doc = nlp(text)
     phrases = [chunk.text for chunk in doc.noun_chunks]
     return list(dict.fromkeys(phrases))[:top_n]
 def extract_frame(video_path: str, timestamp: str, output_path: str) -> None:
-    """
-    Extract a single frame at given timestamp using ffmpeg CLI.
-    """
-    cmd = [
-        "ffmpeg",
-        "-y",
-        "-i", video_path,
-        "-ss", timestamp,
-        "-frames:v", "1",
-        output_path
-    ]
-    try:
-        subprocess.run(cmd, check=True, capture_output=True, text=True)
-    except subprocess.CalledProcessError as e:
-        print(f"Error extracting frame: {e.stderr}")
-        raise
 def run_pipeline(api_key: str, video_file: str) -> list[dict]:
-    """
-    Complete processing pipeline: chunk, audio, transcribe, summarize, key phrases, frames.
-    Returns structured timeline entries.
-    """
     openai.api_key = api_key
-    # 1. Chunk video
     chunks = chunk_video(video_file)
-    # 2. Extract audio & transcribe
-    all_segments = []
-    for chunk in chunks:
-        wav_path = str(chunk).replace(".mp4", ".wav")
-        extract_audio(str(chunk), wav_path)
-        segs = transcribe_audio(wav_path)
-        all_segments.extend(segs)
-    # 3. Segment transcript
-    transcript_blocks = segment_text(all_segments)
-    # 4. Summarization & key-phrase extraction
-    summaries = [summarize_text(b) for b in transcript_blocks]
-    phrases_list = [extract_key_phrases(b) for b in transcript_blocks]
-    # 5. Extract frames for each segment
-    frame_dir = Path("frames")
-    frame_dir.mkdir(exist_ok=True)
-    frame_paths = []
-    for seg in all_segments:
-        ts = seg.get("start", "00:00:00.000")
-        fname = f"frame_{ts.replace(':', '-')}.jpg"
-        out = frame_dir / fname
-        extract_frame(video_file, ts, str(out))
-        frame_paths.append(str(out))
-    # 6. Assemble timeline
     timeline = []
-    for seg, summary, keys, frame in zip(all_segments, summaries, phrases_list, frame_paths):
         timeline.append({
-            "start_time": seg.get("start"),
-            "end_time": seg.get("end"),
-            "summary": summary,
-            "key_phrases": keys,
-            "frame": frame
         })
     return timeline
-# Gradio UI for Hugging Face Spaces
 demo = gr.Blocks()
 with demo:
-    gr.Markdown(
-        "# Lecture Capture AI Pipeline\n"
-        "Enter your OpenAI API key and upload a lecture video to generate a summarized timeline."
-    )
-    api_key_input = gr.Textbox(
-        label="OpenAI API Key", type="password",
-        placeholder="sk-…", lines=1
-    )
-    video_input = gr.Video(label="Lecture Video File")
-    run_button = gr.Button("Process Video")
-    output = gr.JSON(label="Generated Timeline")
-    run_button.click(
-        fn=run_pipeline,
-        inputs=[api_key_input, video_input],
-        outputs=output
-    )
-if __name__ == "__main__":
     demo.launch()

 import os
 import json
 import subprocess
+import time
 from pathlib import Path
 import openai
 import spacy
 import gradio as gr
+from openai.error import RateLimitError
 # Load spaCy model for key-phrase extraction, downloading if missing
 try:
     nlp = spacy.load("en_core_web_sm")
+def retry_on_rate_limit(func, max_retries=3, initial_delay=5, backoff=2):
     """
+    Retry decorator for functions that may hit OpenAI rate limits.
     """
+    def wrapper(*args, **kwargs):
+        delay = initial_delay
+        for attempt in range(max_retries):
+            try:
+                return func(*args, **kwargs)
+            except RateLimitError as e:
+                if attempt < max_retries - 1:
+                    print(f"Rate limit hit, retrying in {delay}s...")
+                    time.sleep(delay)
+                    delay *= backoff
+                else:
+                    print("Maximum retries reached. Aborting.")
+                    raise
+    return wrapper
+def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = "chunks") -> list[Path]:
     Path(output_dir).mkdir(exist_ok=True)
     output_pattern = os.path.join(output_dir, "chunk_%03d.mp4")
     cmd = [
+        "ffmpeg", "-y", "-i", input_path,
+        "-f", "segment", "-segment_time", str(chunk_length), "-reset_timestamps", "1",
         output_pattern
     ]
+    subprocess.run(cmd, check=True)
     return sorted(Path(output_dir).glob("chunk_*.mp4"))
 def extract_audio(video_path: str, audio_path: str) -> None:
     cmd = [
+        "ffmpeg", "-y", "-i", video_path,
+        "-vn", "-c:a", "pcm_s16le", "-ar", "16000", "-ac", "1",
         audio_path
     ]
+    subprocess.run(cmd, check=True)
+@retry_on_rate_limit
 def transcribe_audio(audio_path: str) -> list[dict]:
+    with open(audio_path, "rb") as f:
+        return openai.Audio.transcribe(
+            model="whisper-1", file=f, response_format="verbose_json"
+        ).get("segments", [])
+@retry_on_rate_limit
 def summarize_text(text: str) -> str:
+    prompt = f"Summarize the following lecture segment in 2-3 sentences:\n\n{text}"
     response = openai.ChatCompletion.create(
         model="gpt-4o",
         messages=[{"role": "user", "content": prompt}],
     return response.choices[0].message.content.strip()
+def segment_text(segments: list[dict]) -> list[str]:
+    full = "\n\n".join(seg.get("text", "") for seg in segments)
+    return [b.strip() for b in full.split("\n\n") if b.strip()]
+def extract_key_phrases(text: str, top_n=5) -> list[str]:
     doc = nlp(text)
     phrases = [chunk.text for chunk in doc.noun_chunks]
     return list(dict.fromkeys(phrases))[:top_n]
 def extract_frame(video_path: str, timestamp: str, output_path: str) -> None:
+    cmd = ["ffmpeg", "-y", "-i", video_path, "-ss", timestamp, "-frames:v", "1", output_path]
+    subprocess.run(cmd, check=True)
 def run_pipeline(api_key: str, video_file: str) -> list[dict]:
     openai.api_key = api_key
+    # chunk
     chunks = chunk_video(video_file)
+    segments = []
+    for c in chunks:
+        wav = str(c).replace('.mp4', '.wav')
+        extract_audio(str(c), wav)
+        segments.extend(transcribe_audio(wav))
+    # segment text
+    blocks = segment_text(segments)
+    # summarize & phrases
+    summaries = [summarize_text(b) for b in blocks]
+    phrases = [extract_key_phrases(b) for b in blocks]
+    # extract frames
+    Path('frames').mkdir(exist_ok=True)
+    frames = []
+    for seg in segments:
+        ts = seg.get('start', '00:00:00.000')
+        out = f"frames/frame_{ts.replace(':','-')}.jpg"
+        extract_frame(video_file, ts, out)
+        frames.append(out)
+    # assemble
     timeline = []
+    for seg, sumry, ph, fr in zip(segments, summaries, phrases, frames):
         timeline.append({
+            'start_time': seg.get('start'),
+            'end_time': seg.get('end'),
+            'summary': sumry,
+            'key_phrases': ph,
+            'frame': fr
         })
     return timeline
+# Gradio UI
 demo = gr.Blocks()
 with demo:
+    gr.Markdown("# Lecture Capture AI Pipeline")
+    api = gr.Textbox(type='password', label='OpenAI API Key')
+    vid = gr.Video(label='Lecture Video')
+    btn = gr.Button('Process')
+    out = gr.JSON(label='Timeline')
+    btn.click(fn=run_pipeline, inputs=[api, vid], outputs=out)
+if __name__ == '__main__':
     demo.launch()