Spaces:

lochn
/

audio

Sleeping

App Files Files Community

lochn committed on Apr 30, 2025

Commit

18e0d67

verified ·

1 Parent(s): 55e8ba3

Create app.py

Browse files

Files changed (1) hide show

app.py +151 -0

app.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import os
+import json
+from pathlib import Path
+import ffmpeg
+import spacy
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+# Load the Whisper processor for the same checkpoint that transcribe_audio() loads.
+# NOTE(review): `processor` is not referenced anywhere else in the visible code - confirm it is still needed.
+processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")
+# Load the spaCy model whose noun chunks extract_key_phrases() uses for key-phrase extraction
+nlp = spacy.load("en_core_web_sm")
def chunk_video(input_path: str, chunk_length: int = 300, output_dir: str = "chunks") -> list[Path]:
    """
    Split the input video into fixed-length chunks.

    Args:
        input_path: Path of the video file to split.
        chunk_length: Target duration of each chunk, in seconds
            (passed to ffmpeg's segment muxer as ``segment_time``).
        output_dir: Directory that receives the ``chunk_NNN.mp4`` files.
            It is created, including missing parents, when absent.

    Returns:
        The sorted paths of every ``chunk_*.mp4`` file present in
        ``output_dir`` once ffmpeg has finished. Files left over from an
        earlier run with more chunks are matched by the same glob, so use a
        fresh directory per input video.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    (
        ffmpeg
        # Read the caller's file; the previous version always opened one
        # hard-coded filename and silently ignored ``input_path``.
        .input(input_path)
        .output(f"{output_dir}/chunk_%03d.mp4",
                f="segment", segment_time=chunk_length, reset_timestamps=1)
        .run(overwrite_output=True)
    )
    return sorted(out_dir.glob("chunk_*.mp4"))
def extract_audio(video_path: str, audio_path: str) -> None:
    """
    Write the audio of a video file as mono, 16 kHz, ``pcm_s16le`` audio.

    Args:
        video_path: Path of the video to read.
        audio_path: Path of the audio file to write; an existing file is
            overwritten.
    """
    source = ffmpeg.input(video_path)
    # One channel, 16k sample rate, signed 16-bit little-endian PCM codec.
    audio_only = source.output(audio_path, acodec="pcm_s16le", ac=1, ar="16k")
    audio_only.run(overwrite_output=True)
_ASR_MODEL_ID = "openai/whisper-large-v3-turbo"
# Built on first use and then reused: loading the checkpoint is expensive and
# transcribe_audio() is called once per video chunk.
_asr_pipeline = None


def _get_asr_pipeline():
    """Return the shared speech-recognition pipeline, building it on first use."""
    global _asr_pipeline
    if _asr_pipeline is None:
        from transformers import pipeline

        _asr_pipeline = pipeline(
            "automatic-speech-recognition",
            model=_ASR_MODEL_ID,
            # Whisper works on 30-second windows; chunking lets the pipeline
            # handle audio files longer than that.
            chunk_length_s=30,
        )
    return _asr_pipeline


def transcribe_audio(audio_path: str) -> list[dict]:
    """
    Transcribe an audio file with the Whisper checkpoint from Hugging Face.

    The previous version called ``model.transcribe(...)`` on a Transformers
    ``AutoModelForSpeechSeq2Seq`` instance; that method belongs to the
    separate ``openai-whisper`` package, so the call always failed. This
    version uses the Transformers automatic-speech-recognition pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        One dict per timestamped chunk reported by the pipeline, with keys
        ``"start"`` and ``"end"`` (seconds from the start of this audio
        file, or ``None`` when the pipeline reports no value) and ``"text"``.
    """
    result = _get_asr_pipeline()(audio_path, return_timestamps=True)
    segments = []
    for chunk in result.get("chunks", []):
        # Each chunk carries a (start, end) tuple; either side may be None.
        start, end = chunk.get("timestamp") or (None, None)
        segments.append({"start": start, "end": end, "text": chunk.get("text", "")})
    return segments
def segment_text(segments: list[dict]) -> list[str]:
    """
    Concatenate the segment texts and cut the result at blank lines.

    Args:
        segments: Dicts that each carry a ``"text"`` entry.

    Returns:
        The whitespace-trimmed, non-empty pieces obtained by joining all
        texts with a blank line and splitting on blank lines again.
    """
    texts = []
    for segment in segments:
        texts.append(segment["text"])

    blocks = []
    for piece in "\n\n".join(texts).split("\n\n"):
        trimmed = piece.strip()
        if trimmed:
            blocks.append(trimmed)
    return blocks
def summarize_text(text: str) -> str:
    """
    Summarize a chunk of transcript via GPT.

    The previous version referenced ``openai`` without importing it and used
    ``openai.ChatCompletion``, which openai>=1.0 no longer provides; this
    version imports the package locally and goes through the client API.

    Args:
        text: Transcript text to summarize.

    Returns:
        The model's 2-3 sentence summary with surrounding whitespace removed,
        or ``""`` when ``text`` is empty/blank or the model returns no content.
    """
    # Nothing to summarize: skip the network round trip entirely.
    if not text or not text.strip():
        return ""

    from openai import OpenAI

    prompt = f"Summarize the following lecture segment in 2-3 sentences:\n\n{text}"
    client = OpenAI()  # picks up the API key from the OPENAI_API_KEY environment variable
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=200,
    )
    content = response.choices[0].message.content
    # ``content`` can be None (e.g. a refusal), so guard before stripping.
    return content.strip() if content else ""
def extract_key_phrases(text: str, top_n: int = 5) -> list[str]:
    """
    Return the first distinct noun chunks of the text as key phrases.

    Args:
        text: Text to run through the module-level spaCy pipeline ``nlp``.
        top_n: Maximum number of phrases to return.

    Returns:
        Up to ``top_n`` noun-chunk strings, de-duplicated and kept in order
        of first appearance.
    """
    first_seen = {}
    for noun_chunk in nlp(text).noun_chunks:
        # A dict keeps insertion order, so it serves as an ordered set here.
        first_seen.setdefault(noun_chunk.text, None)
    return list(first_seen)[:top_n]
def extract_frame(video_path: str, timestamp: str, output_path: str) -> None:
    """
    Save one video frame taken at the given timestamp.

    Args:
        video_path: Path of the video to read.
        timestamp: Seek position, handed to ffmpeg as the input ``ss`` option.
        output_path: Path of the image file to write; an existing file is
            overwritten.
    """
    seeked_input = ffmpeg.input(video_path, ss=timestamp)
    # vframes=1 stops ffmpeg after a single frame has been written.
    single_frame = seeked_input.output(output_path, vframes=1)
    single_frame.run(overwrite_output=True)
def build_timeline(segments: list[dict], summaries: list[str], keys: list[list[str]], frames: list[str]) -> list[dict]:
    """
    Combine the per-segment results into a list of timeline entries.

    The four inputs are walked in lockstep; the result is as long as the
    shortest of them.

    Args:
        segments: Dicts whose optional ``"start"`` / ``"end"`` values become
            ``start_time`` / ``end_time`` (``None`` when absent).
        summaries: Summary text for each entry.
        keys: Key-phrase list for each entry.
        frames: Frame image path for each entry.

    Returns:
        One dict per entry with the keys ``start_time``, ``end_time``,
        ``summary``, ``key_phrases`` and ``frame_path``.
    """
    return [
        {
            "start_time": segment.get("start"),
            "end_time": segment.get("end"),
            "summary": summary_text,
            "key_phrases": phrases,
            "frame_path": frame_file,
        }
        for segment, summary_text, phrases, frame_file in zip(segments, summaries, keys, frames)
    ]
def main(video_file: str):
    """
    Run the full pipeline on one lecture video and write ``timeline.json``.

    Steps: split the video into chunks, transcribe each chunk, summarize each
    transcript segment and extract its key phrases, grab one frame per
    segment into ``frames/``, then dump the assembled timeline as JSON.

    Fixes over the previous version:
      * Segment timestamps are shifted by the chunk's position so they refer
        to the full video (they were chunk-relative, so frames for every
        chunk after the first were taken from the wrong place).
      * Numeric timestamps no longer crash the frame-filename step.
      * Segments without text are dropped up front and text blocks are built
        per segment, so summaries, key phrases and frames stay aligned.

    Args:
        video_file: Path to the input lecture video.
    """
    chunk_seconds = 300  # same value as chunk_video's default chunk_length

    # 1. Chunk video
    chunks = chunk_video(video_file, chunk_length=chunk_seconds)

    # 2. Transcribe all chunks
    all_segments = []
    for index, chunk in enumerate(chunks):
        # with_suffix only touches the extension; str.replace could also
        # rewrite ".mp4" occurring elsewhere in the path.
        wav_path = str(chunk.with_suffix(".wav"))
        extract_audio(str(chunk), wav_path)
        # NOTE: approximate - this assumes every earlier chunk lasted exactly
        # chunk_seconds; ffmpeg's actual cut points may deviate slightly.
        offset = index * chunk_seconds
        for seg in transcribe_audio(wav_path):
            if not (seg.get("text") or "").strip():
                continue  # nothing to summarize; keeping it would misalign the lists
            shifted = dict(seg)
            for key in ("start", "end"):
                if isinstance(shifted.get(key), (int, float)):
                    shifted[key] = shifted[key] + offset
            all_segments.append(shifted)

    # 3. Segment transcript: exactly one text block per kept segment
    transcript_blocks = [" ".join(segment_text([seg])) for seg in all_segments]

    # 4. Summarize and extract key phrases
    summaries = [summarize_text(block) for block in transcript_blocks]
    key_phrases = [extract_key_phrases(block) for block in transcript_blocks]

    # 5. Extract frames
    frame_dir = Path("frames")
    frame_dir.mkdir(exist_ok=True)
    frame_paths = []
    for seg in all_segments:
        ts = seg.get("start")
        if ts is None:
            # No seek position available: keep the slot so the lists stay aligned.
            frame_paths.append(None)
            continue
        # str() first: timestamps may be numbers rather than "HH:MM:SS" strings.
        fname = f"frame_{str(ts).replace(':', '-')}.jpg"
        out_path = frame_dir / fname
        extract_frame(video_file, ts, str(out_path))
        frame_paths.append(str(out_path))

    # 6. Build timeline and save
    timeline = build_timeline(all_segments, summaries, key_phrases, frame_paths)
    with open("timeline.json", "w", encoding="utf-8") as f:
        json.dump(timeline, f, indent=2)
if __name__ == "__main__":
    # Command-line entry point: one positional argument, the video to process.
    import argparse

    cli = argparse.ArgumentParser(description="Lecture capture AI pipeline")
    cli.add_argument("video_file", help="Path to the input lecture video")
    main(cli.parse_args().video_file)