ndurner committed on
Commit
8d7df12
·
1 Parent(s): ee16725

fix slide extraction

Browse files
demo/requirements.txt CHANGED
@@ -3,3 +3,4 @@ yt-dlp[default]>=2025.11.12
3
  fastmcp>=0.1.11
4
  google-genai>=0.8.0
5
  ffmpeg-python>=0.2.0
 
 
3
  fastmcp>=0.1.11
4
  google-genai>=0.8.0
5
  ffmpeg-python>=0.2.0
6
+ pillow>=10.4.0
mcp/pyproject.toml CHANGED
@@ -11,11 +11,12 @@ dependencies = [
11
  "fastmcp>=0.1.11",
12
  "yt-dlp[default]>=2025.11.12",
13
  "google-genai>=0.8.0",
14
- "ffmpeg-python>=0.2.0"
15
  ]
16
 
17
  [project.scripts]
18
  aileen3-mcp = "aileen3_mcp.server:main"
 
19
 
20
  [build-system]
21
  requires = ["setuptools>=64", "wheel"]
 
11
  "fastmcp>=0.1.11",
12
  "yt-dlp[default]>=2025.11.12",
13
  "google-genai>=0.8.0",
14
+ "ffmpeg-python>=0.2.0",
15
  ]
16
 
17
  [project.scripts]
18
  aileen3-mcp = "aileen3_mcp.server:main"
19
+ aileen3-slides = "aileen3_mcp.cli_slides:main"
20
 
21
  [build-system]
22
  requires = ["setuptools>=64", "wheel"]
mcp/src/aileen3_mcp/cli_slides.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Manual slide extraction helper.
2
+
3
+ Run slide extraction against a local video using the same pipeline the MCP
4
+ server uses. Useful for debugging model responses when no slides are returned.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ from .media_tools import _build_reference, _extract_slides_flow, _probe_duration, _slides_json_path
14
+
15
+
16
+ def main(argv: list[str] | None = None) -> int:
17
+ parser = argparse.ArgumentParser(description="Extract slides from a video using the Gemini pipeline.")
18
+ parser.add_argument("video", type=Path, help="Path to the video file (mp4 recommended).")
19
+ parser.add_argument(
20
+ "--reference",
21
+ help="Reference id to use for cache/output. Defaults to one derived from the filename.",
22
+ )
23
+ parser.add_argument(
24
+ "--duration",
25
+ type=float,
26
+ help="Override duration in seconds (optional). If omitted, ffprobe is used.",
27
+ )
28
+
29
+ args = parser.parse_args(argv)
30
+
31
+ video_path: Path = args.video.expanduser().resolve()
32
+ if not video_path.exists():
33
+ parser.error(f"Video not found: {video_path}")
34
+
35
+ reference = args.reference or _build_reference(None, str(video_path))
36
+ duration = args.duration or _probe_duration(video_path)
37
+
38
+ metadata = {
39
+ "reference": reference,
40
+ "download_path": str(video_path),
41
+ "duration": duration,
42
+ "source": str(video_path),
43
+ }
44
+
45
+ try:
46
+ result = _extract_slides_flow(metadata)
47
+ except Exception as exc: # pragma: no cover - CLI convenience
48
+ print(f"[error] slide extraction failed: {exc}", file=sys.stderr)
49
+ return 1
50
+
51
+ slides = result.get("slides", [])
52
+ print(f"Extracted {len(slides)} slides for reference '{reference}'.")
53
+ if slides:
54
+ print("First few slides:")
55
+ for slide in slides[:5]:
56
+ start = slide.get("from")
57
+ end = slide.get("to")
58
+ label = slide.get("label") or ""
59
+ print(f" index={slide.get('index')} from={start:.2f}s to={end:.2f}s label={label}")
60
+ slides_json = _slides_json_path(reference)
61
+ print(f"Slides JSON saved to {slides_json}")
62
+ return 0
63
+
64
+
65
+ if __name__ == "__main__": # pragma: no cover - CLI execution
66
+ raise SystemExit(main())
mcp/src/aileen3_mcp/media_tools.py CHANGED
@@ -18,6 +18,7 @@ import ffmpeg
18
  from fastmcp import Context, FastMCP
19
  from contextlib import redirect_stdout, redirect_stderr, contextmanager
20
  import io
 
21
 
22
  log = logging.getLogger(__name__)
23
 
@@ -182,6 +183,63 @@ def _build_reference(info: dict | None, source: str) -> str:
182
  return f"media_{digest}"
183
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def _job_payload(job: JobRecord, include_result: bool = True) -> dict:
186
  payload = {
187
  "job_id": job.id,
@@ -416,6 +474,7 @@ def _wait_for_upload(client, upload):
416
  def _gemini_structured_slide_times(client, video_path: Path, reference: str) -> list[dict]:
417
  from google.genai import types
418
 
 
419
  upload = client.files.upload(
420
  file=str(video_path),
421
  config=types.UploadFileConfig(
@@ -424,45 +483,67 @@ def _gemini_structured_slide_times(client, video_path: Path, reference: str) ->
424
  ),
425
  )
426
  upload = _wait_for_upload(client, upload)
427
-
428
- schema = types.Schema(
429
- type=types.Type.OBJECT,
430
- properties={
431
- "slides": types.Schema(
432
- type=types.Type.ARRAY,
433
- items=types.Schema(
434
- type=types.Type.OBJECT,
435
- properties={
436
- "label": types.Schema(type=types.Type.STRING),
437
- "from_seconds": types.Schema(type=types.Type.NUMBER),
438
- "to_seconds": types.Schema(type=types.Type.NUMBER),
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  },
440
- required=["from_seconds", "to_seconds"],
441
- ),
442
- )
 
443
  },
444
- required=["slides"],
445
- )
 
446
 
447
  file = types.Part.from_uri(file_uri=upload.uri, mime_type=upload.mime_type or "video/mp4")
448
 
 
449
  response = client.models.generate_content(
450
  model="gemini-flash-lite-latest",
451
  contents=[file, "What are the timestamps of individual slides presented?"],
 
 
 
 
452
  )
 
453
 
454
  raw = getattr(response, "text", None) or getattr(response, "raw", None)
455
- if not raw and hasattr(response, "output_text"):
456
  raw = response.output_text # type: ignore[attr-defined]
457
  if not raw:
458
- # try candidates
459
  candidates = getattr(response, "candidates", None)
460
- if candidates:
461
  raw = candidates[0].content.parts[0].text # type: ignore[index]
462
  if not raw:
463
  raise RuntimeError("Slide analysis model returned empty response")
464
 
465
  _write_debug(reference, "slides_raw.json", raw or "")
 
466
 
467
  try:
468
  payload = json.loads(raw) if raw else {"slides": []}
@@ -473,10 +554,9 @@ def _gemini_structured_slide_times(client, video_path: Path, reference: str) ->
473
  slides = payload.get("slides") or []
474
  sanitized: list[dict] = []
475
  for slide in slides:
476
- try:
477
- start = float(slide.get("from_seconds"))
478
- end = float(slide.get("to_seconds"))
479
- except Exception:
480
  continue
481
  label = (slide.get("label") or "").strip()
482
  sanitized.append({"from": start, "to": end, "label": label})
@@ -559,7 +639,7 @@ def _extract_slides_flow(metadata: dict) -> dict:
559
  with _silence_stdio(): # silence any ffmpeg/yt-dlp noise during upload
560
  slides_raw = _gemini_structured_slide_times(client, video_path, reference)
561
 
562
- seen_hashes: set[str] = set()
563
  slide_entries: list[dict] = []
564
 
565
  for idx, slide in enumerate(slides_raw):
@@ -575,10 +655,12 @@ def _extract_slides_flow(metadata: dict) -> dict:
575
  if not frame_bytes:
576
  continue
577
 
578
- digest = hashlib.sha1(frame_bytes).hexdigest()
579
- if digest in seen_hashes:
 
 
580
  continue
581
- seen_hashes.add(digest)
582
 
583
  data_uri = "data:image/png;base64," + base64.b64encode(frame_bytes).decode("ascii")
584
 
 
18
  from fastmcp import Context, FastMCP
19
  from contextlib import redirect_stdout, redirect_stderr, contextmanager
20
  import io
21
+ from PIL import Image
22
 
23
  log = logging.getLogger(__name__)
24
 
 
183
  return f"media_{digest}"
184
 
185
 
186
+ def _parse_timestamp(value: Any) -> float | None:
187
+ """Accept mm:ss or hh:mm:ss strings (optionally with fractional seconds) and numbers."""
188
+
189
+ if value is None:
190
+ return None
191
+
192
+ # Allow numeric input for backward compatibility
193
+ if isinstance(value, (int, float)):
194
+ return float(value)
195
+
196
+ text = str(value).strip()
197
+ if not text:
198
+ return None
199
+
200
+ if text.isdigit():
201
+ return float(text)
202
+
203
+ parts = text.split(":")
204
+ try:
205
+ parts_f = [float(p) for p in parts]
206
+ except ValueError:
207
+ return None
208
+
209
+ if len(parts_f) == 2: # mm:ss
210
+ minutes, seconds = parts_f
211
+ return max(0.0, minutes * 60 + seconds)
212
+ if len(parts_f) == 3: # hh:mm:ss
213
+ hours, minutes, seconds = parts_f
214
+ return max(0.0, hours * 3600 + minutes * 60 + seconds)
215
+ return None
216
+
217
+
218
def _average_hash(frame_bytes: bytes, hash_size: int = 8) -> int | None:
    """Compute a lightweight perceptual hash (aHash) tolerant to minor artifacts.

    Downscales the frame to a tiny greyscale grid and sets one bit per pixel
    that is at or above the mean brightness.

    Returns:
        The hash as an int of hash_size*hash_size bits, or None when the
        image cannot be decoded or yields no pixels.
    """
    try:
        with Image.open(io.BytesIO(frame_bytes)) as img:
            grid = img.convert("L").resize((hash_size, hash_size), Image.LANCZOS)
            pixels = list(grid.getdata())
    except Exception:
        return None

    if not pixels:
        return None

    threshold = sum(pixels) / len(pixels)
    # Bit i corresponds to pixel i; set when the pixel is at least average.
    return sum(1 << idx for idx, lum in enumerate(pixels) if lum >= threshold)
237
+
238
+
239
+ def _hamming_distance(a: int, b: int) -> int:
240
+ return bin(a ^ b).count("1")
241
+
242
+
243
  def _job_payload(job: JobRecord, include_result: bool = True) -> dict:
244
  payload = {
245
  "job_id": job.id,
 
474
  def _gemini_structured_slide_times(client, video_path: Path, reference: str) -> list[dict]:
475
  from google.genai import types
476
 
477
+ log.debug("uploading %s to Gemini", video_path)
478
  upload = client.files.upload(
479
  file=str(video_path),
480
  config=types.UploadFileConfig(
 
483
  ),
484
  )
485
  upload = _wait_for_upload(client, upload)
486
+ log.debug("upload finished")
487
+
488
+ # JSON Schema as dict per structured outputs guide
489
+ schema = {
490
+ "type": "object",
491
+ "description": "List of slide timestamps within the video.",
492
+ "properties": {
493
+ "slides": {
494
+ "type": "array",
495
+ "description": "Collection of detected slides in chronological order.",
496
+ "items": {
497
+ "type": "object",
498
+ "properties": {
499
+ "label": {
500
+ "type": "string",
501
+ "description": "Short optional title inferred from the slide content.",
502
+ },
503
+ "from": {
504
+ "type": "string",
505
+ "description": "Start timestamp of the slide as mm:ss or hh:mm:ss (e.g., 01:12:30).",
506
+ },
507
+ "to": {
508
+ "type": "string",
509
+ "description": "End timestamp of the slide as mm:ss or hh:mm:ss (e.g., 01:13:05).",
510
+ },
511
  },
512
+ "required": ["from", "to"],
513
+ "additionalProperties": False,
514
+ },
515
+ }
516
  },
517
+ "required": ["slides"],
518
+ "additionalProperties": False,
519
+ }
520
 
521
  file = types.Part.from_uri(file_uri=upload.uri, mime_type=upload.mime_type or "video/mp4")
522
 
523
+ log.debug("running Gemini slide timestamping")
524
  response = client.models.generate_content(
525
  model="gemini-flash-lite-latest",
526
  contents=[file, "What are the timestamps of individual slides presented?"],
527
+ config={
528
+ "response_mime_type": "application/json",
529
+ "response_json_schema": schema,
530
+ },
531
  )
532
+ log.debug("slide timestamping done")
533
 
534
  raw = getattr(response, "text", None) or getattr(response, "raw", None)
535
+ if not raw and hasattr(response, "output_text"): # structured outputs still populate .text
536
  raw = response.output_text # type: ignore[attr-defined]
537
  if not raw:
538
+ # try candidates (defensive)
539
  candidates = getattr(response, "candidates", None)
540
+ if candidates and getattr(candidates[0].content.parts[0], "text", None):
541
  raw = candidates[0].content.parts[0].text # type: ignore[index]
542
  if not raw:
543
  raise RuntimeError("Slide analysis model returned empty response")
544
 
545
  _write_debug(reference, "slides_raw.json", raw or "")
546
+ log.debug("Gemini slide timestamp response: %s", raw)
547
 
548
  try:
549
  payload = json.loads(raw) if raw else {"slides": []}
 
554
  slides = payload.get("slides") or []
555
  sanitized: list[dict] = []
556
  for slide in slides:
557
+ start = _parse_timestamp(slide.get("from"))
558
+ end = _parse_timestamp(slide.get("to"))
559
+ if start is None or end is None:
 
560
  continue
561
  label = (slide.get("label") or "").strip()
562
  sanitized.append({"from": start, "to": end, "label": label})
 
639
  with _silence_stdio(): # silence any ffmpeg/yt-dlp noise during upload
640
  slides_raw = _gemini_structured_slide_times(client, video_path, reference)
641
 
642
+ seen_hashes: list[int] = []
643
  slide_entries: list[dict] = []
644
 
645
  for idx, slide in enumerate(slides_raw):
 
655
  if not frame_bytes:
656
  continue
657
 
658
+ digest = _average_hash(frame_bytes)
659
+ if digest is None:
660
+ continue
661
+ if any(_hamming_distance(digest, existing) <= 6 for existing in seen_hashes):
662
  continue
663
+ seen_hashes.append(digest)
664
 
665
  data_uri = "data:image/png;base64," + base64.b64encode(frame_bytes).decode("ascii")
666