ndurner commited on
Commit
6447e9a
·
1 Parent(s): f897a90

YouTube analysis works

Browse files
Dockerfile CHANGED
@@ -10,7 +10,7 @@ ARG DENO_VERSION=2.0.0
10
  WORKDIR /app
11
 
12
  RUN apt-get update && \
13
- apt-get install -y --no-install-recommends curl unzip ca-certificates && \
14
  curl -fsSL "https://github.com/denoland/deno/releases/download/v${DENO_VERSION}/deno-x86_64-unknown-linux-gnu.zip" -o /tmp/deno.zip && \
15
  unzip -q /tmp/deno.zip -d /tmp && \
16
  mv /tmp/deno /usr/local/bin/deno && \
 
10
  WORKDIR /app
11
 
12
  RUN apt-get update && \
13
+ apt-get install -y --no-install-recommends curl unzip ca-certificates ffmpeg && \
14
  curl -fsSL "https://github.com/denoland/deno/releases/download/v${DENO_VERSION}/deno-x86_64-unknown-linux-gnu.zip" -o /tmp/deno.zip && \
15
  unzip -q /tmp/deno.zip -d /tmp && \
16
  mv /tmp/deno /usr/local/bin/deno && \
demo/health.py CHANGED
@@ -17,6 +17,7 @@ from typing import Iterable
17
 
18
  MIN_DENO_VERSION = (2, 0, 0)
19
  MIN_YTDLP_VERSION = (2025, 11, 12)
 
20
  GEMINI_ENV_VAR = "GEMINI_API_KEY"
21
 
22
 
@@ -102,6 +103,27 @@ def _check_yt_dlp_python() -> ToolStatus:
102
  return ToolStatus(label, False, "yt_dlp_ejs missing (JS sites will fail)")
103
 
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def _check_gemini_env() -> ToolStatus:
106
  label = "Gemini API key"
107
  if os.environ.get(GEMINI_ENV_VAR):
@@ -174,6 +196,7 @@ def run_health_report() -> HealthReport:
174
  tool_statuses = [
175
  _check_deno(),
176
  _check_yt_dlp_python(),
 
177
  _check_gemini_env(),
178
  _check_mcp_health(),
179
  ]
 
17
 
18
  MIN_DENO_VERSION = (2, 0, 0)
19
  MIN_YTDLP_VERSION = (2025, 11, 12)
20
+ MIN_FFMPEG_VERSION = (4, 0)
21
  GEMINI_ENV_VAR = "GEMINI_API_KEY"
22
 
23
 
 
103
  return ToolStatus(label, False, "yt_dlp_ejs missing (JS sites will fail)")
104
 
105
 
106
+ def _check_ffmpeg() -> ToolStatus:
107
+ label = "ffmpeg"
108
+ binary = shutil.which("ffmpeg")
109
+ if not binary:
110
+ return ToolStatus(label, False, "`ffmpeg` binary not found on PATH")
111
+ try:
112
+ completed = subprocess.run(
113
+ [binary, "-version"],
114
+ capture_output=True,
115
+ text=True,
116
+ check=False,
117
+ timeout=5,
118
+ )
119
+ except Exception as exc:
120
+ return ToolStatus(label, False, f"failed to exec: {exc}")
121
+ if completed.returncode != 0:
122
+ return ToolStatus(label, False, completed.stderr.strip() or "ffmpeg returned error")
123
+ first_line = (completed.stdout or "").splitlines()[0] if completed.stdout else "ffmpeg present"
124
+ return ToolStatus(label, True, first_line)
125
+
126
+
127
  def _check_gemini_env() -> ToolStatus:
128
  label = "Gemini API key"
129
  if os.environ.get(GEMINI_ENV_VAR):
 
196
  tool_statuses = [
197
  _check_deno(),
198
  _check_yt_dlp_python(),
199
+ _check_ffmpeg(),
200
  _check_gemini_env(),
201
  _check_mcp_health(),
202
  ]
demo/requirements.txt CHANGED
@@ -2,3 +2,4 @@ gradio>=6.0.0.dev0
2
  yt-dlp[default]>=2025.11.12
3
  fastmcp>=0.1.11
4
  google-genai>=0.8.0
 
 
2
  yt-dlp[default]>=2025.11.12
3
  fastmcp>=0.1.11
4
  google-genai>=0.8.0
5
+ ffmpeg-python>=0.2.0
mcp/pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "fastmcp>=0.1.11",
12
  "yt-dlp[default]>=2025.11.12",
13
  "google-genai>=0.8.0",
 
14
  ]
15
 
16
  [project.scripts]
 
11
  "fastmcp>=0.1.11",
12
  "yt-dlp[default]>=2025.11.12",
13
  "google-genai>=0.8.0",
14
+ "ffmpeg-python>=0.2.0"
15
  ]
16
 
17
  [project.scripts]
mcp/src/aileen3_mcp/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
  """Aileen3 MCP server package."""
2
 
3
- __all__ = ["server"]
 
1
  """Aileen3 MCP server package."""
2
 
3
+ __all__ = ["server", "media_tools"]
mcp/src/aileen3_mcp/media_tools.py ADDED
@@ -0,0 +1,940 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ import hashlib
6
+ import json
7
+ import logging
8
+ import os
9
+ import re
10
+ import secrets
11
+ import tempfile
12
+ import time
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import Any, Callable, Dict, Optional
16
+
17
+ import ffmpeg
18
+ from fastmcp import Context, FastMCP
19
+ from contextlib import redirect_stdout, redirect_stderr, contextmanager
20
+ import io
21
+
22
+ log = logging.getLogger(__name__)
23
+
24
+
25
+ # ---------------------------------------------------------------------------------------------------------------------
26
+ # Paths & storage
27
+ # ---------------------------------------------------------------------------------------------------------------------
28
+ BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
29
+ MEDIA_CACHE = BASE_CACHE / "media"
30
+ SLIDE_CACHE = BASE_CACHE / "slides"
31
+ ANALYSIS_CACHE = BASE_CACHE / "analysis"
32
+
33
+ for _path in (MEDIA_CACHE, SLIDE_CACHE, ANALYSIS_CACHE):
34
+ _path.mkdir(parents=True, exist_ok=True)
35
+
36
+ DEBUG = os.environ.get("AILEEN3_DEBUG", "").lower() in {"1", "true", "yes", "on"}
37
+ DEBUG_DIR = Path(tempfile.gettempdir()) / "aileen3-debug"
38
+ if DEBUG:
39
+ DEBUG_DIR.mkdir(parents=True, exist_ok=True)
40
+
41
+ LOG_DIR = BASE_CACHE / "logs"
42
+ LOG_DIR.mkdir(parents=True, exist_ok=True)
43
+ LOG_FILE = LOG_DIR / "aileen3-mcp.log"
44
+
45
+
46
+ def _ensure_file_logging():
47
+ root = logging.getLogger()
48
+ # Avoid adding duplicate handlers to root
49
+ for h in root.handlers:
50
+ if isinstance(h, logging.FileHandler) and Path(getattr(h, "baseFilename", "")) == LOG_FILE:
51
+ return
52
+ handler = logging.FileHandler(LOG_FILE, encoding="utf-8")
53
+ handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
54
+ fmt = logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s")
55
+ handler.setFormatter(fmt)
56
+ root.addHandler(handler)
57
+
58
+
59
+ _ensure_file_logging()
60
+
61
+
62
+ def _write_debug(reference: str, suffix: str, data: Any) -> None:
63
+ if not DEBUG:
64
+ return
65
+ path = DEBUG_DIR / f"{reference}_{suffix}"
66
+ try:
67
+ if isinstance(data, (bytes, bytearray)):
68
+ path.write_bytes(data)
69
+ else:
70
+ path.write_text(json.dumps(data, indent=2, default=str))
71
+ except Exception:
72
+ log.debug("Failed to write debug artifact %s", path)
73
+
74
+
75
+ class _YDLLogger:
76
+ """Silence yt-dlp stdout/stderr while keeping messages in Python logging."""
77
+
78
+ def debug(self, msg):
79
+ log.debug("yt-dlp: %s", msg)
80
+
81
+ def info(self, msg):
82
+ log.info("yt-dlp: %s", msg)
83
+
84
+ def warning(self, msg):
85
+ log.warning("yt-dlp: %s", msg)
86
+
87
+ def error(self, msg):
88
+ log.error("yt-dlp: %s", msg)
89
+
90
+
91
+ @contextmanager
92
+ def _silence_stdio():
93
+ buf_out = io.StringIO()
94
+ buf_err = io.StringIO()
95
+ with redirect_stdout(buf_out), redirect_stderr(buf_err):
96
+ yield
97
+
98
+
99
+ # ---------------------------------------------------------------------------------------------------------------------
100
+ # Job bookkeeping
101
+ # ---------------------------------------------------------------------------------------------------------------------
102
+
103
+
104
+ class JobStatus:
105
+ PENDING = "pending"
106
+ RUNNING = "running"
107
+ DONE = "done"
108
+ FAILED = "failed"
109
+
110
+
111
+ @dataclass
112
+ class Priors:
113
+ """User-supplied and media-derived context to steer analysis."""
114
+
115
+ context: str = ""
116
+ expectations: str = ""
117
+ prior_knowledge: str = ""
118
+ questions: str = ""
119
+ media_context: str = ""
120
+
121
+ @classmethod
122
+ def from_obj(cls, obj: dict | None, media_context: str = "") -> "Priors":
123
+ obj = obj or {}
124
+ return cls(
125
+ context=str(obj.get("context", "") or ""),
126
+ expectations=str(obj.get("expectations", "") or ""),
127
+ prior_knowledge=str(obj.get("prior_knowledge") or obj.get("prior knowledge") or ""),
128
+ questions=str(obj.get("questions", "") or ""),
129
+ media_context=media_context,
130
+ )
131
+
132
+ def as_prompt_text(self) -> str:
133
+ sections = []
134
+ for label, value in (
135
+ ("User context", self.context),
136
+ ("Expectations", self.expectations),
137
+ ("Prior knowledge", self.prior_knowledge),
138
+ ("Questions", self.questions),
139
+ ("Media context", self.media_context),
140
+ ):
141
+ if value:
142
+ sections.append(f"``` {label}\n{value}\n```\n")
143
+ return "\n".join(sections) if sections else "No specific priors provided."
144
+
145
+
146
+ @dataclass
147
+ class JobRecord:
148
+ id: str
149
+ kind: str
150
+ reference: str
151
+ status: str = JobStatus.PENDING
152
+ progress: float = 0.0
153
+ error: Optional[str] = None
154
+ result: Optional[dict] = None
155
+ created_at: float = field(default_factory=time.time)
156
+ finished_at: Optional[float] = None
157
+ task: Optional[asyncio.Task] = field(default=None, repr=False)
158
+
159
+
160
+ JOBS: Dict[str, JobRecord] = {}
161
+ REFERENCE_INDEX: Dict[tuple[str, str], str] = {}
162
+ JOB_LOCK = asyncio.Lock()
163
+
164
+
165
+ def _error(detail: str, reference: str | None = None, status: str = "error") -> dict:
166
+ payload = {"status": status, "detail": detail, "is_error": True}
167
+ if reference:
168
+ payload["reference"] = reference
169
+ return payload
170
+
171
+
172
+ def _build_reference(info: dict | None, source: str) -> str:
173
+ source = source.strip()
174
+ if info:
175
+ extractor = (info.get("extractor_key") or "media").lower()
176
+ vid = info.get("id")
177
+ if vid and re.fullmatch(r"[A-Za-z0-9_-]+", str(vid)):
178
+ safe_id = re.sub(r"[^A-Za-z0-9_-]", "_", str(vid))
179
+ return f"{extractor}_{safe_id}"[:200]
180
+
181
+ digest = hashlib.sha256(source.encode()).hexdigest()[:32]
182
+ return f"media_{digest}"
183
+
184
+
185
+ def _job_payload(job: JobRecord, include_result: bool = True) -> dict:
186
+ payload = {
187
+ "job_id": job.id,
188
+ "reference": job.reference,
189
+ "kind": job.kind,
190
+ "status": job.status,
191
+ "progress": job.progress,
192
+ "created_at": job.created_at,
193
+ "finished_at": job.finished_at,
194
+ }
195
+ if job.error:
196
+ payload["error"] = job.error
197
+ payload["is_error"] = True
198
+ if job.status == JobStatus.FAILED:
199
+ payload["is_error"] = True
200
+ if include_result and job.status == JobStatus.DONE:
201
+ payload["result"] = job.result
202
+ return payload
203
+
204
+
205
+ async def _maybe_wait(job: JobRecord, wait_seconds: int) -> dict:
206
+ """Wait briefly for completion; otherwise return running status."""
207
+ task = job.task
208
+ if not task:
209
+ return _job_payload(job, include_result=False)
210
+
211
+ try:
212
+ await asyncio.wait_for(asyncio.shield(task), timeout=max(0, wait_seconds))
213
+ except asyncio.TimeoutError:
214
+ return _job_payload(job, include_result=False)
215
+ except asyncio.CancelledError:
216
+ job.status = JobStatus.FAILED
217
+ job.error = "task cancelled"
218
+ job.finished_at = time.time()
219
+ return _job_payload(job, include_result=False)
220
+
221
+ # If we reach here, task finished
222
+ return _job_payload(job, include_result=True)
223
+
224
+
225
+ async def _get_or_create_job(kind: str, reference: str, factory: Callable[[], JobRecord]) -> JobRecord:
226
+ async with JOB_LOCK:
227
+ existing_id = REFERENCE_INDEX.get((kind, reference))
228
+ if existing_id and existing_id in JOBS:
229
+ return JOBS[existing_id]
230
+
231
+ job = factory()
232
+ JOBS[job.id] = job
233
+ REFERENCE_INDEX[(kind, reference)] = job.id
234
+ return job
235
+
236
+
237
+ # ---------------------------------------------------------------------------------------------------------------------
238
+ # Helpers: media metadata & ffmpeg probes
239
+ # ---------------------------------------------------------------------------------------------------------------------
240
+
241
+
242
+ def _media_dir(reference: str) -> Path:
243
+ return MEDIA_CACHE / reference
244
+
245
+
246
+ def _metadata_path(reference: str) -> Path:
247
+ return _media_dir(reference) / "metadata.json"
248
+
249
+
250
+ def _slides_json_path(reference: str) -> Path:
251
+ return SLIDE_CACHE / f"{reference}.json"
252
+
253
+
254
+ def _analysis_json_path(reference: str) -> Path:
255
+ return ANALYSIS_CACHE / f"{reference}.json"
256
+
257
+
258
+ def _load_json(path: Path) -> dict | None:
259
+ if path.exists():
260
+ try:
261
+ return json.loads(path.read_text())
262
+ except Exception:
263
+ log.warning("Failed to parse JSON from %s", path)
264
+ return None
265
+
266
+
267
+ def _save_json(path: Path, payload: dict) -> None:
268
+ path.parent.mkdir(parents=True, exist_ok=True)
269
+ path.write_text(json.dumps(payload, indent=2))
270
+
271
+
272
+ def _probe_duration(video_path: Path) -> Optional[float]:
273
+ try:
274
+ probe = ffmpeg.probe(str(video_path))
275
+ fmt = probe.get("format", {})
276
+ duration_str = fmt.get("duration")
277
+ return float(duration_str) if duration_str else None
278
+ except Exception:
279
+ return None
280
+
281
+
282
+ def _extract_frame(video_path: Path, timestamp: float) -> Optional[bytes]:
283
+ if timestamp < 0:
284
+ return None
285
+ try:
286
+ out, err = (
287
+ ffmpeg.input(str(video_path), ss=timestamp)
288
+ .output("pipe:", vframes=1, format="image2", vcodec="png")
289
+ .run(capture_stdout=True, capture_stderr=True, overwrite_output=True)
290
+ )
291
+ except ffmpeg.Error as exc: # pragma: no cover - runtime dependency
292
+ log.debug("ffmpeg extract error for %s at %.2fs: %s", video_path, timestamp, exc.stderr.decode(errors="ignore")[:200])
293
+ return None
294
+ return out
295
+
296
+
297
+ # ---------------------------------------------------------------------------------------------------------------------
298
+ # yt-dlp based download
299
+ # ---------------------------------------------------------------------------------------------------------------------
300
+
301
+
302
+ def _run_ytdlp_download(source: str, reference: str, prefer_audio_only: bool) -> dict:
303
+ from yt_dlp import YoutubeDL # local import to keep module import light
304
+
305
+ target_dir = _media_dir(reference)
306
+ target_dir.mkdir(parents=True, exist_ok=True)
307
+
308
+ ytdlp_opts: dict[str, Any] = {
309
+ "outtmpl": str(target_dir / "%(id)s.%(ext)s"),
310
+ "quiet": True,
311
+ "noplaylist": True,
312
+ "ignoreerrors": False,
313
+ }
314
+
315
+ # Prefer combined AV for slides; fall back to audio only if requested or video unavailable
316
+ if prefer_audio_only:
317
+ ytdlp_opts["format"] = "bestaudio/best"
318
+ else:
319
+ ytdlp_opts["format"] = "bestvideo+bestaudio/best"
320
+
321
+ shared_opts = {
322
+ "skip_download": True,
323
+ "quiet": True,
324
+ "no_warnings": True,
325
+ "noprogress": True,
326
+ "noplaylist": True,
327
+ "logger": _YDLLogger(),
328
+ "extractor_args": {"youtube": {"player_client": ["default"]}},
329
+ }
330
+
331
+ with _silence_stdio():
332
+ with YoutubeDL(params=shared_opts) as ydl:
333
+ info = ydl.extract_info(source, download=False)
334
+
335
+ if not info:
336
+ raise RuntimeError("Unable to resolve media info via yt-dlp")
337
+
338
+ with _silence_stdio():
339
+ with YoutubeDL(params=ytdlp_opts) as ydl:
340
+ result = ydl.extract_info(source, download=True)
341
+ download_path = Path(ydl.prepare_filename(result))
342
+
343
+ if not download_path.exists():
344
+ raise RuntimeError("yt-dlp finished without producing a file")
345
+
346
+ metadata = {
347
+ "reference": reference,
348
+ "source": source,
349
+ "title": result.get("title"),
350
+ "duration": result.get("duration"),
351
+ "ext": result.get("ext"),
352
+ "download_path": str(download_path),
353
+ "thumbnail": result.get("thumbnail"),
354
+ "channel": result.get("channel"),
355
+ "channel_id": result.get("channel_id"),
356
+ "uploader": result.get("uploader"),
357
+ "id": result.get("id"),
358
+ "description": result.get("description"),
359
+ "webpage_url": result.get("webpage_url"),
360
+ "extractor_key": result.get("extractor_key"),
361
+ }
362
+
363
+ _save_json(_metadata_path(reference), metadata)
364
+ return metadata
365
+
366
+
367
+ def _ensure_audio_sidecar(video_path: Path, reference: str) -> Path:
368
+ """Create an AAC sidecar for the video (preferred by Gemini)."""
369
+
370
+ audio_path = video_path.with_suffix(".m4a")
371
+ if audio_path.exists():
372
+ return audio_path
373
+
374
+ audio_path.parent.mkdir(parents=True, exist_ok=True)
375
+ try:
376
+ (
377
+ ffmpeg.input(str(video_path))
378
+ .output(str(audio_path), acodec="aac", audio_bitrate="128k", ac=2, ar=16000, vn=None)
379
+ .overwrite_output()
380
+ .run(capture_stdout=True, capture_stderr=True)
381
+ )
382
+ except ffmpeg.Error as exc: # pragma: no cover - runtime dependency
383
+ msg = exc.stderr.decode("utf-8", "ignore") if exc.stderr else str(exc)
384
+ raise RuntimeError(f"ffmpeg failed to extract audio: {msg[:400]}")
385
+ return audio_path
386
+
387
+
388
+ # ---------------------------------------------------------------------------------------------------------------------
389
+ # Gemini helpers
390
+ # ---------------------------------------------------------------------------------------------------------------------
391
+
392
+
393
+ def _build_gemini_client():
394
+ try:
395
+ from google import genai
396
+ except Exception as exc: # pragma: no cover - runtime dependency
397
+ raise RuntimeError(f"google-genai not available: {exc}")
398
+
399
+ api_key = os.environ.get("GEMINI_API_KEY")
400
+ if not api_key:
401
+ raise RuntimeError("GEMINI_API_KEY environment variable is required")
402
+ return genai.Client(api_key=api_key)
403
+
404
+
405
+ def _wait_for_upload(client, upload):
406
+ from google.genai import types
407
+
408
+ while upload.state.name == "PROCESSING":
409
+ time.sleep(1)
410
+ upload = client.files.get(name=upload.name)
411
+ if upload.state.name != "ACTIVE":
412
+ raise RuntimeError(f"Upload failed: {upload.state.name}")
413
+ return upload
414
+
415
+
416
+ def _gemini_structured_slide_times(client, video_path: Path, reference: str) -> list[dict]:
417
+ from google.genai import types
418
+
419
+ upload = client.files.upload(
420
+ file=str(video_path),
421
+ config=types.UploadFileConfig(
422
+ display_name=video_path.name,
423
+ mime_type="video/mp4",
424
+ ),
425
+ )
426
+ upload = _wait_for_upload(client, upload)
427
+
428
+ schema = types.Schema(
429
+ type=types.Type.OBJECT,
430
+ properties={
431
+ "slides": types.Schema(
432
+ type=types.Type.ARRAY,
433
+ items=types.Schema(
434
+ type=types.Type.OBJECT,
435
+ properties={
436
+ "label": types.Schema(type=types.Type.STRING),
437
+ "from_seconds": types.Schema(type=types.Type.NUMBER),
438
+ "to_seconds": types.Schema(type=types.Type.NUMBER),
439
+ },
440
+ required=["from_seconds", "to_seconds"],
441
+ ),
442
+ )
443
+ },
444
+ required=["slides"],
445
+ )
446
+
447
+ file = types.Part.from_uri(file_uri=upload.uri, mime_type=upload.mime_type or "video/mp4")
448
+
449
+ response = client.models.generate_content(
450
+ model="gemini-flash-lite-latest",
451
+ contents=[file, "What are the timestamps of individual slides presented?"],
452
+ )
453
+
454
+ raw = getattr(response, "text", None) or getattr(response, "raw", None)
455
+ if not raw and hasattr(response, "output_text"):
456
+ raw = response.output_text # type: ignore[attr-defined]
457
+ if not raw:
458
+ # try candidates
459
+ candidates = getattr(response, "candidates", None)
460
+ if candidates:
461
+ raw = candidates[0].content.parts[0].text # type: ignore[index]
462
+ if not raw:
463
+ raise RuntimeError("Slide analysis model returned empty response")
464
+
465
+ _write_debug(reference, "slides_raw.json", raw or "")
466
+
467
+ try:
468
+ payload = json.loads(raw) if raw else {"slides": []}
469
+ except Exception:
470
+ log.warning("Gemini slide response not JSON: %s", raw[:200])
471
+ payload = {"slides": []}
472
+
473
+ slides = payload.get("slides") or []
474
+ sanitized: list[dict] = []
475
+ for slide in slides:
476
+ try:
477
+ start = float(slide.get("from_seconds"))
478
+ end = float(slide.get("to_seconds"))
479
+ except Exception:
480
+ continue
481
+ label = (slide.get("label") or "").strip()
482
+ sanitized.append({"from": start, "to": end, "label": label})
483
+ return sanitized
484
+
485
+
486
+ def _gemini_analyze_audio(client, audio_path: Path, slides: list[dict], priors: Priors) -> dict:
487
+ from google.genai import types
488
+
489
+ upload = client.files.upload(
490
+ file=str(audio_path),
491
+ config=types.UploadFileConfig(
492
+ display_name=audio_path.name,
493
+ mime_type="audio/mp4", # AAC in M4A container
494
+ ),
495
+ )
496
+ upload = _wait_for_upload(client, upload)
497
+
498
+ slide_files = []
499
+ for slide in slides:
500
+ uri = slide.get("file_uri")
501
+ if not uri:
502
+ continue
503
+ slide_files.append(types.Part.from_uri(file_uri=uri, mime_type="image/png"))
504
+
505
+ priors_text = priors.as_prompt_text()
506
+
507
+ # FIXME: improve prompt: conceptualize "surprises", "priors"
508
+ contents = [
509
+ types.Content(
510
+ role="user",
511
+ parts=[
512
+ types.Part.from_text(text=priors_text),
513
+ types.Part.from_uri(file_uri=upload.uri, mime_type=upload.mime_type or "audio/wav"),
514
+ *slide_files,
515
+ types.Part.from_text(
516
+ text=
517
+ "Provide concise analysis and key insights using the supplied context, expected takeaways,"
518
+ " and questions. Base the reasoning on the audio transcript and the slide snapshots; do not"
519
+ " assume access to the full video."
520
+ ),
521
+ ],
522
+ )
523
+ ]
524
+
525
+ response = client.models.generate_content(
526
+ model="gemini-flash-latest",
527
+ contents=contents,
528
+ )
529
+
530
+ text = getattr(response, "text", None)
531
+ if not text and hasattr(response, "output_text"):
532
+ text = response.output_text # type: ignore[attr-defined]
533
+ if not text:
534
+ candidates = getattr(response, "candidates", None)
535
+ if candidates:
536
+ text = candidates[0].content.parts[0].text # type: ignore[index]
537
+ if not text:
538
+ raise RuntimeError("Gemini returned no analysis")
539
+ return {
540
+ "analysis": text,
541
+ "audio_file_uri": upload.uri,
542
+ "slide_count": len(slide_files),
543
+ }
544
+
545
+
546
+ # ---------------------------------------------------------------------------------------------------------------------
547
+ # Slide extraction pipeline
548
+ # ---------------------------------------------------------------------------------------------------------------------
549
+
550
+
551
+ def _extract_slides_flow(metadata: dict) -> dict:
552
+ reference = metadata["reference"]
553
+ video_path = Path(metadata["download_path"])
554
+ duration = metadata.get("duration")
555
+
556
+ duration_seconds = float(duration) if duration else _probe_duration(video_path)
557
+
558
+ client = _build_gemini_client()
559
+ with _silence_stdio(): # silence any ffmpeg/yt-dlp noise during upload
560
+ slides_raw = _gemini_structured_slide_times(client, video_path, reference)
561
+
562
+ seen_hashes: set[str] = set()
563
+ slide_entries: list[dict] = []
564
+
565
+ for idx, slide in enumerate(slides_raw):
566
+ start = float(slide.get("from", 0))
567
+ end = float(slide.get("to", start))
568
+ if duration_seconds and start >= duration_seconds:
569
+ continue
570
+ midpoint = start + (abs(end - start) / 2.0)
571
+ if duration_seconds and midpoint > duration_seconds:
572
+ continue
573
+
574
+ frame_bytes = _extract_frame(video_path, midpoint)
575
+ if not frame_bytes:
576
+ continue
577
+
578
+ digest = hashlib.sha1(frame_bytes).hexdigest()
579
+ if digest in seen_hashes:
580
+ continue
581
+ seen_hashes.add(digest)
582
+
583
+ data_uri = "data:image/png;base64," + base64.b64encode(frame_bytes).decode("ascii")
584
+
585
+ image_path = SLIDE_CACHE / reference / f"slide_{idx:03d}.png"
586
+ image_path.parent.mkdir(parents=True, exist_ok=True)
587
+ image_path.write_bytes(frame_bytes)
588
+
589
+ slide_entries.append(
590
+ {
591
+ "index": len(slide_entries),
592
+ "from": start,
593
+ "to": end,
594
+ "mid": midpoint,
595
+ "label": slide.get("label") or "",
596
+ "image_data_uri": data_uri,
597
+ }
598
+ )
599
+
600
+ payload = {
601
+ "reference": reference,
602
+ "count": len(slide_entries),
603
+ "slides": slide_entries,
604
+ "source": metadata.get("source"),
605
+ }
606
+
607
+ _save_json(_slides_json_path(reference), payload)
608
+ _write_debug(reference, "slides_sanitized.json", payload)
609
+ return payload
610
+
611
+
612
+ # ---------------------------------------------------------------------------------------------------------------------
613
+ # Analysis pipeline
614
+ # ---------------------------------------------------------------------------------------------------------------------
615
+
616
+
617
+ def _media_context_from_metadata(metadata: dict) -> str:
618
+ parts = []
619
+ title = metadata.get("title")
620
+ description = metadata.get("description")
621
+ channel = metadata.get("channel") or metadata.get("uploader")
622
+ url = metadata.get("webpage_url") or metadata.get("source")
623
+ if title:
624
+ parts.append(f"Title: {title}")
625
+ if channel:
626
+ parts.append(f"Channel: {channel}")
627
+ if url:
628
+ parts.append(f"URL: {url}")
629
+ if description:
630
+ parts.append(f"Description:\n{description}")
631
+ return "\n".join(parts)
632
+
633
+
634
+ def _analysis_flow(metadata: dict, priors_obj: Priors | dict) -> dict:
635
+ reference = metadata["reference"]
636
+ video_path = Path(metadata["download_path"])
637
+ audio_path = _ensure_audio_sidecar(video_path, reference)
638
+
639
+ priors = priors_obj if isinstance(priors_obj, Priors) else Priors.from_obj(priors_obj)
640
+ priors.media_context = _media_context_from_metadata(metadata)
641
+
642
+ # Ensure slides exist; reuse cache if available
643
+ slides_payload = _load_json(_slides_json_path(reference))
644
+ if not slides_payload:
645
+ slides_payload = _extract_slides_flow(metadata)
646
+
647
+ slides = slides_payload.get("slides", [])
648
+
649
+ # Upload slide stills to Gemini for context
650
+ client = _build_gemini_client()
651
+ uploaded_slides = []
652
+ for slide in slides:
653
+ data_uri = slide.get("image_data_uri")
654
+ if not data_uri:
655
+ continue
656
+ _, b64 = data_uri.split(",", 1)
657
+ image_bytes = base64.b64decode(b64)
658
+ path = SLIDE_CACHE / reference / "_tmp_upload.png"
659
+ path.write_bytes(image_bytes)
660
+ upload = client.files.upload(
661
+ file=str(path),
662
+ config=None,
663
+ )
664
+ upload = _wait_for_upload(client, upload)
665
+ slide["file_uri"] = upload.uri
666
+ uploaded_slides.append(slide)
667
+
668
+ with _silence_stdio(): # suppress any upload chatter
669
+ analysis_result = _gemini_analyze_audio(client, audio_path, uploaded_slides, priors)
670
+
671
+ payload = {
672
+ "reference": reference,
673
+ "analysis": analysis_result.get("analysis"),
674
+ "slide_count": len(uploaded_slides),
675
+ "audio_uri": analysis_result.get("audio_file_uri"),
676
+ "source": metadata.get("source"),
677
+ "title": metadata.get("title"),
678
+ }
679
+
680
+ _save_json(_analysis_json_path(reference), payload)
681
+ _write_debug(reference, "analysis.json", payload)
682
+ return payload
683
+
684
+
685
+ # ---------------------------------------------------------------------------------------------------------------------
686
+ # Public MCP registration
687
+ # ---------------------------------------------------------------------------------------------------------------------
688
+
689
+
690
+ def register_media_tools(app: FastMCP) -> None:
691
+ """Register media-related MCP tools on the given app."""
692
+
693
+ @app.tool()
694
+ async def start_media_retrieval(
695
+ ctx: Context,
696
+ source: str,
697
+ prefer_audio_only: bool = False,
698
+ wait_seconds: int = 54,
699
+ ) -> dict:
700
+ """
701
+ Retrieve long-form media (conference session, lecture, webinar, podcast episode, or direct HTTP media URL).
702
+ Designed for MCP clients / LLM tools that have short time limits: will wait up to
703
+ `wait_seconds` for completion, otherwise returns in-progress status plus a `reference`
704
+ token that can be used with `get_media_retrieval_status`, `start_media_analysis`, and slide tools.
705
+
706
+ Note:
707
+ - Claude uses an internal timeout of 240 seconds. `wait_seconds` should be of the same order of magnitude as Claude's timeout, and at least 55 seconds if in doubt.
708
+
709
+ Parameters:
710
+ source: YouTube URL/ID, podcast/HTTP media URL, or any other locator supported by yt-dlp.
711
+ prefer_audio_only: If true, download audio-first formats to save bandwidth when visuals (e.g. slides) are not needed. Default is False, as visuals often allow richer analysis. Audio-only should only be used if asked for by the user specifically.
712
+ wait_seconds: Time to await before returning; helps fast-complete short downloads without extra calls.
713
+
714
+ Returns (happy path):
715
+ { reference, status="done", metadata={title, description, duration, download_path, ...}, cached? }
716
+
717
+ Returns (in progress):
718
+ { reference, status="running" | "pending", progress?, job_id }
719
+
720
+ Returns (error):
721
+ { is_error: true, status: "error"|"failed", detail, reference }
722
+ """
723
+
724
+ info_reference = None
725
+ try:
726
+ from yt_dlp import YoutubeDL
727
+
728
+ with YoutubeDL(params={"skip_download": True, "quiet": True, "noplaylist": True}) as ydl:
729
+ info = ydl.extract_info(source, download=False)
730
+ info_reference = _build_reference(info, source)
731
+ except Exception:
732
+ info_reference = _build_reference(None, source)
733
+
734
+ reference = info_reference
735
+
736
+ # If already cached, skip job creation
737
+ metadata = _load_json(_metadata_path(reference))
738
+ if metadata and Path(metadata.get("download_path", "")).exists():
739
+ return {
740
+ "reference": reference,
741
+ "status": JobStatus.DONE,
742
+ "cached": True,
743
+ "metadata": metadata,
744
+ }
745
+
746
+ def factory() -> JobRecord:
747
+ return JobRecord(id=secrets.token_urlsafe(16), kind="media_retrieval", reference=reference)
748
+
749
+ job = await _get_or_create_job("media_retrieval", reference, factory)
750
+
751
+ if job.status in (JobStatus.DONE, JobStatus.RUNNING):
752
+ return await _maybe_wait(job, wait_seconds)
753
+
754
+ async def runner():
755
+ job.status = JobStatus.RUNNING
756
+ try:
757
+ metadata_result = await asyncio.to_thread(
758
+ _run_ytdlp_download, source, reference, prefer_audio_only
759
+ )
760
+ job.result = metadata_result
761
+ job.status = JobStatus.DONE
762
+ except Exception as exc: # pragma: no cover - defensive
763
+ log.exception("media retrieval failed for %s", reference)
764
+ job.error = str(exc)
765
+ job.status = JobStatus.FAILED
766
+ finally:
767
+ job.finished_at = time.time()
768
+
769
+ job.task = asyncio.create_task(runner())
770
+ return await _maybe_wait(job, wait_seconds)
771
+
772
+ @app.tool()
773
+ async def get_media_retrieval_status(ctx: Context, reference: str, wait_seconds: int = 0) -> dict:
774
+ """Poll download status for a `reference` returned by start_media_retrieval.
775
+
776
+ Returns cached metadata immediately when available; otherwise echoes job status or {status: "not_found"}.
777
+ Errors include `is_error: true`.
778
+ """
779
+ metadata = _load_json(_metadata_path(reference))
780
+ if metadata and Path(metadata.get("download_path", "")).exists():
781
+ return {
782
+ "reference": reference,
783
+ "status": JobStatus.DONE,
784
+ "metadata": metadata,
785
+ }
786
+
787
+ job_id = REFERENCE_INDEX.get(("media_retrieval", reference))
788
+ if job_id and job_id in JOBS:
789
+ job = JOBS[job_id]
790
+ if wait_seconds > 0:
791
+ return await _maybe_wait(job, wait_seconds)
792
+ return _job_payload(job, include_result=True)
793
+
794
+ return {"status": "not_found", "reference": reference}
795
+
796
+ @app.tool()
797
+ async def start_slide_extraction(ctx: Context, reference: str, wait_seconds: int = 55) -> dict:
798
+ """Extract representative slide stills from a downloaded video.
799
+
800
+ Note: media analysis (start_media_analysis) includes slides extraction, so no need to call this function explicitely when aiming for full media analysis
801
+ """
802
+ metadata = _load_json(_metadata_path(reference))
803
+ if not metadata or not Path(metadata.get("download_path", "")).exists():
804
+ return _error("media not downloaded", reference)
805
+
806
+ existing = _load_json(_slides_json_path(reference))
807
+ if existing:
808
+ return {
809
+ "status": JobStatus.DONE,
810
+ "reference": reference,
811
+ "slides": existing,
812
+ "cached": True,
813
+ }
814
+
815
+ def factory() -> JobRecord:
816
+ return JobRecord(id=secrets.token_urlsafe(16), kind="slide_extraction", reference=reference)
817
+
818
+ job = await _get_or_create_job("slide_extraction", reference, factory)
819
+ if job.status in (JobStatus.DONE, JobStatus.RUNNING):
820
+ return await _maybe_wait(job, wait_seconds)
821
+
822
+ async def runner():
823
+ job.status = JobStatus.RUNNING
824
+ try:
825
+ slide_payload = await asyncio.to_thread(_extract_slides_flow, metadata)
826
+ job.result = slide_payload
827
+ job.status = JobStatus.DONE
828
+ except Exception as exc:
829
+ log.exception("slide extraction failed for %s", reference)
830
+ job.status = JobStatus.FAILED
831
+ job.error = str(exc)
832
+ finally:
833
+ job.finished_at = time.time()
834
+
835
+ job.task = asyncio.create_task(runner())
836
+ return await _maybe_wait(job, wait_seconds)
837
+
838
+ @app.tool()
839
+ async def get_extracted_slides(ctx: Context, reference: str, wait_seconds: int = 0) -> dict:
840
+ """Fetch extracted slides for a reference, or current slide-extraction job status."""
841
+ existing = _load_json(_slides_json_path(reference))
842
+ if existing:
843
+ return {
844
+ "status": JobStatus.DONE,
845
+ "reference": reference,
846
+ "slides": existing,
847
+ }
848
+
849
+ job_id = REFERENCE_INDEX.get(("slide_extraction", reference))
850
+ if job_id and job_id in JOBS:
851
+ job = JOBS[job_id]
852
+ if wait_seconds > 0:
853
+ return await _maybe_wait(job, wait_seconds)
854
+ return _job_payload(job, include_result=True)
855
+
856
+ return {"status": "not_found", "reference": reference}
857
+
858
+ @app.tool()
859
+ async def start_media_analysis(
860
+ ctx: Context,
861
+ reference: str,
862
+ priors: dict,
863
+ wait_seconds: int = 55,
864
+ ) -> dict:
865
+ """
866
+ Analyze the primary audio plus extracted slides, guided by rich "priors" (analysis hints).
867
+
868
+ Priors object schema (all strings, optional):
869
+ - context: User-supplied scene-setting (participants, venue, meeting goal, etc.). Used to establish topical background and spelling of names, abbreviations, etc. Optional field, should only be filled with data explicitely supplied by the user.
870
+ - expectations: What factuals, insights or takeaways are anticipated by the user. These serve as a baseline, and surprises from this will be surfaced. Optional field, should only be filled with data explicitely supplied by the user.
871
+ - prior_knowledge: What the user already knows (acronyms, previous meetings). This again serves as a basis for efficient information foraging efforts by the user. Optional field, should only be filled with data explicitely supplied by the user.
872
+ - questions: Specific questions from the user to answer. Optional field, should only be filled with data explicitely supplied by the user.
873
+
874
+ The analysis automatically builds on priors with media-derived context (title, description, channel, URL) and supplies
875
+ user context to the analysis pipeline as well.
876
+
877
+ Parameters:
878
+ - reference: the reference token obtained from `start_media_retrieval`
879
+ - priors: prior information and user-supplied background per the "Priors object schema" definition
880
+
881
+ Note:
882
+ - Claude uses an internal timeout of 240 seconds. `wait_seconds` should be in the same order of magnitude with Claude, and a minimum of 55 seconds if in doubt.
883
+
884
+ Returns:
885
+ - in-progress status if still running; use the 'get_media_analysis_result' to monitor for further progress and to retrieve the final result
886
+ - if process has already finished, the final analysis text is returned
887
+ - errors are flagged with `is_error: true`.
888
+ """
889
+ metadata = _load_json(_metadata_path(reference))
890
+ if not metadata or not Path(metadata.get("download_path", "")).exists():
891
+ return _error("media not downloaded", reference)
892
+
893
+ if not isinstance(priors, dict):
894
+ return _error("priors must be an object with string fields: context, expectations, prior_knowledge, questions", reference)
895
+
896
+ existing = _load_json(_analysis_json_path(reference))
897
+ if existing:
898
+ return {"status": JobStatus.DONE, "reference": reference, "analysis": existing, "cached": True}
899
+
900
+ def factory() -> JobRecord:
901
+ return JobRecord(id=secrets.token_urlsafe(16), kind="media_analysis", reference=reference)
902
+
903
+ job = await _get_or_create_job("media_analysis", reference, factory)
904
+ if job.status in (JobStatus.DONE, JobStatus.RUNNING):
905
+ return await _maybe_wait(job, wait_seconds)
906
+
907
+ async def runner():
908
+ job.status = JobStatus.RUNNING
909
+ try:
910
+ result = await asyncio.to_thread(_analysis_flow, metadata, priors)
911
+ job.result = result
912
+ job.status = JobStatus.DONE
913
+ except Exception as exc:
914
+ log.exception("media analysis failed for %s", reference)
915
+ job.status = JobStatus.FAILED
916
+ job.error = str(exc)
917
+ finally:
918
+ job.finished_at = time.time()
919
+
920
+ job.task = asyncio.create_task(runner())
921
+ return await _maybe_wait(job, wait_seconds)
922
+
923
+ @app.tool()
924
+ async def get_media_analysis_result(ctx: Context, reference: str, wait_seconds: int = 0) -> dict:
925
+ """Return completed analysis for a reference, or current job status, with `is_error` on failures."""
926
+ existing = _load_json(_analysis_json_path(reference))
927
+ if existing:
928
+ return {"status": JobStatus.DONE, "reference": reference, "analysis": existing}
929
+
930
+ job_id = REFERENCE_INDEX.get(("media_analysis", reference))
931
+ if job_id and job_id in JOBS:
932
+ job = JOBS[job_id]
933
+ if wait_seconds > 0:
934
+ return await _maybe_wait(job, wait_seconds)
935
+ return _job_payload(job, include_result=True)
936
+
937
+ return {"status": "not_found", "reference": reference}
938
+
939
+
940
+ __all__ = ["register_media_tools"]
mcp/src/aileen3_mcp/server.py CHANGED
@@ -1,10 +1,16 @@
1
  from __future__ import annotations
2
 
3
  import logging
 
4
  from dataclasses import asdict, dataclass
5
 
 
 
 
6
  from fastmcp import FastMCP
7
 
 
 
8
  log = logging.getLogger(__name__)
9
 
10
 
@@ -20,6 +26,27 @@ def make_app() -> FastMCP:
20
 
21
  @app.tool()
22
  def health() -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def _gemini_key_ok() -> tuple[bool, str]:
24
  key = bool(os.environ.get("GEMINI_API_KEY"))
25
  return (key, "GEMINI_API_KEY is set" if key else "GEMINI_API_KEY missing")
@@ -62,15 +89,20 @@ def make_app() -> FastMCP:
62
  capped_results = max(1, min(max_results, 50))
63
  opts = {
64
  "quiet": True,
 
 
65
  "skip_download": True,
66
  "extract_flat": "in_playlist",
 
 
67
  }
68
 
69
  search_spec = f"ytsearch{capped_results}:{query}"
70
  log.info("search_youtube query=%r max_results=%d", query, capped_results)
71
 
72
- with YoutubeDL(opts) as ydl:
73
- info = ydl.extract_info(search_spec, download=False)
 
74
 
75
  entries = info.get("entries", []) if info else []
76
  videos = []
@@ -93,6 +125,15 @@ def make_app() -> FastMCP:
93
 
94
  return {"videos": videos}
95
 
 
 
 
 
 
 
 
 
 
96
  return app
97
 
98
 
 
1
  from __future__ import annotations
2
 
3
  import logging
4
+ import os
5
  from dataclasses import asdict, dataclass
6
 
7
+ import shutil
8
+ import subprocess
9
+
10
  from fastmcp import FastMCP
11
 
12
+ from aileen3_mcp.media_tools import register_media_tools, _silence_stdio, _YDLLogger
13
+
14
  log = logging.getLogger(__name__)
15
 
16
 
 
26
 
27
  @app.tool()
28
  def health() -> dict:
29
+ """Return a basic health payload including ffmpeg and Gemini env availability."""
30
+
31
+ def _ffmpeg_ok() -> tuple[bool, str]:
32
+ binary = shutil.which("ffmpeg")
33
+ if not binary:
34
+ return False, "ffmpeg not found on PATH"
35
+ try:
36
+ completed = subprocess.run(
37
+ [binary, "-version"],
38
+ capture_output=True,
39
+ text=True,
40
+ timeout=5,
41
+ check=False,
42
+ )
43
+ except Exception as exc: # pragma: no cover - defensive
44
+ return False, f"ffmpeg exec failed: {exc}"
45
+ if completed.returncode != 0:
46
+ return False, completed.stderr.strip() or "ffmpeg returned error"
47
+ first = (completed.stdout or "").splitlines()[0] if completed.stdout else "ffmpeg present"
48
+ return True, first
49
+
50
  def _gemini_key_ok() -> tuple[bool, str]:
51
  key = bool(os.environ.get("GEMINI_API_KEY"))
52
  return (key, "GEMINI_API_KEY is set" if key else "GEMINI_API_KEY missing")
 
89
  capped_results = max(1, min(max_results, 50))
90
  opts = {
91
  "quiet": True,
92
+ "no_warnings": True,
93
+ "noprogress": True,
94
  "skip_download": True,
95
  "extract_flat": "in_playlist",
96
+ "logger": _YDLLogger(),
97
+ "extractor_args": {"youtube": {"player_client": ["default"]}},
98
  }
99
 
100
  search_spec = f"ytsearch{capped_results}:{query}"
101
  log.info("search_youtube query=%r max_results=%d", query, capped_results)
102
 
103
+ with _silence_stdio():
104
+ with YoutubeDL(opts) as ydl:
105
+ info = ydl.extract_info(search_spec, download=False)
106
 
107
  entries = info.get("entries", []) if info else []
108
  videos = []
 
125
 
126
  return {"videos": videos}
127
 
128
+ # Register media analysis tools:
129
+ # - start_media_retrieval
130
+ # - get_media_retrieval_status
131
+ # - start_slide_extraction
132
+ # - get_extracted_slides
133
+ # - start_media_analysis
134
+ # - get_media_analysis_result
135
+ register_media_tools(app)
136
+
137
  return app
138
 
139