ndurner committed on
Commit
8d7df12
·
1 Parent(s): ee16725

fix slide extraction

Browse files
demo/requirements.txt CHANGED
@@ -3,3 +3,4 @@ yt-dlp[default]>=2025.11.12
3
  fastmcp>=0.1.11
4
  google-genai>=0.8.0
5
  ffmpeg-python>=0.2.0
 
 
3
  fastmcp>=0.1.11
4
  google-genai>=0.8.0
5
  ffmpeg-python>=0.2.0
6
+ pillow>=10.4.0
mcp/pyproject.toml CHANGED
@@ -11,11 +11,12 @@ dependencies = [
11
  "fastmcp>=0.1.11",
12
  "yt-dlp[default]>=2025.11.12",
13
  "google-genai>=0.8.0",
14
- "ffmpeg-python>=0.2.0"
15
  ]
16
 
17
  [project.scripts]
18
  aileen3-mcp = "aileen3_mcp.server:main"
 
19
 
20
  [build-system]
21
  requires = ["setuptools>=64", "wheel"]
 
11
  "fastmcp>=0.1.11",
12
  "yt-dlp[default]>=2025.11.12",
13
  "google-genai>=0.8.0",
14
+ "ffmpeg-python>=0.2.0",
15
  ]
16
 
17
  [project.scripts]
18
  aileen3-mcp = "aileen3_mcp.server:main"
19
+ aileen3-slides = "aileen3_mcp.cli_slides:main"
20
 
21
  [build-system]
22
  requires = ["setuptools>=64", "wheel"]
mcp/src/aileen3_mcp/cli_slides.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Manual slide extraction helper.
2
+
3
+ Run slide extraction against a local video using the same pipeline the MCP
4
+ server uses. Useful for debugging model responses when no slides are returned.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ from .media_tools import _build_reference, _extract_slides_flow, _probe_duration, _slides_json_path
14
+
15
+
16
+ def main(argv: list[str] | None = None) -> int:
17
+ parser = argparse.ArgumentParser(description="Extract slides from a video using the Gemini pipeline.")
18
+ parser.add_argument("video", type=Path, help="Path to the video file (mp4 recommended).")
19
+ parser.add_argument(
20
+ "--reference",
21
+ help="Reference id to use for cache/output. Defaults to one derived from the filename.",
22
+ )
23
+ parser.add_argument(
24
+ "--duration",
25
+ type=float,
26
+ help="Override duration in seconds (optional). If omitted, ffprobe is used.",
27
+ )
28
+
29
+ args = parser.parse_args(argv)
30
+
31
+ video_path: Path = args.video.expanduser().resolve()
32
+ if not video_path.exists():
33
+ parser.error(f"Video not found: {video_path}")
34
+
35
+ reference = args.reference or _build_reference(None, str(video_path))
36
+ duration = args.duration or _probe_duration(video_path)
37
+
38
+ metadata = {
39
+ "reference": reference,
40
+ "download_path": str(video_path),
41
+ "duration": duration,
42
+ "source": str(video_path),
43
+ }
44
+
45
+ try:
46
+ result = _extract_slides_flow(metadata)
47
+ except Exception as exc: # pragma: no cover - CLI convenience
48
+ print(f"[error] slide extraction failed: {exc}", file=sys.stderr)
49
+ return 1
50
+
51
+ slides = result.get("slides", [])
52
+ print(f"Extracted {len(slides)} slides for reference '{reference}'.")
53
+ if slides:
54
+ print("First few slides:")
55
+ for slide in slides[:5]:
56
+ start = slide.get("from")
57
+ end = slide.get("to")
58
+ label = slide.get("label") or ""
59
+ print(f" index={slide.get('index')} from={start:.2f}s to={end:.2f}s label={label}")
60
+ slides_json = _slides_json_path(reference)
61
+ print(f"Slides JSON saved to {slides_json}")
62
+ return 0
63
+
64
+
65
+ if __name__ == "__main__": # pragma: no cover - CLI execution
66
+ raise SystemExit(main())
mcp/src/aileen3_mcp/media_tools.py CHANGED
@@ -18,6 +18,7 @@ import ffmpeg
18
  from fastmcp import Context, FastMCP
19
  from contextlib import redirect_stdout, redirect_stderr, contextmanager
20
  import io
 
21
 
22
  log = logging.getLogger(__name__)
23
 
@@ -182,6 +183,63 @@ def _build_reference(info: dict | None, source: str) -> str:
182
  return f"media_{digest}"
183
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  def _job_payload(job: JobRecord, include_result: bool = True) -> dict:
186
  payload = {
187
  "job_id": job.id,
@@ -416,6 +474,7 @@ def _wait_for_upload(client, upload):
416
  def _gemini_structured_slide_times(client, video_path: Path, reference: str) -> list[dict]:
417
  from google.genai import types
418
 
 
419
  upload = client.files.upload(
420
  file=str(video_path),
421
  config=types.UploadFileConfig(
@@ -424,45 +483,67 @@ def _gemini_structured_slide_times(client, video_path: Path, reference: str) ->
424
  ),
425
  )
426
  upload = _wait_for_upload(client, upload)
427
-
428
- schema = types.Schema(
429
- type=types.Type.OBJECT,
430
- properties={
431
- "slides": types.Schema(
432
- type=types.Type.ARRAY,
433
- items=types.Schema(
434
- type=types.Type.OBJECT,
435
- properties={
436
- "label": types.Schema(type=types.Type.STRING),
437
- "from_seconds": types.Schema(type=types.Type.NUMBER),
438
- "to_seconds": types.Schema(type=types.Type.NUMBER),
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  },
440
- required=["from_seconds", "to_seconds"],
441
- ),
442
- )
 
443
  },
444
- required=["slides"],
445
- )
 
446
 
447
  file = types.Part.from_uri(file_uri=upload.uri, mime_type=upload.mime_type or "video/mp4")
448
 
 
449
  response = client.models.generate_content(
450
  model="gemini-flash-lite-latest",
451
  contents=[file, "What are the timestamps of individual slides presented?"],
 
 
 
 
452
  )
 
453
 
454
  raw = getattr(response, "text", None) or getattr(response, "raw", None)
455
- if not raw and hasattr(response, "output_text"):
456
  raw = response.output_text # type: ignore[attr-defined]
457
  if not raw:
458
- # try candidates
459
  candidates = getattr(response, "candidates", None)
460
- if candidates:
461
  raw = candidates[0].content.parts[0].text # type: ignore[index]
462
  if not raw:
463
  raise RuntimeError("Slide analysis model returned empty response")
464
 
465
  _write_debug(reference, "slides_raw.json", raw or "")
 
466
 
467
  try:
468
  payload = json.loads(raw) if raw else {"slides": []}
@@ -473,10 +554,9 @@ def _gemini_structured_slide_times(client, video_path: Path, reference: str) ->
473
  slides = payload.get("slides") or []
474
  sanitized: list[dict] = []
475
  for slide in slides:
476
- try:
477
- start = float(slide.get("from_seconds"))
478
- end = float(slide.get("to_seconds"))
479
- except Exception:
480
  continue
481
  label = (slide.get("label") or "").strip()
482
  sanitized.append({"from": start, "to": end, "label": label})
@@ -559,7 +639,7 @@ def _extract_slides_flow(metadata: dict) -> dict:
559
  with _silence_stdio(): # silence any ffmpeg/yt-dlp noise during upload
560
  slides_raw = _gemini_structured_slide_times(client, video_path, reference)
561
 
562
- seen_hashes: set[str] = set()
563
  slide_entries: list[dict] = []
564
 
565
  for idx, slide in enumerate(slides_raw):
@@ -575,10 +655,12 @@ def _extract_slides_flow(metadata: dict) -> dict:
575
  if not frame_bytes:
576
  continue
577
 
578
- digest = hashlib.sha1(frame_bytes).hexdigest()
579
- if digest in seen_hashes:
 
 
580
  continue
581
- seen_hashes.add(digest)
582
 
583
  data_uri = "data:image/png;base64," + base64.b64encode(frame_bytes).decode("ascii")
584
 
 
18
  from fastmcp import Context, FastMCP
19
  from contextlib import redirect_stdout, redirect_stderr, contextmanager
20
  import io
21
+ from PIL import Image
22
 
23
  log = logging.getLogger(__name__)
24
 
 
183
  return f"media_{digest}"
184
 
185
 
186
+ def _parse_timestamp(value: Any) -> float | None:
187
+ """Accept mm:ss or hh:mm:ss strings (optionally with fractional seconds) and numbers."""
188
+
189
+ if value is None:
190
+ return None
191
+
192
+ # Allow numeric input for backward compatibility
193
+ if isinstance(value, (int, float)):
194
+ return float(value)
195
+
196
+ text = str(value).strip()
197
+ if not text:
198
+ return None
199
+
200
+ if text.isdigit():
201
+ return float(text)
202
+
203
+ parts = text.split(":")
204
+ try:
205
+ parts_f = [float(p) for p in parts]
206
+ except ValueError:
207
+ return None
208
+
209
+ if len(parts_f) == 2: # mm:ss
210
+ minutes, seconds = parts_f
211
+ return max(0.0, minutes * 60 + seconds)
212
+ if len(parts_f) == 3: # hh:mm:ss
213
+ hours, minutes, seconds = parts_f
214
+ return max(0.0, hours * 3600 + minutes * 60 + seconds)
215
+ return None
216
+
217
+
218
def _average_hash(frame_bytes: bytes, hash_size: int = 8) -> int | None:
    """Compute a lightweight perceptual hash (aHash) tolerant to minor artifacts.

    Downscales the frame to a tiny greyscale grid and sets one bit per pixel
    that is at or above the mean brightness.

    Returns:
        The hash as an int of hash_size*hash_size bits, or None when the
        image cannot be decoded or yields no pixels.
    """
    try:
        with Image.open(io.BytesIO(frame_bytes)) as img:
            grid = img.convert("L").resize((hash_size, hash_size), Image.LANCZOS)
            pixels = list(grid.getdata())
    except Exception:
        return None

    if not pixels:
        return None

    threshold = sum(pixels) / len(pixels)
    # Bit i corresponds to pixel i; set when the pixel is at least average.
    return sum(1 << idx for idx, lum in enumerate(pixels) if lum >= threshold)
237
+
238
+
239
+ def _hamming_distance(a: int, b: int) -> int:
240
+ return bin(a ^ b).count("1")
241
+
242
+
243
  def _job_payload(job: JobRecord, include_result: bool = True) -> dict:
244
  payload = {
245
  "job_id": job.id,
 
474
  def _gemini_structured_slide_times(client, video_path: Path, reference: str) -> list[dict]:
475
  from google.genai import types
476
 
477
+ log.debug("uploading %s to Gemini", video_path)
478
  upload = client.files.upload(
479
  file=str(video_path),
480
  config=types.UploadFileConfig(
 
483
  ),
484
  )
485
  upload = _wait_for_upload(client, upload)
486
+ log.debug("upload finished")
487
+
488
+ # JSON Schema as dict per structured outputs guide
489
+ schema = {
490
+ "type": "object",
491
+ "description": "List of slide timestamps within the video.",
492
+ "properties": {
493
+ "slides": {
494
+ "type": "array",
495
+ "description": "Collection of detected slides in chronological order.",
496
+ "items": {
497
+ "type": "object",
498
+ "properties": {
499
+ "label": {
500
+ "type": "string",
501
+ "description": "Short optional title inferred from the slide content.",
502
+ },
503
+ "from": {
504
+ "type": "string",
505
+ "description": "Start timestamp of the slide as mm:ss or hh:mm:ss (e.g., 01:12:30).",
506
+ },
507
+ "to": {
508
+ "type": "string",
509
+ "description": "End timestamp of the slide as mm:ss or hh:mm:ss (e.g., 01:13:05).",
510
+ },
511
  },
512
+ "required": ["from", "to"],
513
+ "additionalProperties": False,
514
+ },
515
+ }
516
  },
517
+ "required": ["slides"],
518
+ "additionalProperties": False,
519
+ }
520
 
521
  file = types.Part.from_uri(file_uri=upload.uri, mime_type=upload.mime_type or "video/mp4")
522
 
523
+ log.debug("running Gemini slide timestamping")
524
  response = client.models.generate_content(
525
  model="gemini-flash-lite-latest",
526
  contents=[file, "What are the timestamps of individual slides presented?"],
527
+ config={
528
+ "response_mime_type": "application/json",
529
+ "response_json_schema": schema,
530
+ },
531
  )
532
+ log.debug("slide timestamping done")
533
 
534
  raw = getattr(response, "text", None) or getattr(response, "raw", None)
535
+ if not raw and hasattr(response, "output_text"): # structured outputs still populate .text
536
  raw = response.output_text # type: ignore[attr-defined]
537
  if not raw:
538
+ # try candidates (defensive)
539
  candidates = getattr(response, "candidates", None)
540
+ if candidates and getattr(candidates[0].content.parts[0], "text", None):
541
  raw = candidates[0].content.parts[0].text # type: ignore[index]
542
  if not raw:
543
  raise RuntimeError("Slide analysis model returned empty response")
544
 
545
  _write_debug(reference, "slides_raw.json", raw or "")
546
+ log.debug("Gemini slide timestamp response: %s", raw)
547
 
548
  try:
549
  payload = json.loads(raw) if raw else {"slides": []}
 
554
  slides = payload.get("slides") or []
555
  sanitized: list[dict] = []
556
  for slide in slides:
557
+ start = _parse_timestamp(slide.get("from"))
558
+ end = _parse_timestamp(slide.get("to"))
559
+ if start is None or end is None:
 
560
  continue
561
  label = (slide.get("label") or "").strip()
562
  sanitized.append({"from": start, "to": end, "label": label})
 
639
  with _silence_stdio(): # silence any ffmpeg/yt-dlp noise during upload
640
  slides_raw = _gemini_structured_slide_times(client, video_path, reference)
641
 
642
+ seen_hashes: list[int] = []
643
  slide_entries: list[dict] = []
644
 
645
  for idx, slide in enumerate(slides_raw):
 
655
  if not frame_bytes:
656
  continue
657
 
658
+ digest = _average_hash(frame_bytes)
659
+ if digest is None:
660
+ continue
661
+ if any(_hamming_distance(digest, existing) <= 6 for existing in seen_hashes):
662
  continue
663
+ seen_hashes.append(digest)
664
 
665
  data_uri = "data:image/png;base64," + base64.b64encode(frame_bytes).decode("ascii")
666