agrim12345 committed on
Commit 2db58d0 · 1 Parent(s): b903d41

Deploy deployed-meet Gradio app

.dockerignore ADDED
@@ -0,0 +1,12 @@
+ .git
+ .gitignore
+ .venv
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ *.log
+ out*
+ tmp
+ runs
+
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ pipelines/models/*.pt filter=lfs diff=lfs merge=lfs -text
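After the weights are committed, it is worth confirming that they are actually stored through LFS rather than as raw git blobs. A quick check from the repo root (standard Git LFS subcommands; the `.pt` pattern is the rule added above):

```powershell
git lfs track        # lists tracked patterns; should include pipelines/models/*.pt
git lfs ls-files     # lists files actually stored via LFS once they are committed
```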
.gitignore ADDED
@@ -0,0 +1,7 @@
+ __pycache__/
+ *.pyc
+ .env
+ .venv/
+ out*/
+ tmp/
+
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.10-slim
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     PIP_NO_CACHE_DIR=1 \
+     PYTHONUNBUFFERED=1 \
+     PYTHONIOENCODING=utf-8 \
+     PIPELINE_WORKDIR=/data/deployed-meet-runs
+
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git \
+     ffmpeg \
+     curl \
+     libgl1 \
+     libglib2.0-0 \
+     libgomp1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+ COPY . /app
+
+ RUN pip install --upgrade pip setuptools wheel && \
+     pip install -r requirements.txt && \
+     pip install --no-build-isolation "git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1"
+
+ RUN mkdir -p /data/deployed-meet-runs
+
+ EXPOSE 7860
+
+ CMD ["python", "-m", "uvicorn", "api.index:app", "--host", "0.0.0.0", "--port", "7860"]
+
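Before pushing, the image can be exercised locally. A minimal build-and-run sketch; the image tag `deployed-meet` and the key values are placeholders:

```powershell
docker build -t deployed-meet .
docker run --rm -p 7860:7860 `
  -e GEMINI_API_KEY=your-gemini-key `
  -e DEEPGRAM_API_KEY=your-deepgram-key `
  deployed-meet
```

The container serves the legacy FastAPI app (`api/index.py`) via uvicorn on port 7860, matching the `EXPOSE` and `CMD` lines above.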
HF_SPACE_DEPLOY.md ADDED
@@ -0,0 +1,65 @@
+ # Deploy to Hugging Face Spaces (Gradio SDK)
+
+ This package is now Gradio-native (`app.py`) and does not require Docker on Spaces.
+ The `demo-code` variant is configured for demo-only Gemini:
+ - Gemini + YOLO run only on `demo` frames.
+ - `slides`/`code`/`none` frames are built from OCR + transcript output.
+
+ ## 1) Create the Space
+ 1. Go to `https://huggingface.co/new-space`.
+ 2. Choose:
+    - SDK: `Gradio`
+    - Space name: your choice (for example `deployed-meet`)
+    - Visibility: your choice
+ 3. Click **Create Space**.
+
+ ## 2) Clone the Space repo
+ ```powershell
+ git clone https://huggingface.co/spaces/<YOUR_USER>/<YOUR_SPACE_NAME> hf-space-deployed-meet
+ cd hf-space-deployed-meet
+ ```
+
+ ## 3) Copy this folder into the Space repo
+ Copy everything from the local `deployed-meet/` folder into the root of the cloned Space repo.
+
+ Required root files after the copy:
+ - `app.py`
+ - `run_manager.py`
+ - `requirements.txt`
+ - `README.md`
+ - `pipelines/...`
+
+ ## 4) Track model weights with Git LFS
+ ```powershell
+ git lfs install
+ git lfs track "pipelines/models/*.pt"
+ git add .gitattributes
+ ```
+
+ ## 5) Add secrets in Space Settings
+ In **Settings -> Variables and secrets**, add:
+ - `GEMINI_API_KEY`
+ - `DEEPGRAM_API_KEY`
+
+ Optional (see the sketch below for how these are read at runtime):
+ - `PIPELINE_WORKDIR=/data/deployed-meet-runs`
+ - `YOLO_DEVICE=cpu` (if your Space has no GPU)
+ - `OCR_GPU=false` (if your Space has no GPU)
+
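These settings reach the app as plain environment variables at runtime. A minimal sketch of reading them from Python, assuming the names above (the fallback values shown are illustrative, not necessarily what the pipeline uses internally):

```python
import os

# Required secrets: fail fast with a KeyError if they are missing.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
DEEPGRAM_API_KEY = os.environ["DEEPGRAM_API_KEY"]

# Optional knobs with CPU-safe fallbacks (defaults here are assumptions).
PIPELINE_WORKDIR = os.getenv("PIPELINE_WORKDIR", "/data/deployed-meet-runs")
YOLO_DEVICE = os.getenv("YOLO_DEVICE", "cpu")
OCR_GPU = os.getenv("OCR_GPU", "false").lower() == "true"
```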
+ ## 6) Commit and push
+ ```powershell
+ git add .
+ git commit -m "Deploy deployed-meet Gradio app"
+ git push
+ ```
+
+ Wait for the build to complete.
+
+ ## 7) Open the app and run
+ - App URL: `https://<YOUR_USER>-<YOUR_SPACE_NAME>.hf.space`
+ - Start from the **Start Run** tab, then monitor from the **Track Run** tab.
+
+ ## ZeroGPU note
+ - ZeroGPU only works with Gradio Spaces, which this repo now uses.
+ - This pipeline is long-running and model-heavy, so ZeroGPU sessions may be unstable for long videos.
+ - For reliable long jobs, upgraded CPU hardware or a dedicated GPU Space is recommended.
README.md CHANGED
@@ -1,12 +1,50 @@
  ---
- title: Deployed Meet
- emoji: 🚀
- colorFrom: red
- colorTo: yellow
+ title: deployed-meet
  sdk: gradio
- sdk_version: 6.5.1
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # deployed-meet
+
+ Gradio-based deployment package for the meeting pipeline.
+
+ ## Pipeline variants
+ - `full`:
+   - Gemini is called for all keyframe types (`slides`, `code`, `demo`, `none` as applicable).
+ - `demo-code`:
+   - Gemini is called only for `demo` keyframes.
+   - `slides`/`code`/`none` are built from OCR + transcript.
+   - `smart_keyframes_and_classify.py` runs with `--no-yolo-for-non-demo` in this variant.
+
+ ## Run locally (Gradio)
+ ```powershell
+ cd deployed-meet
+ C:/meet-agent/.venv/Scripts/activate
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ Open: `http://127.0.0.1:7860`
+
+ ## How to use the UI
+ 1. Go to **Start Run**.
+ 2. Select a variant (`full` or `demo-code`).
+ 3. Choose an input mode (`Upload File` or `Video URL`).
+ 4. Click **Start Pipeline** and copy the generated `run_id`.
+ 5. Go to **Track Run**, paste the `run_id`, then use:
+    - **Refresh Status + Logs**
+    - **Watch Live**
+    - **Fetch Final Output**
+    - **Fetch Condensed Output**
+
+ ## Required environment variables
+ Set these before starting:
+ - `GEMINI_API_KEY`
+ - `DEEPGRAM_API_KEY`
+
+ Optional:
+ - `PIPELINE_WORKDIR` (defaults to a temp directory)
+
+ ## Legacy FastAPI
+ The original FastAPI code is still in `api/index.py`, but Hugging Face Gradio Spaces will run `app.py`.
api/index.py ADDED
@@ -0,0 +1,672 @@
+ from __future__ import annotations
+
+ import json
+ import os
+ import re
+ import subprocess
+ import sys
+ import tempfile
+ import threading
+ import time
+ import uuid
+ from html import unescape
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+ from urllib.parse import parse_qs, urljoin, urlparse
+
+ import httpx
+ from fastapi import FastAPI, HTTPException
+ from fastapi.responses import JSONResponse, PlainTextResponse
+ from pydantic import BaseModel, Field, HttpUrl
+
+
+ BASE_DIR = Path(__file__).resolve().parents[1]
+ PIPELINES_DIR = BASE_DIR / "pipelines"
+ DEFAULT_WORKDIR = Path(os.getenv("PIPELINE_WORKDIR", tempfile.gettempdir())) / "deployed-meet-runs"
+ DEFAULT_WORKDIR.mkdir(parents=True, exist_ok=True)
+ RUNS_DIR = DEFAULT_WORKDIR / "runs"
+ RUNS_DIR.mkdir(parents=True, exist_ok=True)
+
+
+ class PipelineRequest(BaseModel):
+     video_path: Optional[str] = Field(default=None, description="Absolute or server-local path to input video.")
+     video_url: Optional[HttpUrl] = Field(default=None, description="Optional URL to download input video from.")
+     out_dir: Optional[str] = Field(default=None, description="Optional output directory. Defaults to /tmp run folder.")
+
+     deepgram_model: str = "nova-3"
+     deepgram_language: Optional[str] = None
+     deepgram_request_timeout_sec: float = 1200.0
+     deepgram_connect_timeout_sec: float = 30.0
+     deepgram_retries: int = 3
+     deepgram_retry_backoff_sec: float = 2.0
+     force_deepgram: bool = False
+
+     force_keyframes: bool = False
+     pre_roll_sec: float = 3.0
+     gemini_model: str = "gemini-2.5-flash"
+     similarity_threshold: float = 0.82
+     temperature: float = 0.2
+     python_bin: Optional[str] = Field(
+         default=None,
+         description="Optional Python executable path for running pipeline subprocesses.",
+     )
+     log_heartbeat_sec: float = Field(
+         default=10.0,
+         description="Seconds between heartbeat progress lines written to run logs.",
+     )
+
+
+ app = FastAPI(title="deployed-meet", version="1.0.0")
+
+
+ def _tail(text: str, max_lines: int = 220) -> str:
+     lines = (text or "").splitlines()
+     if len(lines) <= max_lines:
+         return "\n".join(lines)
+     return "\n".join(lines[-max_lines:])
+
+
+ def _run_dir(run_id: str) -> Path:
+     return RUNS_DIR / run_id
+
+
+ def _meta_path(run_id: str) -> Path:
+     return _run_dir(run_id) / "run_meta.json"
+
+
+ def _logs_path(run_id: str) -> Path:
+     return _run_dir(run_id) / "pipeline.log"
+
+
+ def _write_json(path: Path, data: Dict[str, Any]) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     tmp = path.with_suffix(path.suffix + ".tmp")
+     tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+     tmp.replace(path)
+
+
+ def _read_json(path: Path) -> Dict[str, Any]:
+     return json.loads(path.read_text(encoding="utf-8"))
+
+
+ def _get_meta_or_404(run_id: str) -> Dict[str, Any]:
+     p = _meta_path(run_id)
+     if not p.exists():
+         raise HTTPException(status_code=404, detail=f"Unknown run_id: {run_id}")
+     try:
+         return _read_json(p)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Failed to read run metadata: {type(e).__name__}: {e}") from e
+
+
+ def _resolve_video_input(req: PipelineRequest, run_id: str, run_dir: Path) -> Path:
+     if req.video_path:
+         p = Path(req.video_path).expanduser().resolve()
+         if not p.exists():
+             raise HTTPException(status_code=400, detail=f"video_path does not exist: {p}")
+         return p
+
+     if req.video_url:
+         suffix = Path(str(req.video_url)).suffix or ".mp4"
+         local = run_dir / f"input_{run_id}{suffix}"
+         try:
+             url = str(req.video_url)
+             if _extract_gdrive_file_id(url):
+                 _download_google_drive(url, local)
+             else:
+                 with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as r:
+                     r.raise_for_status()
+                     with open(local, "wb") as f:
+                         for chunk in r.iter_bytes():
+                             f.write(chunk)
+         except HTTPException:
+             raise
+         except Exception as e:
+             raise HTTPException(status_code=400, detail=f"Failed to download video_url: {type(e).__name__}: {e}") from e
+         return local
+
+     raise HTTPException(status_code=400, detail="Provide one of: video_path or video_url.")
+
+
+ def _extract_gdrive_file_id(url: str) -> Optional[str]:
+     parsed = urlparse(url)
+     host = (parsed.netloc or "").lower()
+     if "drive.google.com" not in host:
+         return None
+
+     m = re.search(r"/file/d/([a-zA-Z0-9_-]+)", parsed.path or "")
+     if m:
+         return m.group(1)
+
+     qs = parse_qs(parsed.query or "")
+     if "id" in qs and qs["id"]:
+         return qs["id"][0]
+
+     return None
+
+
+ def _download_google_drive(url: str, out_path: Path) -> None:
+     file_id = _extract_gdrive_file_id(url)
+     if not file_id:
+         raise HTTPException(status_code=400, detail="Could not parse Google Drive file id from video_url.")
+
+     direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+
+     def _is_html_response(resp: httpx.Response) -> bool:
+         ctype = (resp.headers.get("content-type") or "").lower()
+         if "html" in ctype or "text/plain" in ctype:
+             return True
+         head = (resp.content[:256] or b"").lower()
+         return b"<html" in head or b"<!doctype html" in head
+
+     def _write_if_file(resp: httpx.Response) -> bool:
+         if _is_html_response(resp):
+             return False
+         if not resp.content or len(resp.content) < 1024:
+             return False
+         out_path.write_bytes(resp.content)
+         return True
+
+     try:
+         with httpx.Client(timeout=120.0, follow_redirects=True) as client:
+             # Try a couple of direct download endpoints first.
+             candidates = [
+                 direct_url,
+                 f"https://drive.usercontent.google.com/download?id={file_id}&export=download&confirm=t",
+             ]
+
+             for c in candidates:
+                 rr = client.get(c)
+                 rr.raise_for_status()
+                 if _write_if_file(rr):
+                     return
+
+             # Parse Drive HTML interstitial page and submit download form if present.
+             page = client.get(f"https://drive.google.com/file/d/{file_id}/view")
+             page.raise_for_status()
+             html = page.text or ""
+
+             # Pattern A: explicit download form.
+             form_action_match = re.search(r'id="download-form"[^>]*action="([^"]+)"', html)
+             if form_action_match:
+                 action = unescape(form_action_match.group(1))
+                 action_url = urljoin("https://drive.google.com", action)
+                 params = {k: v for k, v in re.findall(r'<input[^>]+name="([^"]+)"[^>]+value="([^"]*)"', html)}
+                 form_resp = client.get(action_url, params=params)
+                 form_resp.raise_for_status()
+                 if _write_if_file(form_resp):
+                     return
+
+             # Pattern B: direct download link in page HTML.
+             link_match = re.search(r'href="(/uc\?export=download[^"]+)"', html)
+             if link_match:
+                 href = unescape(link_match.group(1)).replace("&amp;", "&")
+                 link_url = urljoin("https://drive.google.com", href)
+                 link_resp = client.get(link_url)
+                 link_resp.raise_for_status()
+                 if _write_if_file(link_resp):
+                     return
+
+             # Pattern C: download_warning cookie + confirm token flow.
+             cookie_confirm = None
+             for k, v in page.cookies.items():
+                 if str(k).startswith("download_warning"):
+                     cookie_confirm = v
+                     break
+             if cookie_confirm:
+                 confirm_url = f"https://drive.google.com/uc?export=download&confirm={cookie_confirm}&id={file_id}"
+                 confirm_resp = client.get(confirm_url)
+                 confirm_resp.raise_for_status()
+                 if _write_if_file(confirm_resp):
+                     return
+
+             msg = "Google Drive link did not provide a downloadable file."
+             low = html.lower()
+             if "you need access" in low or "request access" in low:
+                 msg += " File is not publicly accessible."
+             elif "quota exceeded" in low or "too many users have viewed or downloaded" in low:
+                 msg += " File appears to be quota-limited by Google Drive."
+             else:
+                 msg += " Use a publicly accessible direct file link or local video_path."
+             raise HTTPException(status_code=400, detail=msg)
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=400, detail=f"Failed to download Google Drive file: {type(e).__name__}: {e}") from e
+
+
+ def _validate_video_file(path: Path) -> None:
+     if not path.exists() or not path.is_file():
+         raise HTTPException(status_code=400, detail=f"Input video file not found: {path}")
+
+     size = path.stat().st_size
+     if size < 1024:
+         raise HTTPException(status_code=400, detail=f"Input file is too small to be valid media: {path} ({size} bytes)")
+
+     # Common case for bad video_url: downloaded HTML/JSON page saved as .mp4.
+     try:
+         # Read only the first 4 KiB; reading the whole file would pull large videos into memory.
+         with open(path, "rb") as fh:
+             head = fh.read(4096).lower()
+         if b"<html" in head or b"<!doctype html" in head or b"{\"error\"" in head:
+             raise HTTPException(
+                 status_code=400,
+                 detail=(
+                     "Downloaded input is not a media file (looks like HTML/JSON response). "
+                     "Use a direct video file URL or provide video_path."
+                 ),
+             )
+     except HTTPException:
+         raise
+     except Exception:
+         pass
+
+     # Lightweight decode check.
+     try:
+         import cv2  # local import to avoid import cost at startup
+
+         cap = cv2.VideoCapture(str(path))
+         ok = cap.isOpened()
+         frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
+         cap.release()
+         if (not ok) or frame_count <= 0:
+             raise HTTPException(
+                 status_code=400,
+                 detail=(
+                     "Input file is not a decodable video for this runtime. "
+                     "Provide a valid MP4 (H.264/AAC recommended) or use a direct media URL."
+                 ),
+             )
+     except HTTPException:
+         raise
+     except Exception:
+         # If cv2 probing fails unexpectedly, let the pipeline attempt processing rather than hard-fail.
+         pass
+
+
+ def _resolve_python_executable(req: PipelineRequest) -> str:
+     if req.python_bin:
+         p = Path(req.python_bin).expanduser()
+         if not p.exists():
+             raise HTTPException(status_code=400, detail=f"python_bin does not exist: {p}")
+         return str(p.resolve())
+
+     # Prefer project virtualenv if available.
+     candidates = [
+         BASE_DIR.parent / ".venv" / "Scripts" / "python.exe",  # Windows, repo root venv
+         BASE_DIR / ".venv" / "Scripts" / "python.exe",  # Windows, deployed-meet local venv
+         BASE_DIR.parent / ".venv" / "bin" / "python",  # Unix, repo root venv
+         BASE_DIR / ".venv" / "bin" / "python",  # Unix, deployed-meet local venv
+     ]
+     for c in candidates:
+         if c.exists():
+             return str(c.resolve())
+
+     # Fallback to currently running interpreter.
+     return sys.executable or os.getenv("PYTHON_BIN") or "python"
+
+
+ def _resolve_out_dir(req: PipelineRequest, run_id: str) -> Path:
+     if req.out_dir:
+         p = Path(req.out_dir)
+         if not p.is_absolute():
+             p = DEFAULT_WORKDIR / p
+     else:
+         p = DEFAULT_WORKDIR / f"run_{run_id}"
+     p.mkdir(parents=True, exist_ok=True)
+     return p.resolve()
+
+
+ def _build_common_args(req: PipelineRequest, video_path: Path, out_dir: Path) -> list[str]:
+     args = [
+         "--video",
+         str(video_path),
+         "--out",
+         str(out_dir),
+         "--deepgram-model",
+         req.deepgram_model,
+         "--deepgram-request-timeout-sec",
+         str(req.deepgram_request_timeout_sec),
+         "--deepgram-connect-timeout-sec",
+         str(req.deepgram_connect_timeout_sec),
+         "--deepgram-retries",
+         str(req.deepgram_retries),
+         "--deepgram-retry-backoff-sec",
+         str(req.deepgram_retry_backoff_sec),
+         "--pre-roll-sec",
+         str(req.pre_roll_sec),
+         "--gemini-model",
+         req.gemini_model,
+         "--similarity-threshold",
+         str(req.similarity_threshold),
+         "--temperature",
+         str(req.temperature),
+     ]
+     if req.deepgram_language:
+         args.extend(["--deepgram-language", req.deepgram_language])
+     if req.force_deepgram:
+         args.append("--force-deepgram")
+     if req.force_keyframes:
+         args.append("--force-keyframes")
+     return args
+
+
+ def _build_output_files(out_dir: Path, variant: str) -> Dict[str, str]:
+     return {
+         "utterances": str(out_dir / "utterances.json"),
+         "keyframes_parsed": str(out_dir / "keyframes_parsed.json"),
+         "keyframes_with_utterances": str(out_dir / "keyframes_with_utterances.json"),
+         "final_output": str(
+             out_dir / ("final_output.json" if variant == "full" else "final_output_demo_code.json")
+         ),
+         "final_output_condensed": str(
+             out_dir / ("final_output_condensed.json" if variant == "full" else "final_output_demo_code_condensed.json")
+         ),
+     }
+
+
+ def _artifact_state(output_files: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
+     state: Dict[str, Dict[str, Any]] = {}
+     for key, p in output_files.items():
+         path = Path(p)
+         if path.exists():
+             try:
+                 st = path.stat()
+                 state[key] = {
+                     "size_bytes": int(st.st_size),
+                     "mtime": float(st.st_mtime),
+                 }
+             except Exception:
+                 state[key] = {"size_bytes": -1, "mtime": -1.0}
+     return state
+
+
+ def _format_artifact_compact(state: Dict[str, Dict[str, Any]]) -> str:
+     if not state:
+         return "none"
+     parts = []
+     for k in sorted(state.keys()):
+         sz = float(state[k].get("size_bytes", 0))
+         parts.append(f"{k}:{sz/1024.0:.1f}KB")
+     return ", ".join(parts)
+
+
+ def _watch_run(
+     run_id: str,
+     proc: subprocess.Popen,
+     started_at: float,
+     log_fh,
+     heartbeat_sec: float,
+ ) -> None:
+     heartbeat_sec = max(2.0, float(heartbeat_sec))
+     last_hb = 0.0
+     last_artifact_change = started_at
+     last_state: Dict[str, Dict[str, Any]] = {}
+
+     # Emit periodic progress so logs are not "stuck" during long calls.
+     while True:
+         now = time.time()
+         rc = proc.poll()
+
+         if (now - last_hb) >= heartbeat_sec:
+             try:
+                 meta_file = _meta_path(run_id)
+                 meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
+                 out_files = meta.get("output_files", {}) or {}
+                 cur_state = _artifact_state(out_files)
+                 changed = cur_state != last_state
+                 if changed:
+                     last_artifact_change = now
+                 unchanged_for = now - last_artifact_change
+                 elapsed = now - started_at
+
+                 log_fh.write(
+                     "[runner] heartbeat "
+                     f"elapsed={elapsed:.1f}s pid={proc.pid} "
+                     f"artifacts={len(cur_state)}/{len(out_files)} "
+                     f"changed={'yes' if changed else 'no'} "
+                     f"unchanged_for={unchanged_for:.1f}s "
+                     f"[{_format_artifact_compact(cur_state)}]\n"
+                 )
+                 log_fh.flush()
+
+                 meta["last_heartbeat_epoch"] = now
+                 meta["last_heartbeat_elapsed_sec"] = round(elapsed, 3)
+                 meta["artifacts_ready_count"] = len(cur_state)
+                 meta["artifacts_total_count"] = len(out_files)
+                 meta["artifacts_unchanged_for_sec"] = round(unchanged_for, 3)
+                 _write_json(meta_file, meta)
+                 last_state = cur_state
+             except Exception as e:
+                 try:
+                     log_fh.write(f"[runner] heartbeat_error: {type(e).__name__}: {e}\n")
+                     log_fh.flush()
+                 except Exception:
+                     pass
+             last_hb = now
+
+         if rc is not None:
+             return_code = int(rc)
+             break
+
+         time.sleep(1.0)
+
+     finished_at = time.time()
+     try:
+         meta_file = _meta_path(run_id)
+         meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
+         meta["status"] = "succeeded" if return_code == 0 else "failed"
+         meta["exit_code"] = int(return_code)
+         meta["finished_at_epoch"] = finished_at
+         meta["duration_sec"] = round(finished_at - started_at, 3)
+         _write_json(meta_file, meta)
+     except Exception as e:
+         try:
+             log_fh.write(f"\n[runner] failed to update metadata: {type(e).__name__}: {e}\n")
+             log_fh.flush()
+         except Exception:
+             pass
+
+     try:
+         log_fh.write(f"\n[runner] process finished with exit_code={return_code}\n")
+         log_fh.flush()
+     except Exception:
+         pass
+     finally:
+         try:
+             log_fh.close()
+         except Exception:
+             pass
+
+
+ def _start_pipeline(pipeline_script: Path, req: PipelineRequest, variant: str) -> Dict[str, Any]:
+     if not pipeline_script.exists():
+         raise HTTPException(status_code=500, detail=f"Missing pipeline script: {pipeline_script}")
+
+     run_id = uuid.uuid4().hex[:12]
+     run_dir = _run_dir(run_id)
+     run_dir.mkdir(parents=True, exist_ok=True)
+
+     video_path = _resolve_video_input(req, run_id, run_dir)
+     _validate_video_file(video_path)
+     out_dir = _resolve_out_dir(req, run_id)
+     python_exe = _resolve_python_executable(req)
+
+     cmd = [
+         python_exe,
+         "-u",
+         str(pipeline_script),
+         "--python",
+         python_exe,
+         *_build_common_args(req, video_path, out_dir),
+     ]
+
+     started = time.time()
+     logs_path = _logs_path(run_id)
+     log_fh = open(logs_path, "a", encoding="utf-8", buffering=1)
+     log_fh.write(
+         f"[runner] run_id={run_id} variant={variant} started_at_epoch={started}\n"
+         f"[runner] command={' '.join(cmd)}\n"
+         f"[runner] cwd={PIPELINES_DIR}\n\n"
+         f"[runner] heartbeat_interval_sec={req.log_heartbeat_sec}\n"
+         f"[runner] python_unbuffered=1\n\n"
+     )
+     log_fh.flush()
+
+     child_env = os.environ.copy()
+     child_env["PYTHONUNBUFFERED"] = "1"
+     child_env.setdefault("PYTHONIOENCODING", "utf-8")
+
+     proc = subprocess.Popen(
+         cmd,
+         cwd=str(PIPELINES_DIR),
+         stdout=log_fh,
+         stderr=subprocess.STDOUT,
+         text=True,
+         env=child_env,
+     )
+
+     meta = {
+         "variant": variant,
+         "run_id": run_id,
+         "python_executable": python_exe,
+         "command": cmd,
+         "status": "running",
+         "exit_code": None,
+         "pid": proc.pid,
+         "started_at_epoch": started,
+         "finished_at_epoch": None,
+         "duration_sec": None,
+         "out_dir": str(out_dir),
+         "logs_path": str(logs_path),
+         "heartbeat_interval_sec": float(req.log_heartbeat_sec),
+         "output_files": _build_output_files(out_dir, variant),
+     }
+     _write_json(_meta_path(run_id), meta)
+
+     watcher = threading.Thread(
+         target=_watch_run,
+         args=(run_id, proc, started, log_fh, float(req.log_heartbeat_sec)),
+         daemon=True,
+     )
+     watcher.start()
+
+     return {
+         "run_id": run_id,
+         "variant": variant,
+         "status": "running",
+         "python_executable": python_exe,
+         "status_path": f"/runs/{run_id}",
+         "logs_path": f"/runs/{run_id}/logs",
+         "final_output_path": f"/runs/{run_id}/final-output",
+         "final_output_condensed_path": f"/runs/{run_id}/final-output/condensed",
+         "out_dir": str(out_dir),
+     }
+
+
+ @app.get("/health")
+ def health() -> Dict[str, str]:
+     return {"status": "ok"}
+
+
+ @app.get("/")
+ def root() -> Dict[str, Any]:
+     return {
+         "service": "deployed-meet",
+         "status": "ok",
+         "docs": "/docs",
+         "routes": [
+             "/pipeline/full",
+             "/pipeline/demo-code",
+             "/runs/{run_id}",
+             "/runs/{run_id}/logs",
+             "/runs/{run_id}/final-output",
+             "/runs/{run_id}/final-output/condensed",
+         ],
+     }
+
+
+ @app.post("/pipeline/full")
+ def pipeline_full(req: PipelineRequest) -> Dict[str, Any]:
+     return _start_pipeline(PIPELINES_DIR / "run_pipeline_all.py", req, variant="full")
+
+
+ @app.post("/pipeline/demo-code")
+ def pipeline_demo_code(req: PipelineRequest) -> Dict[str, Any]:
+     return _start_pipeline(PIPELINES_DIR / "run_pipeline_demo_code.py", req, variant="demo_code")
+
+
+ @app.get("/runs/{run_id}")
+ def run_status(run_id: str) -> Dict[str, Any]:
+     return _get_meta_or_404(run_id)
+
+
+ @app.get("/runs/{run_id}/logs")
+ def run_logs(run_id: str, tail_lines: int = 300) -> PlainTextResponse:
+     meta = _get_meta_or_404(run_id)
+     p = Path(meta.get("logs_path", ""))
+     if not p.exists():
+         return PlainTextResponse("")
+     txt = p.read_text(encoding="utf-8", errors="replace")
+     limit = max(1, min(int(tail_lines), 5000))
+     return PlainTextResponse(_tail(txt, max_lines=limit))
+
+
+ @app.get("/runs/{run_id}/final-output")
+ def run_final_output(run_id: str) -> Any:
+     meta = _get_meta_or_404(run_id)
+     status = meta.get("status")
+     out_file = Path(meta["output_files"]["final_output"])
+
+     if status == "running":
+         return JSONResponse(
+             status_code=202,
+             content={
+                 "run_id": run_id,
+                 "status": status,
+                 "message": "Pipeline is still running. Check /runs/{run_id}/logs for live progress.",
+                 "logs_path": f"/runs/{run_id}/logs",
+             },
+         )
+     if status == "failed":
+         raise HTTPException(
+             status_code=409,
+             detail={
+                 "run_id": run_id,
+                 "status": status,
+                 "message": "Pipeline failed. Check logs for details.",
+                 "logs_path": f"/runs/{run_id}/logs",
+             },
+         )
+     if not out_file.exists():
+         raise HTTPException(status_code=404, detail=f"Final output not found: {out_file}")
+     return _read_json(out_file)
+
+
+ @app.get("/runs/{run_id}/final-output/condensed")
+ def run_final_output_condensed(run_id: str) -> Any:
+     meta = _get_meta_or_404(run_id)
+     status = meta.get("status")
+     out_file = Path(meta["output_files"]["final_output_condensed"])
+
+     if status == "running":
+         return JSONResponse(
+             status_code=202,
+             content={
+                 "run_id": run_id,
+                 "status": status,
+                 "message": "Pipeline is still running. Check /runs/{run_id}/logs for live progress.",
+                 "logs_path": f"/runs/{run_id}/logs",
+             },
+         )
+     if status == "failed":
+         raise HTTPException(
+             status_code=409,
+             detail={
+                 "run_id": run_id,
+                 "status": status,
+                 "message": "Pipeline failed. Check logs for details.",
+                 "logs_path": f"/runs/{run_id}/logs",
+             },
+         )
+     if not out_file.exists():
+         raise HTTPException(status_code=404, detail=f"Condensed final output not found: {out_file}")
+     return _read_json(out_file)
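The run lifecycle above (start, poll status, tail logs, fetch output) can be driven end to end with a few requests. A minimal client sketch against a locally served instance; the base URL and the video URL are placeholders:

```python
import time

import httpx

BASE = "http://127.0.0.1:7860"  # assumes uvicorn is serving api.index:app locally

# Start the demo-code variant from a direct video URL (placeholder link).
start = httpx.post(
    f"{BASE}/pipeline/demo-code",
    json={"video_url": "https://example.com/meeting.mp4"},
    timeout=600.0,  # the input download happens inside this request
).json()
run_id = start["run_id"]

# Poll until the watcher thread marks the run finished.
while True:
    meta = httpx.get(f"{BASE}/runs/{run_id}").json()
    if meta["status"] in {"succeeded", "failed"}:
        break
    time.sleep(10)

if meta["status"] == "succeeded":
    final = httpx.get(f"{BASE}/runs/{run_id}/final-output").json()
    print(sorted(final))
else:
    # On failure the final-output route returns 409, so read the logs instead.
    print(httpx.get(f"{BASE}/runs/{run_id}/logs", params={"tail_lines": 100}).text)
```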
app.py ADDED
@@ -0,0 +1,279 @@
+ from __future__ import annotations
+
+ import os
+ import time
+ from typing import Any, Dict, Optional, Tuple
+
+ import gradio as gr
+
+ from run_manager import get_final_output, get_logs, get_status, start_run
+
+
+ def _clean_optional(value: Optional[str]) -> Optional[str]:
+     if value is None:
+         return None
+     text = str(value).strip()
+     return text or None
+
+
+ def _err_payload(message: str) -> Dict[str, Any]:
+     return {"status": "error", "message": message}
+
+
+ def start_pipeline(
+     variant: str,
+     input_mode: str,
+     video_file_path: Optional[str],
+     video_url: Optional[str],
+     out_dir: Optional[str],
+     python_bin: Optional[str],
+     deepgram_model: str,
+     deepgram_language: Optional[str],
+     deepgram_request_timeout_sec: float,
+     deepgram_connect_timeout_sec: float,
+     deepgram_retries: int,
+     deepgram_retry_backoff_sec: float,
+     force_deepgram: bool,
+     force_keyframes: bool,
+     pre_roll_sec: float,
+     gemini_model: str,
+     similarity_threshold: float,
+     temperature: float,
+     log_heartbeat_sec: float,
+ ) -> Tuple[str, Dict[str, Any], str, str]:
+     try:
+         chosen_video_file = None
+         chosen_video_url = None
+         mode = (input_mode or "").strip().lower()
+
+         if mode == "upload file":
+             chosen_video_file = _clean_optional(video_file_path)
+             if not chosen_video_file:
+                 raise ValueError("Select a video file for Upload File mode.")
+         elif mode == "video url":
+             chosen_video_url = _clean_optional(video_url)
+             if not chosen_video_url:
+                 raise ValueError("Provide video_url for Video URL mode.")
+         else:
+             raise ValueError("Invalid input mode.")
+
+         result = start_run(
+             variant=variant,
+             video_file_path=chosen_video_file,
+             video_url=chosen_video_url,
+             out_dir=_clean_optional(out_dir),
+             python_bin=_clean_optional(python_bin),
+             deepgram_model=deepgram_model,
+             deepgram_language=_clean_optional(deepgram_language),
+             deepgram_request_timeout_sec=float(deepgram_request_timeout_sec),
+             deepgram_connect_timeout_sec=float(deepgram_connect_timeout_sec),
+             deepgram_retries=int(deepgram_retries),
+             deepgram_retry_backoff_sec=float(deepgram_retry_backoff_sec),
+             force_deepgram=bool(force_deepgram),
+             force_keyframes=bool(force_keyframes),
+             pre_roll_sec=float(pre_roll_sec),
+             gemini_model=gemini_model,
+             similarity_threshold=float(similarity_threshold),
+             temperature=float(temperature),
+             log_heartbeat_sec=float(log_heartbeat_sec),
+         )
+         run_id = str(result["run_id"])
+         logs = get_logs(run_id, tail_lines=120)
+         return run_id, result, logs, run_id
+     except Exception as e:
+         msg = f"{type(e).__name__}: {e}"
+         return "", _err_payload(msg), msg, ""
+
+
+ def refresh_status_logs(run_id: str, tail_lines: int) -> Tuple[Dict[str, Any], str]:
+     rid = _clean_optional(run_id)
+     if not rid:
+         return _err_payload("Enter a run_id."), ""
+     try:
+         status = get_status(rid)
+         logs = get_logs(rid, tail_lines=int(tail_lines))
+         return status, logs
+     except Exception as e:
+         return _err_payload(f"{type(e).__name__}: {e}"), ""
+
+
+ def fetch_output(run_id: str, condensed: bool) -> Dict[str, Any]:
+     rid = _clean_optional(run_id)
+     if not rid:
+         return _err_payload("Enter a run_id.")
+     try:
+         return get_final_output(rid, condensed=condensed)
+     except Exception as e:
+         return _err_payload(f"{type(e).__name__}: {e}")
+
+
+ def watch_run(
+     run_id: str,
+     tail_lines: int,
+     poll_sec: float,
+ ):
+     rid = _clean_optional(run_id)
+     if not rid:
+         yield _err_payload("Enter a run_id."), "", None, None
+         return
+
+     sleep_sec = max(1.0, float(poll_sec))
+     max_tail = max(10, min(int(tail_lines), 5000))
+
+     while True:
+         try:
+             status = get_status(rid)
+             logs = get_logs(rid, tail_lines=max_tail)
+         except Exception as e:
+             yield _err_payload(f"{type(e).__name__}: {e}"), "", None, None
+             return
+
+         state = str(status.get("status", "unknown")).lower()
+         if state in {"succeeded", "failed"}:
+             full_payload = None
+             condensed_payload = None
+             if state == "succeeded":
+                 try:
+                     full_payload = get_final_output(rid, condensed=False)
+                 except Exception as e:
+                     full_payload = _err_payload(f"{type(e).__name__}: {e}")
+                 try:
+                     condensed_payload = get_final_output(rid, condensed=True)
+                 except Exception as e:
+                     condensed_payload = _err_payload(f"{type(e).__name__}: {e}")
+             yield status, logs, full_payload, condensed_payload
+             return
+
+         yield status, logs, None, None
+         time.sleep(sleep_sec)
+
+
+ with gr.Blocks(title="deployed-meet") as demo:
+     gr.Markdown(
+         """
+         # deployed-meet (Gradio)
+         Start either pipeline variant, then monitor logs and fetch final outputs by `run_id`.
+         - `full`: Gemini on all keyframe types.
+         - `demo-code`: Gemini only on demo keyframes, slides+code are OCR/transcript based.
+         """
+     )
+
+     with gr.Tab("Start Run"):
+         variant = gr.Dropdown(
+             choices=[
+                 ("Full pipeline (Gemini on slides/code/demo)", "full"),
+                 ("Demo-only Gemini pipeline (slides+code OCR)", "demo-code"),
+             ],
+             value="demo-code",
+             label="Pipeline Variant",
+         )
+         input_mode = gr.Radio(
+             choices=["Upload File", "Video URL"],
+             value="Upload File",
+             label="Input Mode",
+         )
+         video_file = gr.File(label="Video File", type="filepath")
+         video_url = gr.Textbox(label="Video URL", placeholder="https://.../meeting.mp4")
+
+         out_dir = gr.Textbox(
+             label="Output Directory (optional)",
+             placeholder="run_001",
+         )
+         python_bin = gr.Textbox(
+             label="Python Executable (optional)",
+             placeholder="Leave blank to auto-resolve",
+         )
+
+         with gr.Accordion("Advanced Settings", open=False):
+             deepgram_model = gr.Textbox(label="Deepgram Model", value="nova-3")
+             deepgram_language = gr.Textbox(label="Deepgram Language (optional)", value="")
+             deepgram_request_timeout_sec = gr.Number(label="Deepgram Request Timeout (sec)", value=1200.0)
+             deepgram_connect_timeout_sec = gr.Number(label="Deepgram Connect Timeout (sec)", value=30.0)
+             deepgram_retries = gr.Number(label="Deepgram Retries", value=3, precision=0)
+             deepgram_retry_backoff_sec = gr.Number(label="Deepgram Retry Backoff (sec)", value=2.0)
+             force_deepgram = gr.Checkbox(label="Force Deepgram Re-run", value=False)
+             force_keyframes = gr.Checkbox(label="Force Keyframe Re-run", value=False)
+             pre_roll_sec = gr.Number(label="Pre-roll Seconds", value=3.0)
+             gemini_model = gr.Textbox(label="Gemini Model", value="gemini-2.5-flash")
+             similarity_threshold = gr.Number(label="Similarity Threshold", value=0.82)
+             temperature = gr.Number(label="Temperature", value=0.2)
+             log_heartbeat_sec = gr.Number(label="Heartbeat Log Interval (sec)", value=10.0)
+
+         start_btn = gr.Button("Start Pipeline", variant="primary")
+         start_run_id = gr.Textbox(label="Run ID", interactive=False)
+         start_status = gr.JSON(label="Start Response / Error")
+         start_logs = gr.Textbox(label="Initial Logs", lines=14)
+
+     with gr.Tab("Track Run"):
+         track_run_id = gr.Textbox(label="Run ID", placeholder="Paste run_id from Start tab")
+         tail_lines = gr.Slider(label="Log Tail Lines", minimum=50, maximum=3000, value=300, step=50)
+         poll_sec = gr.Slider(label="Live Poll Interval (sec)", minimum=1, maximum=20, value=3, step=1)
+
+         with gr.Row():
+             refresh_btn = gr.Button("Refresh Status + Logs")
+             watch_btn = gr.Button("Watch Live")
+             full_btn = gr.Button("Fetch Final Output")
+             condensed_btn = gr.Button("Fetch Condensed Output")
+
+         track_status = gr.JSON(label="Run Status")
+         track_logs = gr.Textbox(label="Run Logs", lines=22)
+         track_full_output = gr.JSON(label="Final Output")
+         track_condensed_output = gr.JSON(label="Condensed Final Output")
+
+     start_btn.click(
+         fn=start_pipeline,
+         inputs=[
+             variant,
+             input_mode,
+             video_file,
+             video_url,
+             out_dir,
+             python_bin,
+             deepgram_model,
+             deepgram_language,
+             deepgram_request_timeout_sec,
+             deepgram_connect_timeout_sec,
+             deepgram_retries,
+             deepgram_retry_backoff_sec,
+             force_deepgram,
+             force_keyframes,
+             pre_roll_sec,
+             gemini_model,
+             similarity_threshold,
+             temperature,
+             log_heartbeat_sec,
+         ],
+         outputs=[start_run_id, start_status, start_logs, track_run_id],
+     )
+
+     refresh_btn.click(
+         fn=refresh_status_logs,
+         inputs=[track_run_id, tail_lines],
+         outputs=[track_status, track_logs],
+     )
+
+     watch_btn.click(
+         fn=watch_run,
+         inputs=[track_run_id, tail_lines, poll_sec],
+         outputs=[track_status, track_logs, track_full_output, track_condensed_output],
+     )
+
+     full_btn.click(
+         fn=lambda rid: fetch_output(rid, False),
+         inputs=[track_run_id],
+         outputs=[track_full_output],
+     )
+
+     condensed_btn.click(
+         fn=lambda rid: fetch_output(rid, True),
+         inputs=[track_run_id],
+         outputs=[track_condensed_output],
+     )
+
+
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=2).launch(
+         server_name="0.0.0.0",
+         server_port=int(os.getenv("PORT", "7860")),
+         show_error=True,
+     )
pipelines/assign_utterances_to_keyframes.py ADDED
@@ -0,0 +1,249 @@
+ import json
+ import argparse
+ from typing import Any, Dict, List, Optional, Tuple
+
+
+ def safe_str(x: Any) -> str:
+     return "" if x is None else str(x)
+
+
+ def extract_list(data: Any) -> List[Dict[str, Any]]:
+     # Accept either a list of items, or a dict that contains a list under common keys.
+     if isinstance(data, list):
+         return [x for x in data if isinstance(x, dict)]
+     if isinstance(data, dict):
+         for k in ["utterances", "items", "segments", "results", "data"]:
+             if k in data and isinstance(data[k], list):
+                 return [x for x in data[k] if isinstance(x, dict)]
+     return []
+
+
+ def extract_keyframes(data: Any) -> List[Dict[str, Any]]:
+     # Accept either a list of keyframes, or a dict that contains a list under common keys.
+     if isinstance(data, list):
+         return [x for x in data if isinstance(x, dict)]
+     if isinstance(data, dict):
+         for k in ["keyframes", "items", "results", "data"]:
+             if k in data and isinstance(data[k], list):
+                 return [x for x in data[k] if isinstance(x, dict)]
+     return []
+
+
+ def get_time_field(d: Dict[str, Any], keys: List[str]) -> Optional[float]:
+     for k in keys:
+         if k in d:
+             try:
+                 v = d[k]
+                 if v is None:
+                     continue
+                 return float(v)
+             except Exception:
+                 continue
+     return None
+
+
+ def get_utterance_times(u: Dict[str, Any]) -> Tuple[Optional[float], Optional[float]]:
+     # Try common fields for start/end times
+     start = get_time_field(u, ["start_sec", "start_s", "start", "start_time", "t_start", "begin", "from"])
+     end = get_time_field(u, ["end_sec", "end_s", "end", "end_time", "t_end", "finish", "to"])
+
+     # If only one is present, treat utterance as a point-in-time
+     if start is not None and end is None:
+         end = start
+     if end is not None and start is None:
+         start = end
+
+     return start, end
+
+
+ def get_utterance_text(u: Dict[str, Any]) -> str:
+     for k in ["text", "utterance", "content", "transcript", "sentence"]:
+         if k in u and safe_str(u[k]).strip():
+             return safe_str(u[k]).strip()
+
+     # Some formats store words list
+     if "words" in u and isinstance(u["words"], list):
+         parts = []
+         for w in u["words"]:
+             if isinstance(w, dict):
+                 t = w.get("word") or w.get("text")
+                 if t:
+                     parts.append(str(t))
+             elif isinstance(w, str):
+                 parts.append(w)
+         if parts:
+             return " ".join(parts).strip()
+
+     return ""
+
+
+ def overlaps(a0: float, a1: float, b0: float, b1: float) -> bool:
+     # Closed-open overlap check: [a0, a1) overlaps [b0, b1) iff max(starts) < min(ends)
+     return max(a0, b0) < min(a1, b1)
+
+
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("keyframes_json", help="Path to keyframes JSON (e.g. keyframes_parsed.json)")
+     ap.add_argument("utterances_json", help="Path to utterances.json")
+     ap.add_argument("-o", "--out", default="keyframes_with_utterances.json", help="Output JSON path")
+     ap.add_argument(
+         "--pre-roll-sec",
+         type=float,
+         default=3.0,
+         help="Seconds before each keyframe start that should also belong to that keyframe.",
+     )
+     args = ap.parse_args()
+
+     # Load keyframes
+     with open(args.keyframes_json, "r", encoding="utf-8") as f:
+         kf_raw = json.load(f)
+
+     keyframes_list = extract_keyframes(kf_raw)
+     if not keyframes_list:
+         raise ValueError(
+             "No keyframes found. Expected a list, or an object containing keyframes under one of: "
+             "keyframes/items/results/data."
+         )
+
+     # Sort keyframes by time
+     keyframes = sorted(
+         keyframes_list,
+         key=lambda k: (
+             float(k.get("t_sec", 0.0) or 0.0),
+             int(k.get("keyframe_idx", 0) or 0),
+         ),
+     )
+     if not keyframes:
+         raise ValueError("No keyframes found in keyframes JSON")
+
+     pre_roll_sec = max(0.0, float(args.pre_roll_sec))
+
+     # Precompute keyframe times and windows.
+     # window i:
+     #   - first keyframe: [t_0, t_1)
+     #   - others: [max(t_i - pre_roll_sec, t_{i-1}), t_{i+1})
+     # This makes [t_i - pre_roll_sec, t_i) belong to BOTH keyframe i and keyframe i-1.
+     t = [float(kf.get("t_sec", 0.0) or 0.0) for kf in keyframes]
+     n = len(t)
+     windows: List[Tuple[float, float]] = []
+     for i in range(n):
+         if i == 0:
+             start = t[i]
+         else:
+             start = max(t[i] - pre_roll_sec, t[i - 1])
+         end = t[i + 1] if i < n - 1 else float("inf")
+         windows.append((start, end))
+
+     # Prepare output keyframes (copy + add assigned_utterances)
+     out_keyframes: List[Dict[str, Any]] = []
+     for kf in keyframes:
+         kf_out = dict(kf)
+         kf_out["assigned_utterances"] = []
+         out_keyframes.append(kf_out)
+
+     # Load utterances
+     with open(args.utterances_json, "r", encoding="utf-8") as f:
+         u_raw = json.load(f)
+
+     utterances = extract_list(u_raw)
+     if not utterances:
+         raise ValueError(
+             "No utterances found. Expected utterances.json to be a list, or a dict containing a list under "
+             "one of: utterances/items/segments/results/data."
+         )
+
+     unassigned = []
+     multi_assigned = 0
+     assigned_total = 0
+
+     for u in utterances:
+         text = get_utterance_text(u).strip()
+         u_start, u_end = get_utterance_times(u)
+
+         if u_start is None or u_end is None or not text:
+             unassigned.append({"reason": "missing_text_or_time", "utterance": u})
+             continue
+
+         u_start = float(u_start)
+         u_end = float(u_end)
+         if u_end < u_start:
+             u_start, u_end = u_end, u_start
+
+         # Make point-in-time utterances half-open with tiny duration
+         if u_end == u_start:
+             u_end = u_start + 1e-6
+
+         matched_indexes = []
+         for i, (w0, w1) in enumerate(windows):
+             if overlaps(u_start, u_end, w0, w1):
+                 matched_indexes.append(i)
+
+         if not matched_indexes:
+             # Fallback for degenerate boundary conditions.
+             for i, (w0, w1) in enumerate(windows):
+                 eps = 1e-9
+                 if overlaps(u_start - eps, u_end + eps, w0, w1):
+                     matched_indexes.append(i)
+
+         if not matched_indexes:
+             unassigned.append({"reason": "no_overlapping_keyframe_window", "utterance": u})
+             continue
+
+         # Keep indexes sorted and unique.
+         matched_indexes = sorted(set(matched_indexes))
+
+         if len(matched_indexes) > 1:
+             multi_assigned += 1
+
+         payload = dict(u)
+         payload["_text"] = text
+         payload["_start_sec"] = u_start
+         payload["_end_sec"] = u_end
+         payload["_overlaps_sorted_indexes"] = matched_indexes
+
+         for idx in matched_indexes:
+             payload2 = dict(payload)
+             payload2["_assigned_sorted_index"] = idx
+             payload2["_assigned_keyframe_idx"] = out_keyframes[idx].get("keyframe_idx")
+             payload2["_assigned_t_sec"] = out_keyframes[idx].get("t_sec")
+             out_keyframes[idx]["assigned_utterances"].append(payload2)
+             assigned_total += 1
+
+     # Sort utterances inside each keyframe by start time
+     for kf in out_keyframes:
+         kf["assigned_utterances"].sort(key=lambda x: float(x.get("_start_sec", 0.0) or 0.0))
+
+     out = {
+         "meta": {
+             "keyframes_file": args.keyframes_json,
+             "utterances_file": args.utterances_json,
+             "keyframes_count": len(out_keyframes),
+             "utterances_count": len(utterances),
+             "assigned_total": assigned_total,  # counts duplicates if an utterance overlaps multiple keyframes
+             "multi_assigned_utterances": multi_assigned,
+             "unassigned_count": len(unassigned),
+             "pre_roll_sec": pre_roll_sec,
+             "window_strategy": (
+                 "pre-roll overlap windows: "
+                 "first [t_0, t_1), others [max(t_i-pre_roll_sec, t_{i-1}), t_{i+1}), "
+                 "last ends at +inf"
+             ),
+         },
+         "keyframes": out_keyframes,
+         "unassigned_utterances": unassigned,
+     }
+
+     with open(args.out, "w", encoding="utf-8") as f:
+         json.dump(out, f, ensure_ascii=False, indent=2)
+
+     print(f"Done. Wrote: {args.out}")
+     print(f"Keyframes: {len(out_keyframes)}")
+     print(f"Utterances: {len(utterances)}")
+     print(f"Assigned total (including duplicates): {assigned_total}")
+     print(f"Utterances that overlapped multiple keyframes: {multi_assigned}")
+     print(f"Unassigned utterances: {len(unassigned)}")
+
+
+ if __name__ == "__main__":
+     main()
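The pre-roll window construction in `main()` is easiest to sanity-check with a tiny worked example. A sketch with three keyframes at t = 0, 10, and 20 seconds and the default `pre_roll_sec` of 3 (values are illustrative):

```python
# Mirrors the window logic in main() above.
pre_roll_sec = 3.0
t = [0.0, 10.0, 20.0]

windows = []
for i in range(len(t)):
    start = t[i] if i == 0 else max(t[i] - pre_roll_sec, t[i - 1])
    end = t[i + 1] if i < len(t) - 1 else float("inf")
    windows.append((start, end))

print(windows)
# [(0.0, 10.0), (7.0, 20.0), (17.0, inf)]
# An utterance spanning 8.0-9.0 s overlaps both (0.0, 10.0) and (7.0, 20.0),
# so it is assigned to keyframes 0 and 1 and counted in multi_assigned.
```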
pipelines/build_final_output.py ADDED
@@ -0,0 +1,758 @@
1
+ # build_final_output.py
2
+ # Usage:
3
+ # pip install google-genai pydantic python-dotenv
4
+ # set GEMINI_API_KEY=...
5
+ # python build_final_output.py ^
6
+ # --keyframes "C:\meet-agent\out_folder\keyframes_with_utterances.json" ^
7
+ # --out "C:\meet-agent\out_folder\final_output.json" ^
8
+ # --model "gemini-2.5-flash"
9
+
10
+ import argparse
11
+ import json
12
+ import os
13
+ import re
14
+ import time
15
+ from dataclasses import dataclass
16
+ from typing import Any, Dict, List, Optional, Tuple
17
+
18
+ from dotenv import load_dotenv
19
+ from pydantic import BaseModel, Field
20
+ from google import genai
21
+ from google.genai import types
22
+
23
+
24
+ # -----------------------------
25
+ # Helpers
26
+ # -----------------------------
27
+ def log(msg: str) -> None:
28
+ print(msg, flush=True)
29
+
30
+
31
+ def load_json(path: str) -> Any:
32
+ with open(path, "r", encoding="utf-8") as f:
33
+ return json.load(f)
34
+
35
+
36
+ def save_json(path: str, obj: Any) -> None:
37
+ out_dir = os.path.dirname(path)
38
+ if out_dir:
39
+ os.makedirs(out_dir, exist_ok=True)
40
+ with open(path, "w", encoding="utf-8") as f:
41
+ json.dump(obj, f, ensure_ascii=False, indent=2)
42
+
43
+
44
+ def sec_to_hhmmss(t: float) -> str:
45
+ t = max(0.0, float(t))
46
+ hh = int(t // 3600)
47
+ mm = int((t % 3600) // 60)
48
+ ss = int(t % 60)
49
+ return f"{hh:02d}:{mm:02d}:{ss:02d}"
50
+
51
+
52
+ def tokenize(s: str) -> List[str]:
53
+ s = s.lower()
54
+ s = re.sub(r"[^a-z0-9_]+", " ", s)
55
+ toks = [t for t in s.split() if t]
56
+ return toks
57
+
58
+
59
+ def jaccard_similarity(a: str, b: str) -> float:
60
+ sa, sb = set(tokenize(a)), set(tokenize(b))
61
+ if not sa and not sb:
62
+ return 1.0
63
+ if not sa or not sb:
64
+ return 0.0
65
+ return len(sa & sb) / max(1, len(sa | sb))
66
+
67
+
68
+ def safe_join_text(lines: List[str], max_chars: int = 8000) -> str:
69
+ """Join lines but prevent prompt bloat."""
70
+ out = []
71
+ total = 0
72
+ for ln in lines:
73
+ if total + len(ln) + 1 > max_chars:
74
+ break
75
+ out.append(ln)
76
+ total += len(ln) + 1
77
+ return "\n".join(out)
78
+
79
+
80
+ def frame_signature(frame: Optional[Dict[str, Any]]) -> str:
81
+ """Build a signature string for similarity comparison to previous keyframe."""
82
+ if not frame:
83
+ return ""
84
+ on_screen = frame.get("on_screen_text") or []
85
+ screen_parse = frame.get("screen_parse") or {}
86
+ screen_parse_text = summarize_screen_parse(screen_parse, max_regions=3, max_region_lines=6, max_ocr_lines=30, max_chars=2500)
87
+ on_screen_small = safe_join_text(on_screen[:80], max_chars=2500)
88
+ return f"{on_screen_small}\n{screen_parse_text}"
89
+
90
+
91
+ def diff_lists(prev: List[str], cur: List[str], max_items: int = 25) -> Tuple[List[str], List[str]]:
92
+ prev_set, cur_set = set(prev), set(cur)
93
+ added = [x for x in cur if x not in prev_set][:max_items]
94
+ removed = [x for x in prev if x not in cur_set][:max_items]
95
+ return added, removed
96
+
97
+
98
+ def summarize_screen_parse(
99
+ screen_parse: Optional[Dict[str, Any]],
100
+ max_regions: int = 8,
101
+ max_region_lines: int = 12,
102
+ max_ocr_lines: int = 120,
103
+ max_chars: int = 9000,
104
+ ) -> str:
105
+ if not isinstance(screen_parse, dict) or not screen_parse:
106
+ return "unknown"
107
+
108
+ parts: List[str] = []
109
+ frame_w = screen_parse.get("frame_w")
110
+ frame_h = screen_parse.get("frame_h")
111
+ if frame_w is not None and frame_h is not None:
112
+ parts.append(f"frame_size: {frame_w}x{frame_h}")
113
+
114
+ regions = screen_parse.get("layout_regions") or []
115
+ if regions:
116
+ region_lines: List[str] = []
117
+ for i, region in enumerate(regions[:max_regions]):
118
+ label = region.get("label", "unknown")
119
+ conf = region.get("conf", "unknown")
120
+ box = region.get("box", [])
121
+ text_lines = region.get("text_lines") or []
122
+ text_lines_clean = [str(x).strip() for x in text_lines if str(x).strip()][:max_region_lines]
123
+ text_preview = " | ".join(text_lines_clean)
124
+ region_lines.append(
125
+ f"region[{i}] label={label}, conf={conf}, box={box}, text_lines={text_preview}"
126
+ )
127
+ parts.append("layout_regions:\n" + "\n".join(region_lines))
128
+
129
+ ocr_lines = screen_parse.get("ocr_lines") or []
130
+ if ocr_lines:
131
+ ocr_text: List[str] = []
132
+ for item in ocr_lines[:max_ocr_lines]:
133
+ txt = str(item.get("text", "")).strip()
134
+ if txt:
135
+ ocr_text.append(txt)
136
+ if ocr_text:
137
+ parts.append("ocr_lines:\n" + safe_join_text(ocr_text, max_chars=max_chars))
138
+
139
+ merged = "\n\n".join(parts).strip()
140
+ if not merged:
141
+ return "unknown"
142
+ return merged[:max_chars]
143
+
144
+
145
+ def split_sentences(text: str) -> List[str]:
+     if not text:
+         return []
+     parts = re.split(r"(?<=[.!?])\s+", str(text).strip())
+     out = []
+     for p in parts:
+         p = p.strip()
+         if p:
+             out.append(p)
+     return out
+
+
+ def build_content_change_summary(
+     prev_content_summary: Optional[str],
+     cur_content_summary: Optional[str],
+     max_items: int = 6,
+ ) -> str:
+     prev = (prev_content_summary or "").strip()
+     cur = (cur_content_summary or "").strip()
+     if not prev:
+         return "Initial keyframe in sequence; no previous content summary to diff against."
+     if not cur:
+         return "Current content summary is empty or unknown; unable to compute precise content diff."
+     if prev == cur:
+         return "No material content-summary change from the previous keyframe."
+
+     prev_sentences = split_sentences(prev)
+     cur_sentences = split_sentences(cur)
+     prev_set = set(prev_sentences)
+     cur_set = set(cur_sentences)
+
+     added = [s for s in cur_sentences if s not in prev_set][:max_items]
+     removed = [s for s in prev_sentences if s not in cur_set][:max_items]
+
+     # If sentence-level diff fails (e.g., heavy rewrites), use token-level fallback.
+     if not added and not removed:
+         prev_tokens = set(tokenize(prev))
+         cur_tokens = set(tokenize(cur))
+         added_tokens = sorted(list(cur_tokens - prev_tokens))[:12]
+         removed_tokens = sorted(list(prev_tokens - cur_tokens))[:12]
+         if not added_tokens and not removed_tokens:
+             return "Content summary wording changed but underlying content differences are unclear."
+         out = []
+         if added_tokens:
+             out.append("Added/updated terms: " + ", ".join(added_tokens))
+         if removed_tokens:
+             out.append("Removed/de-emphasized terms: " + ", ".join(removed_tokens))
+         return " ".join(out)
+
+     chunks = []
+     if added:
+         chunks.append(
+             "Added/updated in current content summary: "
+             + " ; ".join(a[:240] for a in added)
+         )
+     if removed:
+         chunks.append(
+             "Removed/de-emphasized vs previous content summary: "
+             + " ; ".join(r[:240] for r in removed)
+         )
+     return " ".join(chunks).strip()
+
+
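A quick worked example of the diff behavior (inputs invented):

```python
prev = "We load the data. We train the model."
cur = "We load the data. We evaluate the model."
build_content_change_summary(prev, cur)
# -> "Added/updated in current content summary: We evaluate the model. "
#    "Removed/de-emphasized vs previous content summary: We train the model."
# When every sentence changes slightly, the sentence diff comes back empty and
# the token-level fallback reports added/removed vocabulary instead.
```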
208
+ def extract_speakers_from_utterances(utterances: List[Dict[str, Any]]) -> List[str]:
+     """Unique speakers in order of first appearance."""
+     seen = set()
+     out = []
+     for u in utterances or []:
+         spk = str(u.get("speaker", "")).strip()
+         if not spk:
+             spk = "unknown"
+         if spk not in seen:
+             seen.add(spk)
+             out.append(spk)
+     return out
+
+
+ # -----------------------------
+ # Pydantic schema for Gemini
+ # -----------------------------
+ class FrameChange(BaseModel):
+     changed_summary: str = Field(
+         ...,
+         description="Only the content-summary diff from previous keyframe to current keyframe.",
+     )
+     possible_reason: str = Field(
+         ...,
+         description="Why it could have happened (grounded in utterances/on-screen info; if unknown say unknown).",
+     )
+     added_elements: List[str] = Field(
+         default_factory=list,
+         description="Notable on-screen text elements that appeared (from diff).",
+     )
+     removed_elements: List[str] = Field(
+         default_factory=list,
+         description="Notable on-screen text elements that disappeared (from diff).",
+     )
+
+
+ class FrameSummary(BaseModel):
+     keyframe_idx: int
+     frame_type: str
+     t_sec: float
+     timestamp: str
+     image_path: str
+
+     on_screen_text: List[str] = Field(default_factory=list)
+
+     # NEW: all speakers present in this keyframe's utterances
+     speakers: List[str] = Field(
+         default_factory=list,
+         description="Unique list of speakers who spoke during this keyframe (from assigned utterances).",
+     )
+
+     utterance_time_start: Optional[str] = None
+     utterance_time_end: Optional[str] = None
+
+     # UPDATED requirements: must explicitly mention speakers
+     utterance_summary: str = Field(
+         ...,
+         description="Summary of utterances during this keyframe; must explicitly attribute statements to speakers.",
+     )
+
+     # More detailed
+     content_summary: str = Field(
+         ...,
+         description="Detailed frame content summary grounded in frame_type, timestamp, on_screen_text, and screen_parse.",
+     )
+
+     # Combined synthesis
+     combined_summary: str = Field(
+         ...,
+         description="Summary that combines utterance_summary and content_summary.",
+     )
+
+     # NEW: change summary for every keyframe transition (prev -> current). null for first keyframe.
+     frame_change: Optional[FrameChange] = None
+
+     similarity_to_prev: float = 0.0
+     reused_prev_content: bool = False
+     notes: List[str] = Field(default_factory=list)
+
+
+ class FinalOutput(BaseModel):
+     meta: Dict[str, Any]
+     keyframes: List[FrameSummary]
+
+
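As a sanity check on the schema, a minimal valid instance (all values invented) can be built like this; fields with defaults may be omitted:

```python
fs = FrameSummary(
    keyframe_idx=0,
    frame_type="slides",
    t_sec=12.0,
    timestamp="00:00:12",
    image_path="frames/kf_000.jpg",
    utterance_summary="Speaker 0 introduces the agenda.",
    content_summary="Title slide listing three agenda items.",
    combined_summary="Speaker 0 walks through the agenda shown on the title slide.",
)
# frame_change defaults to None (first keyframe), similarity_to_prev to 0.0,
# and on_screen_text/speakers/notes to empty lists.
```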
293
+ # -----------------------------
+ # History manager (diminishing returns)
+ # -----------------------------
+ @dataclass
+ class HistoryState:
+     recent_frames: List[Dict[str, Any]]
+     long_memory: str
+     long_memory_max_chars: int = 4500
+
+     def __init__(self):
+         self.recent_frames = []
+         self.long_memory = ""
+
+     def add_frame(self, frame_summary_obj: Dict[str, Any], keep_recent: int = 4):
+         self.recent_frames.append(frame_summary_obj)
+         if len(self.recent_frames) > keep_recent:
+             to_compress = self.recent_frames[:-keep_recent]
+             self.recent_frames = self.recent_frames[-keep_recent:]
+             return to_compress
+         return []
+
+     def build_history_context(self) -> str:
+         parts = []
+         if self.long_memory.strip():
+             parts.append("LONG_MEMORY (old history, low weight):\n" + self.long_memory.strip())
+
+         if self.recent_frames:
+             parts.append("RECENT_HISTORY (high weight, most recent first):")
+             for fr in reversed(self.recent_frames):
+                 parts.append(
+                     f"- [{fr.get('timestamp','??')}] {fr.get('frame_type','?').upper()} "
+                     f"combined_summary: {fr.get('combined_summary','')[:900]}"
+                 )
+         return "\n".join(parts).strip()
+
+
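The overflow contract here is easy to miss: `add_frame` returns the frames that just fell out of the recent window (an empty list otherwise), and the caller is expected to fold those into `long_memory`. A small sketch, with the frame dicts invented:

```python
hs = HistoryState()
for i in range(5):
    overflow = hs.add_frame(
        {"timestamp": f"00:00:{i:02d}", "frame_type": "slides", "combined_summary": f"frame {i}"},
        keep_recent=4,
    )
# On the 5th call, overflow == [frame 0] and hs.recent_frames holds frames 1-4;
# the pipeline passes overflow to compress_into_long_memory (defined below).
```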
329
+ # -----------------------------
+ # Gemini calls
+ # -----------------------------
+ def gemini_client() -> genai.Client:
+     load_dotenv()
+
+     api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
+     if not api_key:
+         raise ValueError("Missing GEMINI_API_KEY in environment (.env not loaded or key not set).")
+
+     return genai.Client(api_key=api_key)
+
+
+ def call_gemini_structured(
+     client: genai.Client,
+     model: str,
+     system_instruction: str,
+     user_prompt: str,
+     schema_model: Any,
+     temperature: float = 0.2,
+     max_retries: int = 3,
+ ) -> Any:
+     last_err = None
+     for attempt in range(1, max_retries + 1):
+         try:
+             resp = client.models.generate_content(
+                 model=model,
+                 contents=user_prompt,
+                 config=types.GenerateContentConfig(
+                     system_instruction=system_instruction,
+                     response_mime_type="application/json",
+                     response_schema=schema_model,
+                     temperature=temperature,
+                 ),
+             )
+             if getattr(resp, "parsed", None) is not None:
+                 return resp.parsed
+
+             txt = getattr(resp, "text", None)
+             if not txt:
+                 raise ValueError("Gemini returned no text/parsed output.")
+             return json.loads(txt)
+         except Exception as e:
+             last_err = e
+             time.sleep(0.7 * attempt)
+
+     raise RuntimeError(f"Gemini structured call failed after retries: {last_err}")
+
+
+ def compress_into_long_memory(
+     client: genai.Client,
+     model: str,
+     existing_long_memory: str,
+     frames_to_compress: List[Dict[str, Any]],
+     max_chars: int,
+ ) -> str:
+     if not frames_to_compress:
+         return existing_long_memory
+
+     bullets = []
+     for fr in frames_to_compress:
+         bullets.append(
+             f"[{fr.get('timestamp','??')}][{fr.get('frame_type','?')}] "
+             f"{fr.get('combined_summary','')[:500]}"
+         )
+     chunk = "\n".join(bullets)
+
+     system = (
+         "You compress meeting history. Output must be short, factual, and useful.\n"
+         "Do not invent details. Prefer concrete technical points and transitions.\n"
+         "Keep it under the requested character budget."
+     )
+     prompt = (
+         f"Existing LONG_MEMORY (may be empty):\n{existing_long_memory}\n\n"
+         f"New older frames to merge (older history):\n{chunk}\n\n"
+         f"Task:\n"
+         f"1) Merge them into LONG_MEMORY.\n"
+         f"2) Keep the result <= {max_chars} characters.\n"
+         f"3) Use bullet points.\n"
+         f"Return ONLY plain text."
+     )
+
+     resp = client.models.generate_content(
+         model=model,
+         contents=prompt,
+         config=types.GenerateContentConfig(
+             system_instruction=system,
+             temperature=0.2,
+             max_output_tokens=800,
+         ),
+     )
+     text = (getattr(resp, "text", "") or "").strip()
+     if not text:
+         merged = (existing_long_memory + "\n" + chunk).strip()
+         return merged[:max_chars]
+     return text[:max_chars]
+
+
+ # -----------------------------
+ # Core processing logic
+ # -----------------------------
+ def build_prompt_for_frame(
+     frame: Dict[str, Any],
+     history_context: str,
+     prev_frame: Optional[Dict[str, Any]],
+     prev_content_summary: Optional[str],
+     similarity_to_prev: float,
+     is_similar: bool,
+     transition_diff: Optional[Dict[str, Any]],
+ ) -> Tuple[str, str]:
+     frame_type = (frame.get("frame_type") or "").lower()
+     timestamp = frame.get("timestamp") or sec_to_hhmmss(frame.get("t_sec", 0.0))
+     t_sec = float(frame.get("t_sec", 0.0))
+
+     on_screen_text = frame.get("on_screen_text") or []
+     screen_parse_summary = summarize_screen_parse(
+         frame.get("screen_parse") or {},
+         max_regions=8,
+         max_region_lines=14,
+         max_ocr_lines=140,
+         max_chars=12000,
+     )
+     assigned_utterances = frame.get("assigned_utterances") or []
+     speakers = extract_speakers_from_utterances(assigned_utterances)
+
+     u_start_ts = None
+     u_end_ts = None
+     if assigned_utterances:
+         u_start = min(float(u.get("_start_sec", u.get("start", t_sec))) for u in assigned_utterances)
+         u_end = max(float(u.get("_end_sec", u.get("end", t_sec))) for u in assigned_utterances)
+         u_start_ts = sec_to_hhmmss(u_start)
+         u_end_ts = sec_to_hhmmss(u_end)
+
+     utt_lines = []
+     for u in assigned_utterances[:60]:
+         s = float(u.get("_start_sec", u.get("start", 0.0)))
+         e = float(u.get("_end_sec", u.get("end", 0.0)))
+         spk = str(u.get("speaker", "unknown")).strip() or "unknown"
+         txt = (u.get("text", "") or "").strip()
+         utt_lines.append(f"[{sec_to_hhmmss(s)}-{sec_to_hhmmss(e)}][{spk}] {txt}")
+     utterances_block = safe_join_text(utt_lines, max_chars=12000)
+
+     reuse_instruction = ""
+     if is_similar:
+         reuse_instruction = (
+             "IMPORTANT: This frame content is very similar to the previous keyframe.\n"
+             "Do NOT repeat the entire explanation.\n"
+             "Reuse prior context and focus on what is new.\n"
+             "frame_change must still be filled if a previous keyframe exists.\n"
+         )
+
+     prev_block = ""
+     prev_content_summary_block = "PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\nnone\n\n"
+     if prev_frame is not None:
+         prev_idx = prev_frame.get("keyframe_idx", -1)
+         prev_ts = prev_frame.get("timestamp") or sec_to_hhmmss(prev_frame.get("t_sec", 0.0))
+         prev_type = (prev_frame.get("frame_type") or "unknown").lower()
+         prev_block = (
+             "PREVIOUS_KEYFRAME:\n"
+             f"- keyframe_idx: {prev_idx}\n"
+             f"- frame_type: {prev_type}\n"
+             f"- timestamp: {prev_ts}\n\n"
+         )
+         prev_content_summary_block = (
+             "PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\n"
+             f"{(prev_content_summary or 'unknown').strip()}\n\n"
+         )
+
+     transition_diff_block = ""
+     if transition_diff is not None:
+         transition_diff_block = (
+             "KEYFRAME_TRANSITION_DIFF (computed from on_screen_text):\n"
+             f"added_elements: {transition_diff.get('added_elements', [])}\n"
+             f"removed_elements: {transition_diff.get('removed_elements', [])}\n\n"
+         )
+
+     system_instruction = (
+         "You are generating time-aware meeting notes per keyframe.\n"
+         "You must follow the provided schema exactly and return JSON only.\n"
+         "Do not invent facts not present in the inputs.\n"
+         "If something is unknown, say unknown.\n"
+         "History has diminishing importance: RECENT_HISTORY is high weight, LONG_MEMORY is low weight.\n"
+         "Speaker attribution is required for utterance summary.\n"
+     )
+
+     on_screen_capped = on_screen_text[:350]
+
+     if frame_type == "slides":
+         content_task = (
+             "For slides:\n"
+             "- content_summary must use frame_type + timestamp + on_screen_text + screen_parse.\n"
+             "  Cover headings, bullets, numbers, claims, and relationships visible on screen.\n"
+             "- combined_summary must combine utterance_summary + content_summary.\n"
+         )
+     elif frame_type == "code":
+         content_task = (
+             "For code:\n"
+             "- content_summary must use frame_type + timestamp + on_screen_text + screen_parse.\n"
+             "  Cover files/modules, functions/classes, logic, inputs/outputs, and config if visible.\n"
+             "- combined_summary must combine utterance_summary + content_summary.\n"
+         )
+     else:
+         content_task = (
+             "For demo:\n"
+             "- content_summary must use frame_type + timestamp + on_screen_text + screen_parse.\n"
+             "  Cover screens, controls, state transitions, and resulting behavior.\n"
+             "- combined_summary must combine utterance_summary + content_summary.\n"
+         )
+
+     output_rules = (
+         "OUTPUT_RULES (must follow exactly):\n"
+         "- Always populate: on_screen_text, speakers, utterance_summary, content_summary, combined_summary.\n"
+         "- utterance_summary must use utterance timestamps + speaker + text provided.\n"
+         "- content_summary must be grounded in frame_type + timestamp + on_screen_text + screen_parse.\n"
+         "- combined_summary must summarize utterance_summary and content_summary.\n"
+         "- If previous keyframe exists, frame_change must be present.\n"
+         "  - changed_summary must be only the difference between previous and current content_summary.\n"
+         "  - possible_reason remains grounded in utterances/on-screen evidence; else unknown.\n"
+         "  - added_elements and removed_elements must use provided diff lists.\n"
+         "- If no previous keyframe exists, frame_change must be null.\n"
+     )
+
+     user_prompt = (
+         f"{prev_block}"
+         f"CURRENT_KEYFRAME:\n"
+         f"- keyframe_idx: {frame.get('keyframe_idx')}\n"
+         f"- frame_type: {frame_type}\n"
+         f"- t_sec: {t_sec}\n"
+         f"- timestamp: {timestamp}\n"
+         f"- image_path: {frame.get('image_path')}\n"
+         f"- similarity_to_prev: {similarity_to_prev:.3f}\n"
+         f"- detected_speakers: {speakers}\n"
+         f"- utterance_time_range: {u_start_ts}-{u_end_ts}\n\n"
+         f"ON_SCREEN_TEXT (list):\n{on_screen_capped}\n\n"
+         f"SCREEN_PARSE (structured parse of current frame):\n{screen_parse_summary}\n\n"
+         f"ASSIGNED_UTTERANCES (time-stamped, includes speaker):\n{utterances_block}\n\n"
+         f"{transition_diff_block}"
+         f"{prev_content_summary_block}"
+         f"HISTORY_CONTEXT:\n{history_context}\n\n"
+         f"{output_rules}\n\n"
+         f"{reuse_instruction}\n"
+         f"{content_task}\n"
+         f"Now produce the JSON output for this keyframe following the schema."
+     )
+
+     return system_instruction, user_prompt
+
+
+ def keyframe_items(keyframes_data: Any) -> List[Dict[str, Any]]:
+     if isinstance(keyframes_data, dict):
+         return keyframes_data.get("keyframes", []) or []
+     if isinstance(keyframes_data, list):
+         return keyframes_data
+     return []
+
+
+ def main():
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--keyframes", required=True, help="Path to keyframes_with_utterances.json")
+     ap.add_argument("--out", required=True, help="Output path for final JSON")
+     ap.add_argument("--model", default="gemini-2.5-flash", help="Gemini model id")
+     ap.add_argument("--similarity_threshold", type=float, default=0.82, help="Similarity threshold for 'reuse prev content'")
+     ap.add_argument("--temperature", type=float, default=0.2)
+     args = ap.parse_args()
+
+     log("Starting build_final_output.py ...")
+     log(f"Keyframes file: {args.keyframes}")
+     log(f"Output file: {args.out}")
+     log(f"Model: {args.model}")
+
+     keyframes_data = load_json(args.keyframes)
+     keyframes_list = keyframe_items(keyframes_data)
+     if not keyframes_list:
+         raise ValueError("No keyframes found in input keyframes file.")
+
+     # Process keyframes in chronological order.
+     keyframes_list = sorted(
+         keyframes_list,
+         key=lambda x: (
+             float(x.get("t_sec", 0.0)),
+             int(x.get("keyframe_idx", 0)),
+         ),
+     )
+
+     log(f"Loaded keyframes: {len(keyframes_list)}")
+
+     log("Initializing Gemini client (loading .env + API key)...")
+     client = gemini_client()
+     log("Gemini client ready.")
+
+     output = {
+         "meta": {
+             "keyframes_file": args.keyframes,
+             "model": args.model,
+             "generated_at_epoch": time.time(),
+             "rules": {
+                 "process_order": "keyframes in chronological order",
+                 "history": "recent detailed + long_memory compressed (diminishing returns)",
+                 "similarity_threshold": args.similarity_threshold,
+                 "transition_change_each_keyframe": True,
+                 "speakers_per_keyframe": True,
+                 "utterance_summary_requires_speaker_attribution": True,
+                 "content_summary_uses_screen_parse": True,
+                 "combined_summary_synthesizes_utterance_and_content": True,
+                 "change_summary_is_content_diff": True,
+             },
+         },
+         "keyframes": [],
+     }
+
+     history_state = HistoryState()
+
+     prev_frame_obj: Optional[Dict[str, Any]] = None
+     prev_frame_summary: Optional[Dict[str, Any]] = None
+
+     global_kf_done = 0
+     global_kf_total = len(keyframes_list)
+     log(f"Total keyframes to process: {global_kf_total}")
+
+     for frame in keyframes_list:
+         global_kf_done += 1
+         kf_idx = frame.get("keyframe_idx")
+         kf_ts = frame.get("timestamp") or sec_to_hhmmss(frame.get("t_sec", 0.0))
+         kf_type = (frame.get("frame_type") or "unknown").lower()
+         utt_count = len(frame.get("assigned_utterances") or [])
+         log(f"[{global_kf_done}/{global_kf_total}] Keyframe {kf_idx} @ {kf_ts} | type={kf_type} | utterances={utt_count}")
+
+         sig_cur = frame_signature(frame)
+         sig_prev = frame_signature(prev_frame_obj)
+         sim = jaccard_similarity(sig_prev, sig_cur) if prev_frame_obj else 0.0
+         is_similar = (prev_frame_obj is not None) and (sim >= args.similarity_threshold)
+         log(f"  similarity_to_prev={sim:.3f} | reused_prev_content={is_similar}")
+
+         transition_diff = None
+         if prev_frame_obj is not None:
+             prev_text = (prev_frame_obj.get("on_screen_text") or [])
+             cur_text = (frame.get("on_screen_text") or [])
+             added, removed = diff_lists(prev_text, cur_text, max_items=40)
+             transition_diff = {"added_elements": added, "removed_elements": removed}
+
+         history_context = history_state.build_history_context()
+
+         system_instruction, user_prompt = build_prompt_for_frame(
+             frame=frame,
+             history_context=history_context,
+             prev_frame=prev_frame_obj,
+             prev_content_summary=(prev_frame_summary or {}).get("content_summary"),
+             similarity_to_prev=sim,
+             is_similar=is_similar,
+             transition_diff=transition_diff,
+         )
+
+         log("  -> Calling Gemini ...")
+         t_call = time.time()
+         parsed = call_gemini_structured(
+             client=client,
+             model=args.model,
+             system_instruction=system_instruction,
+             user_prompt=user_prompt,
+             schema_model=FrameSummary,
+             temperature=args.temperature,
+             max_retries=3,
+         )
+         log(f"  <- Gemini done in {time.time() - t_call:.1f}s")
+
+         if isinstance(parsed, BaseModel):
+             parsed_dict = parsed.model_dump()
+         else:
+             parsed_dict = dict(parsed)
+
+         parsed_dict["similarity_to_prev"] = float(sim)
+         parsed_dict["reused_prev_content"] = bool(is_similar)
+         if "notes" not in parsed_dict:
+             parsed_dict["notes"] = []
+         if is_similar:
+             parsed_dict["notes"].append("High similarity to previous keyframe; instructed incremental update.")
+         if prev_frame_summary is not None:
+             parsed_dict["notes"].append("Keyframe-to-keyframe transition diff computed and provided (frame_change required).")
+
+         # Enforce change summary as strict diff of previous vs current content_summary.
+         if prev_frame_summary is None:
+             parsed_dict["frame_change"] = None
+         else:
+             prev_content_summary = (prev_frame_summary or {}).get("content_summary")
+             current_content_summary = parsed_dict.get("content_summary")
+             existing_change = parsed_dict.get("frame_change") or {}
+             if not isinstance(existing_change, dict):
+                 existing_change = {}
+             existing_change["changed_summary"] = build_content_change_summary(
+                 prev_content_summary=prev_content_summary,
+                 cur_content_summary=current_content_summary,
+             )
+             existing_change["possible_reason"] = str(existing_change.get("possible_reason", "")).strip() or "unknown"
+             existing_change["added_elements"] = (transition_diff or {}).get("added_elements", [])
+             existing_change["removed_elements"] = (transition_diff or {}).get("removed_elements", [])
+             parsed_dict["frame_change"] = existing_change
+
+         output["keyframes"].append(parsed_dict)
+
+         to_compress = history_state.add_frame(
+             frame_summary_obj={
+                 "timestamp": parsed_dict.get("timestamp"),
+                 "frame_type": parsed_dict.get("frame_type"),
+                 "combined_summary": parsed_dict.get("combined_summary", ""),
+             },
+             keep_recent=4,
+         )
+         if to_compress:
+             log(f"  -> Compressing {len(to_compress)} older frame(s) into LONG_MEMORY ...")
+             history_state.long_memory = compress_into_long_memory(
+                 client=client,
+                 model=args.model,
+                 existing_long_memory=history_state.long_memory,
+                 frames_to_compress=to_compress,
+                 max_chars=history_state.long_memory_max_chars,
+             )
+             log("  <- LONG_MEMORY updated.")
+
+         prev_frame_obj = frame
+         prev_frame_summary = parsed_dict
+
+     log("\nAll keyframes processed. Writing output JSON ...")
+     save_json(args.out, output)
+     log(f"Done. Wrote: {args.out}")
+
+
+ if __name__ == "__main__":
+     main()
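Grounded in the argparse flags above, a typical invocation of this stage (paths invented) is:

```
python pipelines/build_final_output.py --keyframes out_folder/keyframes_with_utterances.json --out out_folder/final_output.json --model gemini-2.5-flash --similarity_threshold 0.82 --temperature 0.2
```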
pipelines/build_final_output_demo_code.py ADDED
@@ -0,0 +1,549 @@
+ #!/usr/bin/env python3
+ """
+ Demo-only Gemini build stage (kept in demo-code route for compatibility).
+
+ Behavior:
+ - `demo` keyframes: summarized with Gemini.
+ - `slides`, `code`, and `none` keyframes: NO Gemini call; output is built from OCR + utterances.
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import os
+ import re
+ import time
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from dotenv import load_dotenv
+ from pydantic import BaseModel, Field
+ from google import genai
+ from google.genai import types
+
+
+ def log(msg: str) -> None:
+     print(msg, flush=True)
+
+
+ def load_json(path: str) -> Any:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def save_json(path: str, obj: Any) -> None:
+     out_dir = os.path.dirname(path)
+     if out_dir:
+         os.makedirs(out_dir, exist_ok=True)
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(obj, f, ensure_ascii=False, indent=2)
+
+
+ def sec_to_hhmmss(t: float) -> str:
+     t = max(0.0, float(t))
+     hh = int(t // 3600)
+     mm = int((t % 3600) // 60)
+     ss = int(t % 60)
+     return f"{hh:02d}:{mm:02d}:{ss:02d}"
+
+
+ def tokenize(s: str) -> List[str]:
+     s = s.lower()
+     s = re.sub(r"[^a-z0-9_]+", " ", s)
+     return [t for t in s.split() if t]
+
+
+ def jaccard_similarity(a: str, b: str) -> float:
+     sa, sb = set(tokenize(a)), set(tokenize(b))
+     if not sa and not sb:
+         return 1.0
+     if not sa or not sb:
+         return 0.0
+     return len(sa & sb) / max(1, len(sa | sb))
+
+
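A quick worked example of the similarity measure (strings invented):

```python
tokenize("Login page: error 403!")  # -> ["login", "page", "error", "403"]
jaccard_similarity("login page error", "login page loaded")
# intersection {login, page} = 2, union {login, page, error, loaded} = 4 -> 0.5
```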
65
+ def safe_join_text(lines: List[str], max_chars: int = 8000) -> str:
+     out = []
+     total = 0
+     for ln in lines:
+         if total + len(ln) + 1 > max_chars:
+             break
+         out.append(ln)
+         total += len(ln) + 1
+     return "\n".join(out)
+
+
+ def split_sentences(text: str) -> List[str]:
+     if not text:
+         return []
+     parts = re.split(r"(?<=[.!?])\s+", str(text).strip())
+     return [p.strip() for p in parts if p.strip()]
+
+
+ def build_content_change_summary(
+     prev_content_summary: Optional[str],
+     cur_content_summary: Optional[str],
+     max_items: int = 6,
+ ) -> str:
+     prev = (prev_content_summary or "").strip()
+     cur = (cur_content_summary or "").strip()
+     if not prev:
+         return "Initial keyframe in sequence; no previous content summary to diff against."
+     if not cur:
+         return "Current content summary is empty or unknown; unable to compute precise content diff."
+     if prev == cur:
+         return "No material content-summary change from the previous keyframe."
+
+     prev_sentences = split_sentences(prev)
+     cur_sentences = split_sentences(cur)
+     prev_set = set(prev_sentences)
+     cur_set = set(cur_sentences)
+
+     added = [s for s in cur_sentences if s not in prev_set][:max_items]
+     removed = [s for s in prev_sentences if s not in cur_set][:max_items]
+
+     if not added and not removed:
+         prev_tokens = set(tokenize(prev))
+         cur_tokens = set(tokenize(cur))
+         added_tokens = sorted(list(cur_tokens - prev_tokens))[:12]
+         removed_tokens = sorted(list(prev_tokens - cur_tokens))[:12]
+         if not added_tokens and not removed_tokens:
+             return "Content summary wording changed but underlying content differences are unclear."
+         out = []
+         if added_tokens:
+             out.append("Added/updated terms: " + ", ".join(added_tokens))
+         if removed_tokens:
+             out.append("Removed/de-emphasized terms: " + ", ".join(removed_tokens))
+         return " ".join(out)
+
+     chunks = []
+     if added:
+         chunks.append(
+             "Added/updated in current content summary: "
+             + " ; ".join(a[:240] for a in added)
+         )
+     if removed:
+         chunks.append(
+             "Removed/de-emphasized vs previous content summary: "
+             + " ; ".join(r[:240] for r in removed)
+         )
+     return " ".join(chunks).strip()
+
+
+ def frame_signature(frame: Optional[Dict[str, Any]]) -> str:
+     if not frame:
+         return ""
+     on_screen = frame.get("on_screen_text") or []
+     return safe_join_text([str(x) for x in on_screen[:120]], max_chars=3000)
+
+
+ def diff_lists(prev: List[str], cur: List[str], max_items: int = 25) -> Tuple[List[str], List[str]]:
+     prev_set, cur_set = set(prev), set(cur)
+     added = [x for x in cur if x not in prev_set][:max_items]
+     removed = [x for x in prev if x not in cur_set][:max_items]
+     return added, removed
+
+
+ def summarize_screen_parse(
+     screen_parse: Optional[Dict[str, Any]],
+     max_regions: int = 8,
+     max_region_lines: int = 12,
+     max_ocr_lines: int = 120,
+     max_chars: int = 9000,
+ ) -> str:
+     if not isinstance(screen_parse, dict) or not screen_parse:
+         return "unknown"
+
+     parts: List[str] = []
+     frame_w = screen_parse.get("frame_w")
+     frame_h = screen_parse.get("frame_h")
+     if frame_w is not None and frame_h is not None:
+         parts.append(f"frame_size: {frame_w}x{frame_h}")
+
+     regions = screen_parse.get("layout_regions") or []
+     if regions:
+         region_lines: List[str] = []
+         for i, region in enumerate(regions[:max_regions]):
+             label = region.get("label", "unknown")
+             conf = region.get("conf", "unknown")
+             box = region.get("box", [])
+             text_lines = region.get("text_lines") or []
+             text_lines_clean = [str(x).strip() for x in text_lines if str(x).strip()][:max_region_lines]
+             text_preview = " | ".join(text_lines_clean)
+             region_lines.append(
+                 f"region[{i}] label={label}, conf={conf}, box={box}, text_lines={text_preview}"
+             )
+         parts.append("layout_regions:\n" + "\n".join(region_lines))
+
+     ocr_lines = screen_parse.get("ocr_lines") or []
+     if ocr_lines:
+         ocr_text: List[str] = []
+         for item in ocr_lines[:max_ocr_lines]:
+             txt = str(item.get("text", "")).strip()
+             if txt:
+                 ocr_text.append(txt)
+         if ocr_text:
+             parts.append("ocr_lines:\n" + safe_join_text(ocr_text, max_chars=max_chars))
+
+     merged = "\n\n".join(parts).strip()
+     if not merged:
+         return "unknown"
+     return merged[:max_chars]
+
+
+ def extract_speakers_from_utterances(utterances: List[Dict[str, Any]]) -> List[str]:
+     seen = set()
+     out = []
+     for u in utterances or []:
+         spk = str(u.get("speaker", "")).strip() or "unknown"
+         if spk not in seen:
+             seen.add(spk)
+             out.append(spk)
+     return out
+
+
+ def utterance_time_bounds(utterances: List[Dict[str, Any]], default_t: float) -> Tuple[Optional[str], Optional[str]]:
+     if not utterances:
+         return None, None
+     starts = []
+     ends = []
+     for u in utterances:
+         try:
+             starts.append(float(u.get("_start_sec", u.get("start", default_t))))
+             ends.append(float(u.get("_end_sec", u.get("end", default_t))))
+         except Exception:
+             continue
+     if not starts or not ends:
+         return None, None
+     return sec_to_hhmmss(min(starts)), sec_to_hhmmss(max(ends))
+
+
+ def build_utterance_lines(utterances: List[Dict[str, Any]], max_lines: int = 80) -> List[str]:
+     lines: List[str] = []
+     for u in utterances[:max_lines]:
+         try:
+             s = float(u.get("_start_sec", u.get("start", 0.0)))
+             e = float(u.get("_end_sec", u.get("end", 0.0)))
+         except Exception:
+             s, e = 0.0, 0.0
+         spk = str(u.get("speaker", "unknown")).strip() or "unknown"
+         txt = (u.get("text", "") or "").strip()
+         if not txt:
+             continue
+         lines.append(f"[{sec_to_hhmmss(s)}-{sec_to_hhmmss(e)}][{spk}] {txt}")
+     return lines
+
+
+ def local_summary_for_non_demo(frame: Dict[str, Any]) -> Dict[str, str]:
+     frame_type = str(frame.get("frame_type", "unknown")).lower()
+     ocr_lines = [str(x).strip() for x in (frame.get("on_screen_text") or []) if str(x).strip()]
+     utter_lines = build_utterance_lines(frame.get("assigned_utterances") or [], max_lines=20)
+
+     if utter_lines:
+         utterance_summary = " | ".join(utter_lines[:8])
+     else:
+         utterance_summary = "No assigned utterances for this keyframe."
+
+     if ocr_lines:
+         content_summary = (
+             f"{frame_type.upper()} keyframe. OCR extracted on-screen text (top lines): "
+             + " | ".join(ocr_lines[:25])
+         )
+     else:
+         content_summary = f"{frame_type.upper()} keyframe. OCR text not available."
+
+     combined_summary = (
+         f"Local (no Gemini) summary for {frame_type} frame. "
+         f"Utterances: {utterance_summary} "
+         f"Content: {content_summary}"
+     )
+
+     return {
+         "utterance_summary": utterance_summary,
+         "content_summary": content_summary,
+         "combined_summary": combined_summary,
+     }
+
+
+ class DemoGeminiSummary(BaseModel):
+     utterance_summary: str = Field(
+         ...,
+         description="Summary of utterances for this frame with explicit speaker attribution where available.",
+     )
+     content_summary: str = Field(
+         ...,
+         description="Detailed description of what changed or is shown in this demo frame.",
+     )
+     combined_summary: str = Field(
+         ...,
+         description="Combined summary merging utterances and visual content.",
+     )
+
+
+ def gemini_client() -> genai.Client:
+     load_dotenv()
+     api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
+     if not api_key:
+         raise ValueError("Missing GEMINI_API_KEY in environment (.env not loaded or key not set).")
+     return genai.Client(api_key=api_key)
+
+
+ def call_gemini_structured(
+     client: genai.Client,
+     model: str,
+     system_instruction: str,
+     user_prompt: str,
+     schema_model: Any,
+     temperature: float = 0.2,
+     max_retries: int = 3,
+ ) -> Any:
+     last_err = None
+     for attempt in range(1, max_retries + 1):
+         try:
+             resp = client.models.generate_content(
+                 model=model,
+                 contents=user_prompt,
+                 config=types.GenerateContentConfig(
+                     system_instruction=system_instruction,
+                     response_mime_type="application/json",
+                     response_schema=schema_model,
+                     temperature=temperature,
+                 ),
+             )
+             if getattr(resp, "parsed", None) is not None:
+                 return resp.parsed
+             txt = getattr(resp, "text", None)
+             if not txt:
+                 raise ValueError("Gemini returned no text/parsed output.")
+             return json.loads(txt)
+         except Exception as e:
+             last_err = e
+             time.sleep(0.7 * attempt)
+     raise RuntimeError(f"Gemini structured call failed after retries: {last_err}")
+
+
+ def build_demo_prompt(
+     frame: Dict[str, Any],
+     prev_content_summary: Optional[str],
+     similarity_to_prev: float,
+     is_similar: bool,
+ ) -> Tuple[str, str]:
+     frame_type = str(frame.get("frame_type", "unknown")).lower()
+     timestamp = frame.get("timestamp") or sec_to_hhmmss(frame.get("t_sec", 0.0))
+     t_sec = float(frame.get("t_sec", 0.0))
+     on_screen_text = frame.get("on_screen_text") or []
+     screen_parse_summary = summarize_screen_parse(frame.get("screen_parse") or {})
+     utterances_block = safe_join_text(
+         build_utterance_lines(frame.get("assigned_utterances") or [], max_lines=80),
+         max_chars=12000,
+     )
+     reuse_instruction = ""
+     if is_similar:
+         reuse_instruction = (
+             "Frame is highly similar to previous keyframe. Reuse context and focus on what changed.\n"
+         )
+
+     prev_block = "PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\nnone\n"
+     if prev_content_summary:
+         prev_block = f"PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\n{prev_content_summary}\n"
+
+     system_instruction = (
+         "You generate keyframe-level meeting notes for demo screens only.\n"
+         "Ground all claims in provided utterances and OCR/screen parse.\n"
+         "Do not invent facts.\n"
+         "Return strict JSON only following schema."
+     )
+
+     user_prompt = (
+         f"CURRENT_KEYFRAME:\n"
+         f"- frame_type: {frame_type}\n"
+         f"- keyframe_idx: {frame.get('keyframe_idx')}\n"
+         f"- t_sec: {t_sec}\n"
+         f"- timestamp: {timestamp}\n"
+         f"- image_path: {frame.get('image_path')}\n"
+         f"- similarity_to_prev: {similarity_to_prev:.3f}\n\n"
+         f"ON_SCREEN_TEXT:\n{on_screen_text[:350]}\n\n"
+         f"SCREEN_PARSE:\n{screen_parse_summary}\n\n"
+         f"ASSIGNED_UTTERANCES:\n{utterances_block}\n\n"
+         f"{prev_block}\n"
+         f"{reuse_instruction}\n"
+         f"Requirements:\n"
+         f"- utterance_summary: attribute statements to speakers when present.\n"
+         f"- content_summary: describe what is visible/changed in this frame.\n"
+         f"- combined_summary: merge utterance + visual context.\n"
+     )
+     return system_instruction, user_prompt
+
+
+ def keyframe_items(keyframes_data: Any) -> List[Dict[str, Any]]:
+     if isinstance(keyframes_data, dict):
+         return keyframes_data.get("keyframes", []) or []
+     if isinstance(keyframes_data, list):
+         return keyframes_data
+     return []
+
+
+ def main() -> None:
+     ap = argparse.ArgumentParser()
+     ap.add_argument("--keyframes", required=True, help="Path to keyframes_with_utterances.json")
+     ap.add_argument("--out", required=True, help="Output path for final JSON")
+     ap.add_argument("--model", default="gemini-2.5-flash", help="Gemini model id")
+     ap.add_argument("--similarity-threshold", type=float, default=0.82)
+     ap.add_argument("--temperature", type=float, default=0.2)
+     args = ap.parse_args()
+
+     keyframes_data = load_json(args.keyframes)
+     keyframes_list = keyframe_items(keyframes_data)
+     if not keyframes_list:
+         raise ValueError("No keyframes found in input keyframes file.")
+
+     keyframes_list = sorted(
+         keyframes_list,
+         key=lambda x: (float(x.get("t_sec", 0.0)), int(x.get("keyframe_idx", 0))),
+     )
+
+     demo_count = sum(1 for kf in keyframes_list if str(kf.get("frame_type", "")).lower() == "demo")
+     code_count = sum(1 for kf in keyframes_list if str(kf.get("frame_type", "")).lower() == "code")
+     gemini_target_count = demo_count
+     local_only_count = len(keyframes_list) - gemini_target_count
+     log(
+         f"Loaded keyframes: total={len(keyframes_list)} demo={demo_count} "
+         f"code={code_count} local_only={local_only_count}"
+     )
+
+     client: Optional[genai.Client] = None
+     if gemini_target_count > 0:
+         log("Initializing Gemini client (demo frames only)...")
+         client = gemini_client()
+         log("Gemini client ready.")
+
+     output: Dict[str, Any] = {
+         "meta": {
+             "keyframes_file": args.keyframes,
+             "model": args.model,
+             "generated_at_epoch": time.time(),
+             "rules": {
+                 "demo_frames_use_gemini": True,
+                 "slides_code_none_use_local_ocr_only": True,
+                 "similarity_threshold": args.similarity_threshold,
+                 "frame_change_is_deterministic_content_diff": True,
+             },
+             "counts": {
+                 "total_keyframes": len(keyframes_list),
+                 "demo_keyframes": demo_count,
+                 "code_keyframes": code_count,
+                 "gemini_keyframes": gemini_target_count,
+                 "local_only_keyframes": local_only_count,
+                 "gemini_calls": 0,
+             },
+         },
+         "keyframes": [],
+     }
+
+     prev_frame_obj: Optional[Dict[str, Any]] = None
+     prev_content_summary: Optional[str] = None
+
+     for idx, frame in enumerate(keyframes_list, start=1):
+         frame_type = str(frame.get("frame_type", "unknown")).lower()
+         t_sec = float(frame.get("t_sec", 0.0))
+         timestamp = frame.get("timestamp") or sec_to_hhmmss(t_sec)
+         on_screen_text = [str(x).strip() for x in (frame.get("on_screen_text") or []) if str(x).strip()]
+         assigned_utterances = frame.get("assigned_utterances") or []
+         speakers = extract_speakers_from_utterances(assigned_utterances)
+         utt_start_ts, utt_end_ts = utterance_time_bounds(assigned_utterances, default_t=t_sec)
+
+         sim = 0.0
+         is_similar = False
+         if prev_frame_obj is not None:
+             sim = jaccard_similarity(frame_signature(prev_frame_obj), frame_signature(frame))
+             is_similar = sim >= float(args.similarity_threshold)
+
+         log(
+             f"[{idx}/{len(keyframes_list)}] keyframe={frame.get('keyframe_idx')} "
+             f"type={frame_type} time={timestamp} similarity={sim:.3f}"
+         )
+
+         if frame_type == "demo":
+             if client is None:
+                 raise RuntimeError("Internal error: demo frame encountered but Gemini client is not initialized.")
+             system_instruction, user_prompt = build_demo_prompt(
+                 frame=frame,
+                 prev_content_summary=prev_content_summary,
+                 similarity_to_prev=sim,
+                 is_similar=is_similar,
+             )
+             t0 = time.time()
+             parsed = call_gemini_structured(
+                 client=client,
+                 model=args.model,
+                 system_instruction=system_instruction,
+                 user_prompt=user_prompt,
+                 schema_model=DemoGeminiSummary,
+                 temperature=args.temperature,
+                 max_retries=3,
+             )
+             log(f"  Gemini done in {time.time() - t0:.1f}s")
+             output["meta"]["counts"]["gemini_calls"] += 1
+             if isinstance(parsed, BaseModel):
+                 summary_payload = parsed.model_dump()
+             else:
+                 summary_payload = dict(parsed)
+             summary_source = "gemini_demo_only"
+         else:
+             summary_payload = local_summary_for_non_demo(frame)
+             summary_source = "local_ocr_only"
+
+         transition_diff = {"added_elements": [], "removed_elements": []}
+         if prev_frame_obj is not None:
+             prev_text = [str(x).strip() for x in (prev_frame_obj.get("on_screen_text") or []) if str(x).strip()]
+             cur_text = on_screen_text
+             added, removed = diff_lists(prev_text, cur_text, max_items=40)
+             transition_diff = {"added_elements": added, "removed_elements": removed}
+
+         frame_change = None
+         if prev_content_summary is not None:
+             frame_change = {
+                 "changed_summary": build_content_change_summary(
+                     prev_content_summary=prev_content_summary,
+                     cur_content_summary=summary_payload.get("content_summary"),
+                 ),
+                 "possible_reason": (
+                     "Computed from keyframe OCR and utterance differences; no transition LLM call used."
+                 ),
+                 "added_elements": transition_diff["added_elements"],
+                 "removed_elements": transition_diff["removed_elements"],
+             }
+
+         out_frame = {
+             "keyframe_idx": int(frame.get("keyframe_idx", idx - 1)),
+             "frame_type": frame_type,
+             "t_sec": t_sec,
+             "timestamp": timestamp,
+             "image_path": str(frame.get("image_path", "")),
+             "on_screen_text": on_screen_text[:400],
+             "speakers": speakers,
+             "utterance_time_start": utt_start_ts,
+             "utterance_time_end": utt_end_ts,
+             "utterance_summary": str(summary_payload.get("utterance_summary", "")).strip(),
+             "content_summary": str(summary_payload.get("content_summary", "")).strip(),
+             "combined_summary": str(summary_payload.get("combined_summary", "")).strip(),
+             "frame_change": frame_change,
+             "similarity_to_prev": float(sim),
+             "reused_prev_content": bool(is_similar and frame_type == "demo"),
+             "notes": [
+                 f"summary_source={summary_source}",
+                 "Only demo keyframes are sent to Gemini in this pipeline.",
+             ],
+         }
+
+         output["keyframes"].append(out_frame)
+         prev_frame_obj = frame
+         prev_content_summary = out_frame.get("content_summary")
+
+     save_json(args.out, output)
+     log(f"Done. Wrote: {args.out}")
+     log(f"Gemini calls made: {output['meta']['counts']['gemini_calls']}")
+
+
+ if __name__ == "__main__":
+     main()
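Mirroring the argparse block above, a typical invocation of the demo-only variant (paths invented) is:

```
python pipelines/build_final_output_demo_code.py --keyframes out_folder/keyframes_with_utterances.json --out out_folder/final_output.json --model gemini-2.5-flash --similarity-threshold 0.82
```

Note that this variant spells the threshold flag `--similarity-threshold`, while `build_final_output.py` above uses `--similarity_threshold`.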
pipelines/condense_final_output.py ADDED
@@ -0,0 +1,145 @@
+ # condense_final_output.py
+ # Usage:
+ #   python condense_final_output.py --in "C:\meet-agent\out_folder\final_output.json" --out "C:\meet-agent\out_folder\final_output_condensed.json"
+ #
+ # What it does:
+ # - Reads the "final_output.json" produced by your build script
+ # - Produces a condensed version with only:
+ #     - keyframe (idx, timestamp, type, t_sec, image_path)
+ #     - combined_summary
+ #     - changed_summary (from transition_change/frame_change/demo_change if present)
+ # - Supports both input schemas:
+ #     1) new: {"meta": ..., "keyframes": [...]}
+ #     2) old: {"meta": ..., "topics": [{"keyframes": [...]}]}
+
+ import argparse
+ import json
+ import os
+ from typing import Any, Dict, Optional
+
+
+ def load_json(path: str) -> Any:
+     with open(path, "r", encoding="utf-8") as f:
+         return json.load(f)
+
+
+ def save_json(path: str, obj: Any) -> None:
+     out_dir = os.path.dirname(path)
+     if out_dir:
+         os.makedirs(out_dir, exist_ok=True)
+     with open(path, "w", encoding="utf-8") as f:
+         json.dump(obj, f, ensure_ascii=False, indent=2)
+
+
+ def pick_changed_summary(kf: Dict[str, Any]) -> Optional[str]:
+     """
+     Tries multiple locations, because your schema may store change summaries under different keys
+     depending on how you implemented transitions.
+
+     Priority order:
+       1) transition_change.changed_summary
+       2) frame_change.changed_summary
+       3) demo_change.changed_summary
+       4) changed_summary at root (fallback)
+     """
+     for container_key in ("transition_change", "frame_change", "demo_change"):
+         container = kf.get(container_key)
+         if isinstance(container, dict):
+             cs = container.get("changed_summary")
+             if isinstance(cs, str) and cs.strip():
+                 return cs.strip()
+
+     cs_root = kf.get("changed_summary")
+     if isinstance(cs_root, str) and cs_root.strip():
+         return cs_root.strip()
+
+     return None
+
+
+ def condense_keyframe(kf: Dict[str, Any]) -> Dict[str, Any]:
+     return {
+         "keyframe": {
+             "keyframe_idx": kf.get("keyframe_idx"),
+             "timestamp": kf.get("timestamp"),
+             "frame_type": kf.get("frame_type"),
+             "t_sec": kf.get("t_sec"),
+             "image_path": kf.get("image_path"),
+         },
+         "combined_summary": kf.get("combined_summary"),
+         "changed_summary": pick_changed_summary(kf),
+     }
+
+
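Each condensed entry therefore looks roughly like this (values invented):

```python
# One element of the condensed "keyframes" list:
{
    "keyframe": {
        "keyframe_idx": 3,
        "timestamp": "00:04:10",
        "frame_type": "demo",
        "t_sec": 250.0,
        "image_path": "frames/kf_003.jpg",
    },
    "combined_summary": "Speaker 1 demonstrates the new login flow.",
    "changed_summary": "Added/updated in current content summary: Login form now shows an error banner.",
}
```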
74
+ out_meta: Dict[str, Any] = {
75
+ "source": final_obj.get("meta", {}),
76
+ "notes": "Condensed output: keyframe + combined_summary + changed_summary",
77
+ }
78
+
79
+ # New schema: root keyframes list
80
+ root_keyframes = final_obj.get("keyframes", [])
81
+ if isinstance(root_keyframes, list):
82
+ out: Dict[str, Any] = {
83
+ "meta": {**out_meta, "input_schema": "root_keyframes"},
84
+ "keyframes": [],
85
+ }
86
+ for kf in root_keyframes:
87
+ if not isinstance(kf, dict):
88
+ continue
89
+ out["keyframes"].append(condense_keyframe(kf))
90
+ return out
91
+
92
+ # Old schema: topics[] with keyframes[]
93
+ out = {
94
+ "meta": {**out_meta, "input_schema": "topics"},
95
+ "topics": [],
96
+ }
97
+
98
+ topics = final_obj.get("topics", [])
99
+ if not isinstance(topics, list):
100
+ topics = []
101
+
102
+ for t in topics:
103
+ if not isinstance(t, dict):
104
+ continue
105
+
106
+ topic_out = {
107
+ "topic": t.get("topic"),
108
+ "start": t.get("start"),
109
+ "end": t.get("end"),
110
+ "start_ts": t.get("start_ts"),
111
+ "end_ts": t.get("end_ts"),
112
+ "keyframes": [],
113
+ }
114
+
115
+ keyframes = t.get("keyframes", [])
116
+ if not isinstance(keyframes, list):
117
+ keyframes = []
118
+
119
+ for kf in keyframes:
120
+ if not isinstance(kf, dict):
121
+ continue
122
+ topic_out["keyframes"].append(condense_keyframe(kf))
123
+
124
+ out["topics"].append(topic_out)
125
+
126
+ return out
127
+
128
+
129
+ def main() -> None:
130
+ ap = argparse.ArgumentParser()
131
+ ap.add_argument("--in", dest="inp", required=True, help="Path to final_output.json")
132
+ ap.add_argument("--out", dest="out", required=True, help="Path to write condensed JSON")
133
+ args = ap.parse_args()
134
+
135
+ final_obj = load_json(args.inp)
136
+ if not isinstance(final_obj, dict):
137
+ raise ValueError("Input JSON root must be an object/dict (expected FinalOutput-like structure).")
138
+
139
+ condensed = condense(final_obj)
140
+ save_json(args.out, condensed)
141
+ print(f"Wrote condensed JSON: {args.out}")
142
+
143
+
144
+ if __name__ == "__main__":
145
+ main()
pipelines/deepgram_extract_utterances.py ADDED
@@ -0,0 +1,208 @@
+ #!/usr/bin/env python3
+ """
+ deepgram_extract_utterances.py
+
+ Extract speaker-attributed utterances (start, end, speaker, text)
+ from a meeting MP4 using Deepgram.
+ """
+
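Based on the argparse options defined in `main()` below, a typical run (filenames invented) is:

```
python pipelines/deepgram_extract_utterances.py meeting.mp4 -o utterances.json --raw deepgram_raw.json --model nova-3
```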
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import mimetypes
+ import os
+ import sys
+ import time
+ from typing import Any, Dict, List, Optional
+
+ import httpx
+ from dotenv import load_dotenv
+ from deepgram import DeepgramClient, PrerecordedOptions, FileSource
+
+
+ # load .env at startup
+ load_dotenv()
+
+
+ def _die(msg: str, code: int = 1) -> None:
+     print(f"Error: {msg}", file=sys.stderr)
+     sys.exit(code)
+
+
+ def _load_file_source(path: str):
+     if not os.path.isfile(path):
+         _die(f"File not found: {path}")
+
+     with open(path, "rb") as f:
+         data = f.read()
+
+     mime, _ = mimetypes.guess_type(path)
+     if not mime:
+         mime = "application/octet-stream"
+
+     # IMPORTANT: return a dict, NOT FileSource()
+     return {
+         "buffer": data,
+         "mimetype": mime,
+     }
+
+
+ def _extract_utterances(result: Dict[str, Any]) -> List[Dict[str, Any]]:
+     utterances = result.get("results", {}).get("utterances", [])
+     out: List[Dict[str, Any]] = []
+
+     for u in utterances:
+         out.append(
+             {
+                 "start": float(u.get("start", 0.0)),
+                 "end": float(u.get("end", 0.0)),
+                 "speaker": u.get("speaker"),
+                 "text": (u.get("transcript") or "").strip(),
+             }
+         )
+
+     return out
+
+
+ def _is_non_retryable_error(exc: Exception) -> bool:
+     code = getattr(exc, "status_code", None)
+     if isinstance(code, int) and 400 <= code < 500:
+         return True
+     status = getattr(exc, "status", None)
+     if isinstance(status, int) and 400 <= status < 500:
+         return True
+     msg = str(exc).lower()
+     # Deepgram SDK exceptions often encode status in message text.
+     if "status: 4" in msg or "bad request" in msg or "unsupported data" in msg:
+         return True
+     return False
+
+
+ def transcribe_and_extract(
+     path: str,
+     model: str = "nova-3",
+     language: Optional[str] = None,
+     request_timeout_sec: float = 1200.0,
+     connect_timeout_sec: float = 30.0,
+     retries: int = 3,
+     retry_backoff_sec: float = 2.0,
+ ) -> tuple[Dict[str, Any], Dict[str, Any]]:
+     api_key = os.getenv("DEEPGRAM_API_KEY")
+     if not api_key:
+         _die("DEEPGRAM_API_KEY not found in environment or .env")
+
+     client = DeepgramClient(api_key=api_key)
+
+     source = _load_file_source(path)
+
+     options_kwargs: Dict[str, Any] = {
+         "model": model,
+         "smart_format": True,
+         "punctuate": True,
+         "utterances": True,
+         "diarize": True,
+     }
+     if language:
+         options_kwargs["language"] = language
+
+     options = PrerecordedOptions(**options_kwargs)
+
+     # Deepgram SDK default HTTP timeout is 30s; long recordings often exceed that.
+     timeout = httpx.Timeout(float(request_timeout_sec), connect=float(connect_timeout_sec))
+     retries = max(1, int(retries))
+
+     last_err: Optional[Exception] = None
+     response = None
+     for attempt in range(1, retries + 1):
+         try:
+             response = client.listen.rest.v("1").transcribe_file(
+                 source,
+                 options,
+                 timeout=timeout,
+             )
+             break
+         except Exception as e:
+             last_err = e
+             if _is_non_retryable_error(e):
+                 # Client/input errors won't succeed on retry.
+                 raise
+             if attempt >= retries:
+                 raise
+             wait_sec = float(retry_backoff_sec) * attempt
+             print(
+                 f"Deepgram request failed (attempt {attempt}/{retries}): {type(e).__name__}: {e}. "
+                 f"Retrying in {wait_sec:.1f}s..."
+             )
+             time.sleep(wait_sec)
+
+     if response is None:
+         raise RuntimeError(f"Deepgram transcription failed after {retries} attempts: {last_err}")
+
+     result_dict = response.to_dict() if hasattr(response, "to_dict") else dict(response)
+
+     return {
+         "input_file": os.path.abspath(path),
+         "model": model,
+         "utterances": _extract_utterances(result_dict),
+     }, result_dict
+
+
+ def main() -> None:
+     parser = argparse.ArgumentParser()
+     parser.add_argument("input", help="Path to meeting file (.mp4, .wav, .mp3)")
+     parser.add_argument("-o", "--output", default="utterances.json")
+     parser.add_argument("--raw", help="Optional raw Deepgram response JSON")
+     parser.add_argument("--model", default="nova-3")
+     parser.add_argument("--language", help="Optional language code (e.g. en, en-US)")
+     parser.add_argument(
+         "--request-timeout-sec",
+         type=float,
+         default=1200.0,
+         help="HTTP request timeout for Deepgram API call (default: 1200s).",
+     )
+     parser.add_argument(
+         "--connect-timeout-sec",
+         type=float,
+         default=30.0,
+         help="HTTP connect timeout for Deepgram API call (default: 30s).",
+     )
+     parser.add_argument(
+         "--retries",
+         type=int,
+         default=3,
+         help="Number of retry attempts for Deepgram call (default: 3).",
+     )
+     parser.add_argument(
+         "--retry-backoff-sec",
+         type=float,
+         default=2.0,
+         help="Base retry backoff seconds; actual sleep is base * attempt (default: 2.0).",
+     )
+     args = parser.parse_args()
+
+     extracted, raw = transcribe_and_extract(
+         args.input,
+         model=args.model,
+         language=args.language,
+         request_timeout_sec=float(args.request_timeout_sec),
+         connect_timeout_sec=float(args.connect_timeout_sec),
+         retries=int(args.retries),
+         retry_backoff_sec=float(args.retry_backoff_sec),
+     )
+
+     with open(args.output, "w", encoding="utf-8") as f:
+         json.dump(extracted, f, ensure_ascii=False, indent=2)
+
+     if args.raw:
+         with open(args.raw, "w", encoding="utf-8") as f:
+             json.dump(raw, f, ensure_ascii=False, indent=2)
+
+     print(f"Saved utterances to {args.output}")
+     if args.raw:
+         print(f"Saved raw response to {args.raw}")
+
+
+ if __name__ == "__main__":
+     main()
pipelines/models/yolov8x-doclaynet.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fd403628e5377fc08105df49489fc4a8997d1376589470865d874f1ee918317
+ size 136821929
pipelines/run_pipeline_all.py ADDED
@@ -0,0 +1,238 @@
+ #!/usr/bin/env python3
+ """
+ Pipeline orchestrator.
+
+ Runs:
+   1) deepgram_extract_utterances.py (parallel)
+   2) smart_keyframes_and_classify.py (parallel)
+   3) assign_utterances_to_keyframes.py (after 1+2)
+   4) build_final_output.py (after 3)
+   5) condense_final_output.py (after 4)
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import subprocess
+ import sys
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from pathlib import Path
+ from typing import Dict, List, Sequence, Tuple
+
+
+ def run_command(name: str, cmd: Sequence[str], cwd: Path) -> None:
+     start = time.perf_counter()
+     print(f"\n[{name}] START")
+     print(f"[{name}] CMD: {' '.join(cmd)}")
+     result = subprocess.run(cmd, cwd=str(cwd))
+     dur = time.perf_counter() - start
+     if result.returncode != 0:
+         raise RuntimeError(f"[{name}] failed with exit code {result.returncode}")
+     print(f"[{name}] DONE in {dur:.2f}s")
+
+
+ def run_parallel(commands: List[Tuple[str, List[str]]], cwd: Path) -> None:
+     if not commands:
+         return
+     with ThreadPoolExecutor(max_workers=len(commands)) as ex:
+         futures = {
+             ex.submit(run_command, name, cmd, cwd): name
+             for name, cmd in commands
+         }
+         for fut in as_completed(futures):
+             fut.result()
+
+
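For context, `run_parallel` fans commands out across threads and re-raises the first failure via `fut.result()`. A minimal sketch of how it gets used (the command lists here, including the `smart_keyframes_and_classify.py` flags, are invented placeholders):

```python
run_parallel(
    [
        ("deepgram", ["python", "deepgram_extract_utterances.py", "meeting.mp4", "-o", "utterances.json"]),
        ("keyframes", ["python", "smart_keyframes_and_classify.py", "--video", "meeting.mp4"]),
    ],
    cwd=Path("pipelines"),
)
# Both subprocesses run concurrently; a non-zero exit in either raises RuntimeError.
```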
47
+ def require_file(path: Path, step_name: str) -> None:
+     if not path.exists():
+         raise FileNotFoundError(f"[{step_name}] expected output not found: {path}")
+
+
+ def main() -> None:
+     ap = argparse.ArgumentParser(description="Run full meeting summarization pipeline.")
+     ap.add_argument("--video", required=True, help="Path to meeting video/audio input.")
+     ap.add_argument("--out", required=True, help="Output directory for pipeline artifacts.")
+
+     ap.add_argument("--python", default=sys.executable, help="Python executable to use.")
+
+     ap.add_argument("--deepgram-model", default="nova-3", help="Deepgram model.")
+     ap.add_argument("--deepgram-language", default=None, help="Deepgram language (optional).")
+     ap.add_argument(
+         "--deepgram-raw-out",
+         default=None,
+         help="Optional path for raw Deepgram response JSON.",
+     )
+     ap.add_argument(
+         "--deepgram-request-timeout-sec",
+         type=float,
+         default=1200.0,
+         help="HTTP request timeout for Deepgram call.",
+     )
+     ap.add_argument(
+         "--deepgram-connect-timeout-sec",
+         type=float,
+         default=30.0,
+         help="HTTP connect timeout for Deepgram call.",
+     )
+     ap.add_argument(
+         "--deepgram-retries",
+         type=int,
+         default=3,
+         help="Retry attempts for Deepgram call.",
+     )
+     ap.add_argument(
+         "--deepgram-retry-backoff-sec",
+         type=float,
+         default=2.0,
+         help="Base retry backoff seconds for Deepgram call.",
+     )
+     ap.add_argument(
+         "--force-deepgram",
+         action="store_true",
+         help="Re-run Deepgram even if utterances.json already exists.",
+     )
+
+     ap.add_argument("--force-keyframes", action="store_true", help="Pass --force to smart keyframe script.")
+     ap.add_argument("--pre-roll-sec", type=float, default=3.0, help="Pre-roll seconds for utterance assignment.")
+
+     ap.add_argument("--gemini-model", default="gemini-2.5-flash", help="Gemini model id.")
+     ap.add_argument("--similarity-threshold", type=float, default=0.82, help="Similarity threshold for build step.")
+     ap.add_argument("--temperature", type=float, default=0.2, help="Gemini temperature for build step.")
+     args = ap.parse_args()
+
+     repo_dir = Path(__file__).resolve().parent
+     out_dir = Path(args.out).resolve()
+     out_dir.mkdir(parents=True, exist_ok=True)
+
+     video_path = Path(args.video).resolve()
+     if not video_path.exists():
+         raise FileNotFoundError(f"Input video not found: {video_path}")
+
+     deepgram_script = repo_dir / "deepgram_extract_utterances.py"
+     smart_kf_script = repo_dir / "smart_keyframes_and_classify.py"
+     assign_script = repo_dir / "assign_utterances_to_keyframes.py"
+     build_script = repo_dir / "build_final_output.py"
+     condense_script = repo_dir / "condense_final_output.py"
+
+     for s in [deepgram_script, smart_kf_script, assign_script, build_script, condense_script]:
+         if not s.exists():
+             raise FileNotFoundError(f"Script not found: {s}")
+
+     utterances_json = out_dir / "utterances.json"
+     keyframes_parsed_json = out_dir / "keyframes_parsed.json"
+     keyframes_with_utterances_json = out_dir / "keyframes_with_utterances.json"
+     final_output_json = out_dir / "final_output.json"
+     final_output_condensed_json = out_dir / "final_output_condensed.json"
+     deepgram_raw_json = Path(args.deepgram_raw_out).resolve() if args.deepgram_raw_out else None
+
+     python_exe = str(Path(args.python))
+
+     # 1 + 2 in parallel
+     deepgram_cmd = [
+         python_exe,
+         str(deepgram_script),
+         str(video_path),
+         "-o",
+         str(utterances_json),
+         "--model",
+         str(args.deepgram_model),
+         "--request-timeout-sec",
+         str(args.deepgram_request_timeout_sec),
+         "--connect-timeout-sec",
+         str(args.deepgram_connect_timeout_sec),
+         "--retries",
+         str(args.deepgram_retries),
+         "--retry-backoff-sec",
+         str(args.deepgram_retry_backoff_sec),
149
+ if args.deepgram_language:
150
+ deepgram_cmd.extend(["--language", str(args.deepgram_language)])
151
+ if deepgram_raw_json is not None:
152
+ deepgram_cmd.extend(["--raw", str(deepgram_raw_json)])
153
+
154
+ smart_kf_cmd = [
155
+ python_exe,
156
+ str(smart_kf_script),
157
+ "--video",
158
+ str(video_path),
159
+ "--out",
160
+ str(out_dir),
161
+ ]
162
+ if args.force_keyframes:
163
+ smart_kf_cmd.append("--force")
164
+
165
+ parallel_commands: List[Tuple[str, List[str]]] = []
166
+ if args.force_deepgram or (not utterances_json.exists()):
167
+ parallel_commands.append(("deepgram_extract_utterances", deepgram_cmd))
168
+ else:
169
+ print(f"[deepgram_extract_utterances] SKIP (exists): {utterances_json}")
170
+
171
+ if args.force_keyframes or (not keyframes_parsed_json.exists()):
172
+ parallel_commands.append(("smart_keyframes_and_classify", smart_kf_cmd))
173
+ else:
174
+ print(f"[smart_keyframes_and_classify] SKIP (exists): {keyframes_parsed_json}")
175
+
176
+ if parallel_commands:
177
+ print("Running Step 1+2 in parallel...")
178
+ run_parallel(parallel_commands, cwd=repo_dir)
179
+ else:
180
+ print("Skipping Step 1+2 (all required artifacts already exist).")
181
+
182
+ require_file(utterances_json, "deepgram_extract_utterances")
183
+ require_file(keyframes_parsed_json, "smart_keyframes_and_classify")
184
+
185
+ # 3 assign
186
+ assign_cmd = [
187
+ python_exe,
188
+ str(assign_script),
189
+ str(keyframes_parsed_json),
190
+ str(utterances_json),
191
+ "-o",
192
+ str(keyframes_with_utterances_json),
193
+ "--pre-roll-sec",
194
+ str(args.pre_roll_sec),
195
+ ]
196
+ run_command("assign_utterances_to_keyframes", assign_cmd, cwd=repo_dir)
197
+ require_file(keyframes_with_utterances_json, "assign_utterances_to_keyframes")
198
+
199
+ # 4 build
200
+ build_cmd = [
201
+ python_exe,
202
+ str(build_script),
203
+ "--keyframes",
204
+ str(keyframes_with_utterances_json),
205
+ "--out",
206
+ str(final_output_json),
207
+ "--model",
208
+ str(args.gemini_model),
209
+ "--similarity_threshold",
210
+ str(args.similarity_threshold),
211
+ "--temperature",
212
+ str(args.temperature),
213
+ ]
214
+ run_command("build_final_output", build_cmd, cwd=repo_dir)
215
+ require_file(final_output_json, "build_final_output")
216
+
217
+ # 5 condense
218
+ condense_cmd = [
219
+ python_exe,
220
+ str(condense_script),
221
+ "--in",
222
+ str(final_output_json),
223
+ "--out",
224
+ str(final_output_condensed_json),
225
+ ]
226
+ run_command("condense_final_output", condense_cmd, cwd=repo_dir)
227
+ require_file(final_output_condensed_json, "condense_final_output")
228
+
229
+ print("\nPipeline completed successfully.")
230
+ print(f"Utterances: {utterances_json}")
231
+ print(f"Keyframes parsed: {keyframes_parsed_json}")
232
+ print(f"Keyframes+utterances: {keyframes_with_utterances_json}")
233
+ print(f"Final output: {final_output_json}")
234
+ print(f"Condensed output: {final_output_condensed_json}")
235
+
236
+
237
+ if __name__ == "__main__":
238
+ main()
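
A minimal invocation sketch for this orchestrator, using the same `subprocess` pattern as its own `run_command`; the video and output paths are hypothetical:

```python
# Minimal sketch: run the full pipeline end to end from another script.
# "meeting.mp4" and "out_run1" are hypothetical paths.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "pipelines/run_pipeline_all.py",
        "--video", "meeting.mp4",
        "--out", "out_run1",
    ],
    check=True,  # raise CalledProcessError if any pipeline step fails
)
# On success, out_run1/ holds utterances.json, keyframes_parsed.json,
# keyframes_with_utterances.json, final_output.json, and final_output_condensed.json.
```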
pipelines/run_pipeline_demo_code.py ADDED
@@ -0,0 +1,239 @@
+ #!/usr/bin/env python3
+ """
+ Demo-only Gemini pipeline orchestrator (kept in demo-code route for compatibility).
+
+ Pipeline steps:
+ 1) deepgram_extract_utterances.py (parallel)
+ 2) smart_keyframes_and_classify.py (parallel)
+ 3) assign_utterances_to_keyframes.py
+ 4) build_final_output_demo_code.py (Gemini for demo only; slides+code local OCR/transcript)
+ 5) condense_final_output.py
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import subprocess
+ import sys
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from pathlib import Path
+ from typing import List, Sequence, Tuple
+
+
+ def run_command(name: str, cmd: Sequence[str], cwd: Path) -> None:
+     start = time.perf_counter()
+     print(f"\n[{name}] START")
+     print(f"[{name}] CMD: {' '.join(cmd)}")
+     result = subprocess.run(cmd, cwd=str(cwd))
+     dur = time.perf_counter() - start
+     if result.returncode != 0:
+         raise RuntimeError(f"[{name}] failed with exit code {result.returncode}")
+     print(f"[{name}] DONE in {dur:.2f}s")
+
+
+ def run_parallel(commands: List[Tuple[str, List[str]]], cwd: Path) -> None:
+     if not commands:
+         return
+     with ThreadPoolExecutor(max_workers=len(commands)) as ex:
+         futures = {ex.submit(run_command, name, cmd, cwd): name for name, cmd in commands}
+         for fut in as_completed(futures):
+             fut.result()
+
+
+ def require_file(path: Path, step_name: str) -> None:
+     if not path.exists():
+         raise FileNotFoundError(f"[{step_name}] expected output not found: {path}")
+
+
+ def main() -> None:
+     ap = argparse.ArgumentParser(description="Run demo-only Gemini meeting pipeline (demo-code route alias).")
+     ap.add_argument("--video", required=True, help="Path to meeting video/audio input.")
+     ap.add_argument("--out", required=True, help="Output directory for pipeline artifacts.")
+
+     ap.add_argument("--python", default=sys.executable, help="Python executable to use.")
+
+     ap.add_argument("--deepgram-model", default="nova-3", help="Deepgram model.")
+     ap.add_argument("--deepgram-language", default=None, help="Deepgram language (optional).")
+     ap.add_argument(
+         "--deepgram-raw-out",
+         default=None,
+         help="Optional path for raw Deepgram response JSON.",
+     )
+     ap.add_argument(
+         "--deepgram-request-timeout-sec",
+         type=float,
+         default=1200.0,
+         help="HTTP request timeout for Deepgram call.",
+     )
+     ap.add_argument(
+         "--deepgram-connect-timeout-sec",
+         type=float,
+         default=30.0,
+         help="HTTP connect timeout for Deepgram call.",
+     )
+     ap.add_argument(
+         "--deepgram-retries",
+         type=int,
+         default=3,
+         help="Retry attempts for Deepgram call.",
+     )
+     ap.add_argument(
+         "--deepgram-retry-backoff-sec",
+         type=float,
+         default=2.0,
+         help="Base retry backoff seconds for Deepgram call.",
+     )
+     ap.add_argument(
+         "--force-deepgram",
+         action="store_true",
+         help="Re-run Deepgram even if utterances.json already exists.",
+     )
+
+     ap.add_argument("--force-keyframes", action="store_true", help="Pass --force to smart keyframe script.")
+     ap.add_argument("--pre-roll-sec", type=float, default=3.0, help="Pre-roll seconds for utterance assignment.")
+
+     ap.add_argument("--gemini-model", default="gemini-2.5-flash", help="Gemini model id.")
+     ap.add_argument(
+         "--similarity-threshold",
+         type=float,
+         default=0.82,
+         help="Similarity threshold for demo prompt reuse logic.",
+     )
+     ap.add_argument("--temperature", type=float, default=0.2, help="Gemini temperature for demo keyframes.")
+     args = ap.parse_args()
+
+     pipeline_dir = Path(__file__).resolve().parent
+     repo_dir = pipeline_dir
+
+     out_dir = Path(args.out).resolve()
+     out_dir.mkdir(parents=True, exist_ok=True)
+
+     video_path = Path(args.video).resolve()
+     if not video_path.exists():
+         raise FileNotFoundError(f"Input video not found: {video_path}")
+
+     deepgram_script = repo_dir / "deepgram_extract_utterances.py"
+     smart_kf_script = repo_dir / "smart_keyframes_and_classify.py"
+     assign_script = repo_dir / "assign_utterances_to_keyframes.py"
+     build_demo_script = pipeline_dir / "build_final_output_demo_code.py"
+     condense_script = repo_dir / "condense_final_output.py"
+
+     for s in [deepgram_script, smart_kf_script, assign_script, build_demo_script, condense_script]:
+         if not s.exists():
+             raise FileNotFoundError(f"Script not found: {s}")
+
+     utterances_json = out_dir / "utterances.json"
+     keyframes_parsed_json = out_dir / "keyframes_parsed.json"
+     keyframes_with_utterances_json = out_dir / "keyframes_with_utterances.json"
+     final_output_json = out_dir / "final_output_demo_code.json"
+     final_output_condensed_json = out_dir / "final_output_demo_code_condensed.json"
+     deepgram_raw_json = Path(args.deepgram_raw_out).resolve() if args.deepgram_raw_out else None
+
+     python_exe = str(Path(args.python))
+
+     deepgram_cmd = [
+         python_exe,
+         str(deepgram_script),
+         str(video_path),
+         "-o",
+         str(utterances_json),
+         "--model",
+         str(args.deepgram_model),
+         "--request-timeout-sec",
+         str(args.deepgram_request_timeout_sec),
+         "--connect-timeout-sec",
+         str(args.deepgram_connect_timeout_sec),
+         "--retries",
+         str(args.deepgram_retries),
+         "--retry-backoff-sec",
+         str(args.deepgram_retry_backoff_sec),
+     ]
+     if args.deepgram_language:
+         deepgram_cmd.extend(["--language", str(args.deepgram_language)])
+     if deepgram_raw_json is not None:
+         deepgram_cmd.extend(["--raw", str(deepgram_raw_json)])
+
+     smart_kf_cmd = [
+         python_exe,
+         str(smart_kf_script),
+         "--video",
+         str(video_path),
+         "--out",
+         str(out_dir),
+         "--no-yolo-for-non-demo",
+     ]
+     if args.force_keyframes:
+         smart_kf_cmd.append("--force")
+
+     parallel_commands: List[Tuple[str, List[str]]] = []
+     if args.force_deepgram or (not utterances_json.exists()):
+         parallel_commands.append(("deepgram_extract_utterances", deepgram_cmd))
+     else:
+         print(f"[deepgram_extract_utterances] SKIP (exists): {utterances_json}")
+
+     if args.force_keyframes or (not keyframes_parsed_json.exists()):
+         parallel_commands.append(("smart_keyframes_and_classify", smart_kf_cmd))
+     else:
+         print(f"[smart_keyframes_and_classify] SKIP (exists): {keyframes_parsed_json}")
+
+     if parallel_commands:
+         print("Running Step 1+2 in parallel...")
+         run_parallel(parallel_commands, cwd=repo_dir)
+     else:
+         print("Skipping Step 1+2 (all required artifacts already exist).")
+
+     require_file(utterances_json, "deepgram_extract_utterances")
+     require_file(keyframes_parsed_json, "smart_keyframes_and_classify")
+
+     assign_cmd = [
+         python_exe,
+         str(assign_script),
+         str(keyframes_parsed_json),
+         str(utterances_json),
+         "-o",
+         str(keyframes_with_utterances_json),
+         "--pre-roll-sec",
+         str(args.pre_roll_sec),
+     ]
+     run_command("assign_utterances_to_keyframes", assign_cmd, cwd=repo_dir)
+     require_file(keyframes_with_utterances_json, "assign_utterances_to_keyframes")
+
+     build_cmd = [
+         python_exe,
+         str(build_demo_script),
+         "--keyframes",
+         str(keyframes_with_utterances_json),
+         "--out",
+         str(final_output_json),
+         "--model",
+         str(args.gemini_model),
+         "--similarity-threshold",
+         str(args.similarity_threshold),
+         "--temperature",
+         str(args.temperature),
+     ]
+     run_command("build_final_output_demo_code", build_cmd, cwd=repo_dir)
+     require_file(final_output_json, "build_final_output_demo_code")
+
+     condense_cmd = [
+         python_exe,
+         str(condense_script),
+         "--in",
+         str(final_output_json),
+         "--out",
+         str(final_output_condensed_json),
+     ]
+     run_command("condense_final_output", condense_cmd, cwd=repo_dir)
+     require_file(final_output_condensed_json, "condense_final_output")
+
+     print("\nDemo-only Gemini pipeline completed successfully.")
+     print(f"Utterances: {utterances_json}")
+     print(f"Keyframes parsed: {keyframes_parsed_json}")
+     print(f"Keyframes+utterances: {keyframes_with_utterances_json}")
+     print(f"Final output (demo-only Gemini): {final_output_json}")
+     print(f"Condensed output (demo-only Gemini): {final_output_condensed_json}")
+
+
+ if __name__ == "__main__":
+     main()
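
The demo-code variant writes differently named final artifacts (`final_output_demo_code.json` and `final_output_demo_code_condensed.json`). A short sketch of loading the condensed result after a run; the output directory is hypothetical, and the JSON schema is whatever `condense_final_output.py` emits, so it is inspected rather than assumed:

```python
# Minimal sketch: load the condensed artifact produced by the demo-code route.
import json
from pathlib import Path

out_dir = Path("out_run1")  # hypothetical --out directory
condensed_path = out_dir / "final_output_demo_code_condensed.json"

with condensed_path.open(encoding="utf-8") as f:
    condensed = json.load(f)

# Inspect the top-level structure before relying on specific keys.
print(type(condensed).__name__)
```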
pipelines/smart_keyframes_and_classify.py ADDED
@@ -0,0 +1,1443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # smart_keyframes_and_classify.py
2
+ import argparse
3
+ import json
4
+ import os
5
+ import time
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ import re
10
+ import concurrent.futures as cf
11
+
12
+ import cv2
13
+ import numpy as np
14
+ from dotenv import load_dotenv
15
+
16
+ try:
17
+ import clip
18
+ import torch
19
+ from PIL import Image
20
+ except Exception:
21
+ clip = None
22
+ torch = None
23
+ Image = None
24
+
25
+ # Local models (layout + OCR)
26
+ # pip install ultralytics paddleocr paddlepaddle opencv-python numpy python-dotenv
27
+ from ultralytics import YOLO
28
+
29
+ # Avoid oneDNN fused-conv issues seen in some Paddle/PaddleOCR builds on CPU.
30
+ # Use hard overrides (not setdefault) so shell/.env values cannot re-enable it.
31
+ os.environ["FLAGS_use_mkldnn"] = "0"
32
+ os.environ["FLAGS_enable_mkldnn"] = "0"
33
+ os.environ["FLAGS_use_onednn"] = "0"
34
+
35
+ # Compatibility patch for NumPy>=2 with imgaug (transitive dep of PaddleOCR).
36
+ # imgaug expects np.sctypes, removed in NumPy 2.0.
37
+ if not hasattr(np, "sctypes"):
38
+ def _np_type(name: str, default):
39
+ return getattr(np, name, default)
40
+
41
+ np.sctypes = {
42
+ "int": [_np_type("int8", int), _np_type("int16", int), _np_type("int32", int), _np_type("int64", int)],
43
+ "uint": [_np_type("uint8", int), _np_type("uint16", int), _np_type("uint32", int), _np_type("uint64", int)],
44
+ "float": [_np_type("float16", float), _np_type("float32", float), _np_type("float64", float)],
45
+ "complex": [_np_type("complex64", complex), _np_type("complex128", complex)],
46
+ "others": [_np_type("bool_", bool), _np_type("object_", object), _np_type("str_", str), _np_type("bytes_", bytes)],
47
+ }
48
+
49
+ from paddleocr import PaddleOCR
50
+
51
+
52
+ # ============================================================
53
+ # EDIT THESE IN CODE (no tuning args needed in the command)
54
+ # ============================================================
55
+
56
+
57
+ def _env_bool(name: str, default: bool) -> bool:
58
+ raw = os.getenv(name)
59
+ if raw is None:
60
+ return bool(default)
61
+ return str(raw).strip().lower() in {"1", "true", "yes", "y", "on"}
62
+
63
+
64
+ def _auto_has_cuda() -> bool:
65
+ try:
66
+ return bool(torch is not None and torch.cuda.is_available())
67
+ except Exception:
68
+ return False
69
+
70
+ # Candidate sampling (local, no API)
71
+ SAMPLE_FPS = 1.0
72
+ RESIZE_W = 360
73
+ CANDIDATE_PERCENTILE = 70.0
74
+ MAX_CANDIDATES = 180
75
+
76
+ # Final cap
77
+ MAX_FRAMES = 150
78
+
79
+ # Fast/parse resize for local inference (CLIP)
80
+ FAST_FRAME_MAX_W = 720
81
+
82
+ # Parallelism removed (no LLM calls)
83
+ BASE_SLEEP_SEC = 0.0
84
+
85
+ # Local screen parsing (required)
86
+ ENABLE_LOCAL_SCREEN_PARSE = True
87
+
88
+ # Layout detector weights (DocLayNet-style YOLO weights recommended)
89
+ # Example: models/yolov8n-doclaynet.pt
90
+ LAYOUT_YOLO_WEIGHTS = os.getenv("LAYOUT_YOLO_WEIGHTS", "models/yolov8x-doclaynet.pt")
91
+ LAYOUT_CONF = float(os.getenv("LAYOUT_CONF", "0.25"))
92
+ LAYOUT_IOU = float(os.getenv("LAYOUT_IOU", "0.45"))
93
+
94
+ # YOLO runtime settings
95
+ # Defaults are deployment-safe (CPU on non-GPU hosts), but can be overridden via env.
96
+ YOLO_DEVICE = os.getenv("YOLO_DEVICE", "0" if _auto_has_cuda() else "cpu")
97
+ YOLO_IMGSZ = int(os.getenv("YOLO_IMGSZ", "640")) # try 512 for more speed if acceptable
98
+
99
+ # OCR
100
+ OCR_LANG = os.getenv("OCR_LANG", "en")
101
+ OCR_MIN_CONF = float(os.getenv("OCR_MIN_CONF", "0.45"))
102
+
103
+ # OCR runtime settings (GPU + crop-only OCR)
104
+ USE_GPU = _env_bool("OCR_GPU", _auto_has_cuda())
105
+ OCR_CROP_MAX_REGIONS = int(os.getenv("OCR_CROP_MAX_REGIONS", "10"))
106
+
107
+ # Downscale OCR crops by frame type (slides/demo faster; code keeps max)
108
+ OCR_CROP_SCALE_BY_TYPE = {
109
+ "slides": float(os.getenv("OCR_CROP_SCALE_SLIDES", "0.80")),
110
+ "demo": float(os.getenv("OCR_CROP_SCALE_DEMO", "0.75")),
111
+ "code": float(os.getenv("OCR_CROP_SCALE_CODE", "1.00")),
112
+ "none": float(os.getenv("OCR_CROP_SCALE_NONE", "0.75")),
113
+ }
114
+
115
+ # Resize input frame BEFORE YOLO+OCR in step 3 (slides/demo smaller; code max)
116
+ PARSE_MAX_W_BY_TYPE = {
117
+ "slides": int(os.getenv("PARSE_MAX_W_SLIDES", "1280")),
118
+ "demo": int(os.getenv("PARSE_MAX_W_DEMO", "1280")),
119
+ "none": int(os.getenv("PARSE_MAX_W_NONE", "1280")),
120
+ "code": int(os.getenv("PARSE_MAX_W_CODE", "99999")), # effectively "no resize"
121
+ }
122
+
123
+ # CLIP frame type classifier
124
+ # -----------------------------
125
+ # CLIP setup (more robust, fewer “code” false-positives)
126
+ # Strategy:
127
+ # 1) Use multiple POS prompts per class (ensembling)
128
+ # 2) Add NEG prompts per class (especially for "code") and score = mean(pos) - mean(neg)
129
+ # This makes "slides with code screenshots" stay as slides, and prevents "demo with code words" -> code.
130
+ # -----------------------------
131
+
132
+ CLIP_MODEL_NAME = os.getenv("CLIP_MODEL_NAME", "ViT-B/32")
133
+
134
+ # class labels (keep as-is)
135
+ CLIP_CLASS_LABELS = ["slides", "code", "demo", "none"]
136
+
137
+ # scoring mode used by your classifier code (implement if you haven't):
138
+ # score(class) = mean(sim(image, pos_prompts)) - mean(sim(image, neg_prompts))
139
+ CLIP_SCORE_MODE = os.getenv("CLIP_SCORE_MODE", "pos_minus_neg")
140
+
141
+ # If your pipeline supports a minimum margin between top-1 and top-2 to accept the prediction:
142
+ # (helps when frames are ambiguous)
143
+ CLIP_MIN_MARGIN = float(os.getenv("CLIP_MIN_MARGIN", "0.03"))
144
+
145
+ # Prompt bank: POS and NEG per class
146
+ CLIP_PROMPT_BANK = {
147
+ "slides": {
148
+ "pos": [
149
+ "a screenshot of a presentation slide (PowerPoint or Google Slides)",
150
+ "a slide with a large title at the top and bullet points below",
151
+ "a slide canvas with wide margins and centered content",
152
+ "a lecture slide with sections, headings, and bullet lists",
153
+ "a slide that may include a small embedded screenshot (code or UI) but is still a slide",
154
+ "a shared slide deck page in a video meeting (16:9 slide layout)",
155
+ ],
156
+ "neg": [
157
+ "a full screen web application dashboard with navigation sidebar",
158
+ "a desktop application interface with many clickable controls",
159
+ "a full screen code editor filling the screen",
160
+ "a terminal window filling the screen",
161
+ "a webcam grid of meeting participants",
162
+ ],
163
+ },
164
+
165
+ "code": {
166
+ "pos": [
167
+ "a full screen code editor filling most of the screen with many lines of code",
168
+ "an IDE with syntax highlighting and line numbers, code dominates the screen",
169
+ "a programming editor with file tree sidebar and editor pane, not inside a slide",
170
+ "a terminal and code editor side by side with readable code dominating",
171
+ ],
172
+ "neg": [
173
+ "a presentation slide that contains a screenshot of code",
174
+ "a slide with a code snippet as part of a slide deck",
175
+ "a slide with a code image and slide title and bullets",
176
+ "a demo UI screen that contains a small code panel",
177
+ ],
178
+ },
179
+
180
+ "demo": {
181
+ "pos": [
182
+ "a web application dashboard with a left navigation sidebar and multiple panels",
183
+ "a product user interface with buttons, menus, input fields, and toolbars",
184
+ "a browser-based app with tabs, filters, tables, charts, and navigation",
185
+ "a desktop software UI with controls, forms, and interactive elements",
186
+ "a product demo screen where the interface fills the screen (not a slide canvas)",
187
+ ],
188
+ "neg": [
189
+ "a PowerPoint or Google Slides presentation slide",
190
+ "a slide with title at top and bullet points",
191
+ "a slide deck page with large margins and a single canvas",
192
+ "a slide with an embedded screenshot of a UI",
193
+ "a slide with a cursor hovering over a tab",
194
+ "a slide with a code snippet or code screenshot",
195
+ ],
196
+ },
197
+
198
+ "none": {
199
+ "pos": [
200
+ "a video call gallery view with participants and no shared screen",
201
+ "a mostly blank screen or black screen",
202
+ "a blurred transition frame with no readable content",
203
+ "a loading screen with minimal content",
204
+ ],
205
+ "neg": [
206
+ "a presentation slide",
207
+ "a web application dashboard",
208
+ "a full screen code editor",
209
+ ],
210
+ },
211
+ }
212
+
213
+ CLIP_CLASS_PROMPTS = [CLIP_PROMPT_BANK[c]["pos"] for c in CLIP_CLASS_LABELS]
214
+ CLIP_CLASS_NEG_PROMPTS = [CLIP_PROMPT_BANK[c]["neg"] for c in CLIP_CLASS_LABELS]
215
+
216
+ # Caps for JSON size
217
+ MAX_OCR_LINES = 300
218
+
219
+ # ---- NEW: hard global time gap between kept keyframes ----
220
+ MIN_KEYFRAME_GAP_SEC = 3.0
221
+
222
+ # Sensitivity rules (VISUAL ONLY)
223
+ SENS = {
224
+ "slides": {"min_gap_sec": 1.2, "diff_mult": 1.60},
225
+ "code": {"min_gap_sec": 0.8, "diff_mult": 0.70},
226
+ "demo": {"min_gap_sec": 0.45, "diff_mult": 0.60},
227
+ "none": {"min_gap_sec": 0.55, "diff_mult": 0.95},
228
+ }
229
+
230
+ # Concurrent parsing workers (YOLO + OCR) for KEPT keyframes
231
+ PARSE_WORKERS = int(os.getenv("PARSE_WORKERS", "2"))
232
+
233
+
234
+ # ----------------------------
235
+ # Data structures
236
+ # ----------------------------
237
+
238
+ @dataclass
239
+ class CandidateFrame:
240
+ t_sec: float
241
+ frame_idx: int
242
+ diff_score: float # diff vs previous sampled frame (local)
243
+
244
+
245
+ # ----------------------------
246
+ # Utils
247
+ # ----------------------------
248
+
249
+ def fmt_hhmmss(sec: float) -> str:
250
+ sec = max(0.0, float(sec))
251
+ h = int(sec // 3600)
252
+ m = int((sec % 3600) // 60)
253
+ s = int(sec % 60)
254
+ return f"{h:02d}:{m:02d}:{s:02d}"
255
+
256
+
257
+ def safe_read_json(path: Path) -> Any:
258
+ return json.loads(path.read_text(encoding="utf-8"))
259
+
260
+
261
+ def safe_write_json(path: Path, obj: Any) -> None:
262
+ path.parent.mkdir(parents=True, exist_ok=True)
263
+ path.write_text(json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8")
264
+
265
+
266
+ def _probe_video(video_path: Path) -> Tuple[float, float, int]:
267
+ cap = cv2.VideoCapture(str(video_path))
268
+ if not cap.isOpened():
269
+ raise RuntimeError(f"Could not open video: {video_path}")
270
+ fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
271
+ frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
272
+ duration = float(frames / fps) if frames else 0.0
273
+ cap.release()
274
+ return float(fps), float(duration), int(frames)
275
+
276
+
277
+ def _mad_diff(a: np.ndarray, b: np.ndarray) -> float:
278
+ return float(np.mean(np.abs(a.astype(np.int16) - b.astype(np.int16))))
279
+
280
+
281
+ def _downscale_gray(frame_bgr: np.ndarray, resize_w: int) -> np.ndarray:
282
+ h, w = frame_bgr.shape[:2]
283
+ new_w = int(resize_w)
284
+ new_h = int(h * (new_w / max(1, w)))
285
+ small = cv2.resize(frame_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
286
+ return cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
287
+
288
+
289
+ def _resize_frame_max_w(frame_bgr: np.ndarray, max_w: int) -> np.ndarray:
290
+ h, w = frame_bgr.shape[:2]
291
+ if w <= max_w:
292
+ return frame_bgr
293
+ new_w = int(max_w)
294
+ new_h = int(h * (new_w / w))
295
+ return cv2.resize(frame_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
296
+
297
+
298
+ def _single_line(s: str, max_len: int = 220) -> str:
299
+ if s is None:
300
+ return ""
301
+ s = str(s).replace("\r", " ").replace("\n", " ")
302
+ s = re.sub(r"\s+", " ", s).strip()
303
+ if len(s) > max_len:
304
+ s = s[: max(0, max_len - 1)].rstrip() + "…"
305
+ return s
306
+
307
+
308
+ # ----------------------------
309
+ # Video frame reader (single capture)
310
+ # ----------------------------
311
+
312
+ class VideoReader:
313
+ def __init__(self, video_path: Path):
314
+ self.cap = cv2.VideoCapture(str(video_path))
315
+ if not self.cap.isOpened():
316
+ raise RuntimeError(f"Could not open video: {video_path}")
317
+
318
+ def read_at_frame(self, frame_idx: int) -> Optional[np.ndarray]:
319
+ self.cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
320
+ ret, frame = self.cap.read()
321
+ if not ret:
322
+ return None
323
+ return frame
324
+
325
+ def close(self) -> None:
326
+ try:
327
+ self.cap.release()
328
+ except Exception:
329
+ pass
330
+
331
+
332
+ # ----------------------------
333
+ # Local screen parse helpers (YOLO layout + PaddleOCR)
334
+ # ----------------------------
335
+
336
+ def _xyxy_to_int(xyxy):
337
+ x1, y1, x2, y2 = xyxy
338
+ return [int(round(x1)), int(round(y1)), int(round(x2)), int(round(y2))]
339
+
340
+
341
+ def _clip_box(box, w, h):
342
+ x1, y1, x2, y2 = box
343
+ x1 = max(0, min(x1, w - 1))
344
+ y1 = max(0, min(y1, h - 1))
345
+ x2 = max(0, min(x2, w - 1))
346
+ y2 = max(0, min(y2, h - 1))
347
+ if x2 < x1:
348
+ x1, x2 = x2, x1
349
+ if y2 < y1:
350
+ y1, y2 = y2, y1
351
+ return [x1, y1, x2, y2]
352
+
353
+
354
+ def _box_center(box):
355
+ x1, y1, x2, y2 = box
356
+ return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)
357
+
358
+
359
+ def _zone_for_box(box, W, H):
360
+ cx, cy = _box_center(box)
361
+ if cy < 0.18 * H:
362
+ return "top"
363
+ if cy > 0.85 * H:
364
+ return "bottom"
365
+ if cx < 0.33 * W:
366
+ return "left"
367
+ if cx > 0.67 * W:
368
+ return "right"
369
+ return "center"
370
+
371
+
372
+ def _sort_reading_order(items):
373
+ return sorted(items, key=lambda it: (it["box"][1], it["box"][0]))
374
+
375
+
376
+ def run_layout_yolo(layout_model: YOLO, frame_bgr: np.ndarray) -> List[dict]:
377
+ H, W = frame_bgr.shape[:2]
378
+ res = layout_model.predict(
379
+ source=frame_bgr,
380
+ conf=LAYOUT_CONF,
381
+ iou=LAYOUT_IOU,
382
+ imgsz=YOLO_IMGSZ,
383
+ device=YOLO_DEVICE,
384
+ verbose=False
385
+ )[0]
386
+
387
+ regions = []
388
+ names = res.names
389
+ if res.boxes is None:
390
+ return regions
391
+
392
+ for b in res.boxes:
393
+ cls_id = int(b.cls.item())
394
+ conf = float(b.conf.item())
395
+ label = str(names.get(cls_id, f"class_{cls_id}"))
396
+ box = _xyxy_to_int(b.xyxy[0].tolist())
397
+ box = _clip_box(box, W, H)
398
+ regions.append({"label": label, "conf": conf, "box": box})
399
+
400
+ return _sort_reading_order(regions)
401
+
402
+
403
+ def run_paddle_ocr(ocr: PaddleOCR, frame_bgr: np.ndarray) -> List[dict]:
404
+ # Full-frame OCR fallback (kept for safety), with angle cls OFF (cls=False)
405
+ H, W = frame_bgr.shape[:2]
406
+ out = []
407
+
408
+ rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
409
+ result = ocr.ocr(rgb, cls=False)
410
+ if not result:
411
+ return out
412
+
413
+ lines = result[0] if isinstance(result, list) and len(result) > 0 else []
414
+ if lines is None:
415
+ return out
416
+ if not isinstance(lines, list):
417
+ return out
418
+
419
+ for line in lines:
420
+ if line is None or not isinstance(line, (list, tuple)) or len(line) < 2:
421
+ continue
422
+ quad = line[0]
423
+ pair = line[1]
424
+ if quad is None or pair is None:
425
+ continue
426
+ if not isinstance(pair, (list, tuple)) or len(pair) < 2:
427
+ continue
428
+ text, conf = pair[0], pair[1]
429
+ conf = float(conf)
430
+ if conf < OCR_MIN_CONF:
431
+ continue
432
+
433
+ if not isinstance(quad, (list, tuple)) or len(quad) == 0:
434
+ continue
435
+ xs = [p[0] for p in quad]
436
+ ys = [p[1] for p in quad]
437
+ x1, y1, x2, y2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))
438
+ box = _clip_box([x1, y1, x2, y2], W, H)
439
+
440
+ txt = _single_line(text, max_len=220)
441
+ if not txt:
442
+ continue
443
+
444
+ out.append({
445
+ "text": txt,
446
+ "conf": conf,
447
+ "quad": [[float(p[0]), float(p[1])] for p in quad],
448
+ "box": box,
449
+ })
450
+
451
+ if len(out) >= int(MAX_OCR_LINES):
452
+ break
453
+
454
+ return _sort_reading_order(out)
455
+
456
+
457
+ def _is_text_heavy_label(label: str) -> bool:
458
+ lab = (label or "").lower()
459
+ keys = ["title", "text", "list", "table", "header", "heading"]
460
+ return any(k in lab for k in keys)
461
+
462
+
463
+ def _crop_and_scale(frame_bgr: np.ndarray, box: List[int], scale: float) -> Optional[np.ndarray]:
464
+ x1, y1, x2, y2 = box
465
+ crop = frame_bgr[y1:y2, x1:x2]
466
+ if crop is None or crop.size == 0:
467
+ return None
468
+ if scale is None or float(scale) >= 0.999:
469
+ return crop
470
+ return cv2.resize(crop, (0, 0), fx=float(scale), fy=float(scale), interpolation=cv2.INTER_AREA)
471
+
472
+
473
+ def run_paddle_ocr_on_text_regions(
474
+ ocr: PaddleOCR,
475
+ frame_bgr: np.ndarray,
476
+ regions: List[dict],
477
+ frame_type: str,
478
+ max_regions: int = 10,
479
+ ) -> List[dict]:
480
+ """
481
+ OCR ONLY on YOLO text-heavy regions (title/text/list/table/header).
482
+ Angle classifier is OFF via cls=False.
483
+ Crops are optionally downscaled by frame_type (slides/demo faster, code max).
484
+ """
485
+ H, W = frame_bgr.shape[:2]
486
+ out: List[dict] = []
487
+
488
+ scale = float(OCR_CROP_SCALE_BY_TYPE.get(str(frame_type), 0.80))
489
+
490
+ text_regions = [r for r in regions if _is_text_heavy_label(r.get("label", ""))]
491
+ text_regions = text_regions[: int(max_regions)]
492
+
493
+ # If YOLO didn't detect any text region, fallback to full-frame OCR
494
+ if not text_regions:
495
+ return run_paddle_ocr(ocr, frame_bgr)
496
+
497
+ for r in text_regions:
498
+ box = r["box"]
499
+ x1, y1, x2, y2 = box
500
+
501
+ crop = _crop_and_scale(frame_bgr, box, scale=scale)
502
+ if crop is None or crop.size == 0:
503
+ continue
504
+
505
+ rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
506
+ res = ocr.ocr(rgb, cls=False) # cls OFF (angle cls OFF)
507
+ lines = res[0] if res else []
508
+ if lines is None or not isinstance(lines, list):
509
+ continue
510
+ if not lines:
511
+ continue
512
+
513
+ inv_scale = (1.0 / scale) if scale and scale > 0 else 1.0
514
+
515
+ for line in lines:
516
+ if line is None or not isinstance(line, (list, tuple)) or len(line) < 2:
517
+ continue
518
+ quad = line[0]
519
+ pair = line[1]
520
+ if quad is None or pair is None:
521
+ continue
522
+ if not isinstance(pair, (list, tuple)) or len(pair) < 2:
523
+ continue
524
+ text, conf = pair[0], pair[1]
525
+ conf = float(conf)
526
+ if conf < OCR_MIN_CONF:
527
+ continue
528
+
529
+ if not isinstance(quad, (list, tuple)) or len(quad) == 0:
530
+ continue
531
+ quad_global = []
532
+ for p in quad:
533
+ gx = float(p[0]) * inv_scale + float(x1)
534
+ gy = float(p[1]) * inv_scale + float(y1)
535
+ quad_global.append([gx, gy])
536
+
537
+ xs = [p[0] for p in quad_global]
538
+ ys = [p[1] for p in quad_global]
539
+ gx1, gy1, gx2, gy2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))
540
+ gbox = _clip_box([gx1, gy1, gx2, gy2], W, H)
541
+
542
+ txt = _single_line(text, max_len=220)
543
+ if not txt:
544
+ continue
545
+
546
+ out.append({
547
+ "text": txt,
548
+ "conf": conf,
549
+ "quad": quad_global,
550
+ "box": gbox,
551
+ "from_region_label": r.get("label", ""),
552
+ "from_region_box": box,
553
+ "crop_scale": float(scale),
554
+ })
555
+
556
+ if len(out) >= int(MAX_OCR_LINES):
557
+ break
558
+
559
+ if len(out) >= int(MAX_OCR_LINES):
560
+ break
561
+
562
+ return _sort_reading_order(out)
563
+
564
+
565
+ def attach_zones(regions: List[dict], W: int, H: int) -> Dict[str, List[dict]]:
566
+ zones = {"top": [], "left": [], "center": [], "right": [], "bottom": []}
567
+ for r in regions:
568
+ z = _zone_for_box(r["box"], W, H)
569
+ zones[z].append(r)
570
+ for z in zones:
571
+ zones[z] = _sort_reading_order(zones[z])
572
+ return zones
573
+
574
+
575
+ def guess_title(regions: List[dict], ocr_lines: List[dict]) -> str:
576
+ title_boxes = []
577
+ for r in regions:
578
+ lab = r.get("label", "").lower()
579
+ if ("title" in lab) or (lab == "title") or ("header" in lab and "page" not in lab):
580
+ title_boxes.append(r["box"])
581
+
582
+ def inside(line_box, region_box) -> bool:
583
+ x1, y1, x2, y2 = line_box
584
+ rx1, ry1, rx2, ry2 = region_box
585
+ return (x1 >= rx1 - 3 and y1 >= ry1 - 3 and x2 <= rx2 + 3 and y2 <= ry2 + 3)
586
+
587
+ if title_boxes:
588
+ lines = []
589
+ for ob in ocr_lines:
590
+ for tb in title_boxes:
591
+ if inside(ob["box"], tb):
592
+ lines.append(ob["text"])
593
+ break
594
+ lines = [x for x in lines if x]
595
+ if lines:
596
+ return " ".join(lines[:3]).strip()
597
+
598
+ if ocr_lines:
599
+ return ocr_lines[0]["text"]
600
+ return ""
601
+
602
+
603
+ def attach_ocr_to_regions(regions: List[dict], ocr_lines: List[dict], pad: int = 3) -> List[dict]:
604
+ def inside(line_box, region_box) -> bool:
605
+ x1, y1, x2, y2 = line_box
606
+ rx1, ry1, rx2, ry2 = region_box
607
+ return (x1 >= rx1 - pad and y1 >= ry1 - pad and x2 <= rx2 + pad and y2 <= ry2 + pad)
608
+
609
+ out = []
610
+ for r in regions:
611
+ rb = r.get("box")
612
+ if not rb:
613
+ out.append(r)
614
+ continue
615
+
616
+ texts = []
617
+ lines_in = []
618
+ for ln in ocr_lines:
619
+ lb = ln.get("box")
620
+ if lb and inside(lb, rb):
621
+ t = ln.get("text", "")
622
+ if t:
623
+ texts.append(t)
624
+ lines_in.append(ln)
625
+
626
+ rr = dict(r)
627
+ rr["text_lines"] = texts
628
+ rr["text"] = " ".join(texts).strip()
629
+ rr["ocr_line_count"] = len(lines_in)
630
+ out.append(rr)
631
+
632
+ return out
633
+
634
+
635
+ # ----------------------------
636
+ # CLIP frame type classifier (no LLM)
637
+ # ----------------------------
638
+
639
+ def init_clip_classifier() -> Tuple[Any, Any, Dict[str, Any], str]:
640
+ """
641
+ Builds a robust CLIP classifier with:
642
+ - POS prompt ensembling per class
643
+ - NEG prompt ensembling per class
644
+ - score = mean(sim to POS) - mean(sim to NEG)
645
+ Returns:
646
+ clip_model, preprocess, pack, device
647
+ where pack contains text features and metadata.
648
+ """
649
+ if clip is None or torch is None or Image is None:
650
+ raise RuntimeError(
651
+ "CLIP dependencies missing. Install torch and CLIP "
652
+ "(e.g. pip install torch and pip install git+https://github.com/openai/CLIP.git)."
653
+ )
654
+
655
+ device = "cuda" if torch.cuda.is_available() else "cpu"
656
+ try:
657
+ model, preprocess = clip.load(CLIP_MODEL_NAME, device=device)
658
+ model.eval()
659
+ except Exception as e:
660
+ raise RuntimeError(f"CLIP init failed for model '{CLIP_MODEL_NAME}': {type(e).__name__}: {e}") from e
661
+
662
+ if len(CLIP_CLASS_PROMPTS) != len(CLIP_CLASS_LABELS):
663
+ raise ValueError("CLIP_CLASS_PROMPTS must align with CLIP_CLASS_LABELS (same length).")
664
+
665
+ if "CLIP_CLASS_NEG_PROMPTS" not in globals():
666
+ raise ValueError("CLIP_CLASS_NEG_PROMPTS is missing. Define it (aligned with CLIP_CLASS_LABELS).")
667
+
668
+ if len(CLIP_CLASS_NEG_PROMPTS) != len(CLIP_CLASS_LABELS):
669
+ raise ValueError("CLIP_CLASS_NEG_PROMPTS must align with CLIP_CLASS_LABELS (same length).")
670
+
671
+ flat_pos: List[str] = []
672
+ pos_slices: List[Tuple[int, int]] = []
673
+ idx = 0
674
+ for prompts in CLIP_CLASS_PROMPTS:
675
+ if not isinstance(prompts, list) or len(prompts) == 0:
676
+ raise ValueError("Each entry in CLIP_CLASS_PROMPTS must be a non-empty list[str].")
677
+ s = idx
678
+ for p in prompts:
679
+ if not isinstance(p, str):
680
+ raise ValueError("All POS prompts must be strings.")
681
+ flat_pos.append(p)
682
+ idx += 1
683
+ pos_slices.append((s, idx))
684
+
685
+ flat_neg: List[str] = []
686
+ neg_slices: List[Tuple[int, int]] = []
687
+ idx = 0
688
+ for prompts in CLIP_CLASS_NEG_PROMPTS:
689
+ if not isinstance(prompts, list) or len(prompts) == 0:
690
+ raise ValueError("Each entry in CLIP_CLASS_NEG_PROMPTS must be a non-empty list[str].")
691
+ s = idx
692
+ for p in prompts:
693
+ if not isinstance(p, str):
694
+ raise ValueError("All NEG prompts must be strings.")
695
+ flat_neg.append(p)
696
+ idx += 1
697
+ neg_slices.append((s, idx))
698
+
699
+ with torch.no_grad():
700
+ pos_tokens = clip.tokenize(flat_pos).to(device)
701
+ pos_feats_all = model.encode_text(pos_tokens)
702
+ pos_feats_all = pos_feats_all / pos_feats_all.norm(dim=-1, keepdim=True)
703
+
704
+ neg_tokens = clip.tokenize(flat_neg).to(device)
705
+ neg_feats_all = model.encode_text(neg_tokens)
706
+ neg_feats_all = neg_feats_all / neg_feats_all.norm(dim=-1, keepdim=True)
707
+
708
+ pos_class_feats: List[torch.Tensor] = []
709
+ neg_class_feats: List[torch.Tensor] = []
710
+
711
+ for (s, e) in pos_slices:
712
+ pos_class_feats.append(pos_feats_all[s:e])
713
+ for (s, e) in neg_slices:
714
+ neg_class_feats.append(neg_feats_all[s:e])
715
+
716
+ pack = {
717
+ "labels": CLIP_CLASS_LABELS,
718
+ "pos_class_feats": pos_class_feats,
719
+ "neg_class_feats": neg_class_feats,
720
+ "score_mode": str(CLIP_SCORE_MODE),
721
+ "min_margin": float(CLIP_MIN_MARGIN),
722
+ }
723
+ return model, preprocess, pack, device
724
+
725
+
726
+ def classify_frame_clip(
727
+ *,
728
+ frame_bgr: np.ndarray,
729
+ clip_model: Any,
730
+ clip_preprocess: Any,
731
+ clip_text_features: Any,
732
+ clip_device: str,
733
+ ) -> Tuple[str, Dict[str, float]]:
734
+ pack = clip_text_features
735
+ labels: List[str] = pack["labels"]
736
+ pos_class_feats: List[Any] = pack["pos_class_feats"]
737
+ neg_class_feats: List[Any] = pack["neg_class_feats"]
738
+
739
+ none_margin: float = float(pack.get("none_margin", 0.02))
740
+ weak_thr: float = float(pack.get("weak_thr", 0.00))
741
+ slide_close: float = float(pack.get("slide_close", 0.03))
742
+
743
+ rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
744
+ img = Image.fromarray(rgb)
745
+ image = clip_preprocess(img).unsqueeze(0).to(clip_device)
746
+
747
+ with torch.no_grad():
748
+ img_feat = clip_model.encode_image(image)
749
+ img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
750
+
751
+ scores: List[float] = []
752
+ for i in range(len(labels)):
753
+ pos_feats = pos_class_feats[i].to(clip_device)
754
+ neg_feats = neg_class_feats[i].to(clip_device)
755
+
756
+ pos_sims = (img_feat @ pos_feats.T).squeeze(0)
757
+ neg_sims = (img_feat @ neg_feats.T).squeeze(0)
758
+
759
+ score = float(pos_sims.mean().item() - neg_sims.mean().item())
760
+ scores.append(score)
761
+
762
+ scores_np = np.array(scores, dtype=np.float32)
763
+ score_map: Dict[str, float] = {labels[i]: float(scores_np[i]) for i in range(len(labels))}
764
+
765
+ if "none" not in labels:
766
+ best_idx = int(np.argmax(scores_np))
767
+ pred = labels[best_idx]
768
+ if ("slides" in labels) and (pred != "slides"):
769
+ winner_score = float(score_map[pred])
770
+ slides_score = float(score_map["slides"])
771
+ if (winner_score - slides_score) < float(slide_close):
772
+ pred = "slides"
773
+ score_map["_slide_close"] = float(slide_close)
774
+ return pred, score_map
775
+
776
+ none_idx = int(labels.index("none"))
777
+ none_score = float(scores_np[none_idx])
778
+
779
+ non_none_idxs = [i for i, lab in enumerate(labels) if lab != "none"]
780
+ best_non_none_idx = int(max(non_none_idxs, key=lambda i: float(scores_np[i])))
781
+ best_non_none_label = labels[best_non_none_idx]
782
+ best_non_none_score = float(scores_np[best_non_none_idx])
783
+
784
+ if (none_score >= best_non_none_score + none_margin) or (best_non_none_score < weak_thr):
785
+ pred = "none"
786
+ else:
787
+ pred = best_non_none_label
788
+
789
+ if pred != "none" and ("slides" in labels) and (pred != "slides"):
790
+ slides_score = float(score_map["slides"])
791
+ winner_score = float(score_map[pred])
792
+ if (winner_score - slides_score) < float(slide_close):
793
+ pred = "slides"
794
+
795
+ score_map["_best_non_none_score"] = float(best_non_none_score)
796
+ score_map["_none_score"] = float(none_score)
797
+ score_map["_none_margin"] = float(none_margin)
798
+ score_map["_weak_thr"] = float(weak_thr)
799
+ score_map["_best_non_none_idx"] = float(best_non_none_idx)
800
+ score_map["_none_idx"] = float(none_idx)
801
+ score_map["_slide_close"] = float(slide_close)
802
+ if "slides" in labels:
803
+ score_map["_slides_score"] = float(score_map["slides"])
804
+
805
+ return pred, score_map
806
+
807
+
808
+ # ----------------------------
809
+ # Candidate detection (cheap, local)
810
+ # ----------------------------
811
+
812
+ def find_candidates_diff(
813
+ video_path: Path,
814
+ sample_fps: float,
815
+ resize_w: int,
816
+ candidate_percentile: float,
817
+ max_candidates: int,
818
+ ) -> Tuple[List[CandidateFrame], float]:
819
+ fps, duration, total_frames = _probe_video(video_path)
820
+ if duration <= 0 or total_frames <= 0:
821
+ raise RuntimeError("Could not determine video duration/frames.")
822
+
823
+ cap = cv2.VideoCapture(str(video_path))
824
+ if not cap.isOpened():
825
+ raise RuntimeError(f"Could not open video: {video_path}")
826
+
827
+ sample_fps = float(sample_fps)
828
+ if sample_fps <= 0:
829
+ raise ValueError("sample_fps must be > 0")
830
+
831
+ step_frames = max(1, int(round(fps / sample_fps)))
832
+
833
+ print(f" [step1] video_fps={fps:.3f} duration_sec={duration:.2f} total_frames={total_frames}")
834
+ print(f" [step1] SAMPLE_FPS={sample_fps} -> step_frames={step_frames} (~{1.0/sample_fps:.2f}s per sample)")
835
+ print(f" [step1] RESIZE_W={resize_w} CANDIDATE_PERCENTILE={candidate_percentile} MAX_CANDIDATES={max_candidates}")
836
+
837
+ candidates: List[CandidateFrame] = []
838
+ diffs: List[float] = []
839
+
840
+ prev_gray = None
841
+ sampled = 0
842
+
843
+ max_k = int((total_frames - 1) // step_frames) if total_frames > 0 else 0
844
+
845
+ for k in range(max_k + 1):
846
+ frame_idx = int(k * step_frames)
847
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
848
+ ret, frame = cap.read()
849
+ if not ret or frame is None:
850
+ break
851
+
852
+ sampled += 1
853
+ t_sec = frame_idx / fps
854
+
855
+ gray = _downscale_gray(frame, resize_w=resize_w)
856
+ d = 999.0 if prev_gray is None else _mad_diff(gray, prev_gray)
857
+
858
+ candidates.append(CandidateFrame(t_sec=float(t_sec), frame_idx=int(frame_idx), diff_score=float(d)))
859
+ diffs.append(float(d))
860
+ prev_gray = gray
861
+
862
+ if sampled % 300 == 0:
863
+ print(f" [step1] sampled={sampled} last_t={fmt_hhmmss(t_sec)} last_diff={d:.2f}")
864
+
865
+ cap.release()
866
+
867
+ if not candidates:
868
+ print(" [step1] no candidates produced (empty video?)")
869
+ return [], 0.0
870
+
871
+ diffs_np = np.array(diffs, dtype=np.float32)
872
+ diffs_for_thr = diffs_np[1:] if len(diffs_np) > 1 else diffs_np
873
+ base_thr = float(np.percentile(diffs_for_thr, float(candidate_percentile)))
874
+ base_thr = max(4.0, base_thr)
875
+
876
+ order = np.argsort(diffs_np)[::-1]
877
+ picked = set()
878
+ out: List[CandidateFrame] = []
879
+
880
+ out.append(candidates[0])
881
+ picked.add(0)
882
+
883
+ for idx in order:
884
+ if len(out) >= int(max_candidates):
885
+ break
886
+ ii = int(idx)
887
+ if ii in picked:
888
+ continue
889
+ out.append(candidates[ii])
890
+ picked.add(ii)
891
+
892
+ out.sort(key=lambda x: x.t_sec)
893
+
894
+ print(f" [step1] sampled_frames={sampled} raw_candidates={len(candidates)} selected_candidates={len(out)} base_thr={base_thr:.2f}")
895
+ return out, base_thr
896
+
897
+
898
+ # ----------------------------
899
+ # Keyframe keep rule (visual only)
900
+ # ----------------------------
901
+
902
+ def should_keep_visual_only(
903
+ *,
904
+ frame_type: str,
905
+ t_sec: float,
906
+ diff_to_last_keep: float,
907
+ base_thr: float,
908
+ last_kept_t: float,
909
+ ) -> Tuple[bool, Dict[str, float]]:
910
+ cfg = SENS.get(frame_type, {"min_gap_sec": 1.0, "diff_mult": 1.0})
911
+ diff_mult = float(cfg.get("diff_mult", 1.0))
912
+
913
+ min_gap = float(MIN_KEYFRAME_GAP_SEC)
914
+
915
+ ok_gap = True if last_kept_t <= -1e8 else ((t_sec - last_kept_t) >= min_gap)
916
+ thr_eff = float(base_thr * diff_mult)
917
+ ok_visual = diff_to_last_keep >= thr_eff
918
+
919
+ debug = {
920
+ "diff_to_last_keep": float(diff_to_last_keep),
921
+ "thr_effective": float(thr_eff),
922
+ "ok_gap": 1.0 if ok_gap else 0.0,
923
+ "ok_visual": 1.0 if ok_visual else 0.0,
924
+ "min_gap_sec_used": float(min_gap),
925
+ }
926
+ return (ok_gap and ok_visual), debug
927
+
928
+
929
+ # ----------------------------
930
+ # Concurrent parsing worker (YOLO + OCR) for kept keyframes
931
+ # ----------------------------
932
+
933
+ _WORKER_LAYOUT_MODEL = None
934
+ _WORKER_OCR_MODEL = None
935
+
936
+ def _worker_init(layout_weights: str, ocr_lang: str, enable_yolo: bool = True):
937
+ global _WORKER_LAYOUT_MODEL, _WORKER_OCR_MODEL
938
+ _WORKER_LAYOUT_MODEL = YOLO(layout_weights) if enable_yolo else None
939
+
940
+ # IMPORTANT:
941
+ # - use_angle_cls=False: turn off angle classifier
942
+ # - use_gpu=USE_GPU: attempts GPU (requires paddlepaddle-gpu)
943
+ _WORKER_OCR_MODEL = PaddleOCR(
944
+ use_angle_cls=False,
945
+ lang=ocr_lang,
946
+ use_gpu=USE_GPU,
947
+ show_log=False,
948
+ enable_mkldnn=False,
949
+ ir_optim=False,
950
+ )
951
+
952
+ def _parse_one_keyframe(job: dict) -> dict:
953
+ global _WORKER_LAYOUT_MODEL, _WORKER_OCR_MODEL
954
+ kidx = int(job["keyframe_idx"])
955
+ img_path = job["image_path"]
956
+ frame_type = str(job.get("frame_type", "none"))
957
+ parse_mode = str(job.get("parse_mode", "yolo_ocr"))
958
+
959
+ frame = cv2.imread(str(img_path))
960
+ if frame is None:
961
+ return {"keyframe_idx": kidx, "error": f"Could not read image: {img_path}"}
962
+
963
+ # Resize for slides/demo/none to speed up YOLO+OCR; keep code max
964
+ max_w = int(PARSE_MAX_W_BY_TYPE.get(frame_type, 1280))
965
+ frame_for_parse = _resize_frame_max_w(frame, max_w=max_w) if max_w < 99999 else frame
966
+
967
+ H, W = frame_for_parse.shape[:2]
968
+
969
+ regions: List[dict] = []
970
+ t_yolo_ms = 0.0
971
+ if parse_mode == "yolo_ocr":
972
+ if _WORKER_LAYOUT_MODEL is None:
973
+ return {"keyframe_idx": kidx, "error": "YOLO model is not initialized for yolo_ocr parse mode."}
974
+ t0 = time.perf_counter()
975
+ regions = run_layout_yolo(_WORKER_LAYOUT_MODEL, frame_for_parse)
976
+ t_yolo_ms = (time.perf_counter() - t0) * 1000.0
977
+
978
+ t0 = time.perf_counter()
979
+ if parse_mode == "yolo_ocr":
980
+ ocr_lines = run_paddle_ocr_on_text_regions(
981
+ _WORKER_OCR_MODEL,
982
+ frame_for_parse,
983
+ regions,
984
+ frame_type=frame_type,
985
+ max_regions=OCR_CROP_MAX_REGIONS,
986
+ )
987
+ else:
988
+ # OCR-only mode: no layout detection, run full-frame OCR.
989
+ ocr_lines = run_paddle_ocr(_WORKER_OCR_MODEL, frame_for_parse)
990
+ t_ocr_ms = (time.perf_counter() - t0) * 1000.0
991
+
992
+ t0 = time.perf_counter()
993
+ regions_with_text = attach_ocr_to_regions(regions, ocr_lines) if regions else []
994
+ zones = attach_zones(regions_with_text, W=W, H=H) if regions_with_text else {"top": [], "left": [], "center": [], "right": [], "bottom": []}
995
+ title_guess_val = guess_title(regions_with_text, ocr_lines)
996
+ t_attach_ms = (time.perf_counter() - t0) * 1000.0
997
+
998
+ text_lines = [x["text"] for x in ocr_lines if x.get("text")][:MAX_OCR_LINES]
999
+
1000
+ screen_parse = {
1001
+ "frame_w": int(W),
1002
+ "frame_h": int(H),
1003
+ "layout_regions": regions_with_text,
1004
+ "ocr_lines": ocr_lines,
1005
+ "zones": zones,
1006
+ "title_guess": title_guess_val,
1007
+ "layout_model": str(LAYOUT_YOLO_WEIGHTS),
1008
+ "ocr_lang": str(OCR_LANG),
1009
+ "layout_conf": float(LAYOUT_CONF),
1010
+ "layout_iou": float(LAYOUT_IOU),
1011
+ "ocr_min_conf": float(OCR_MIN_CONF),
1012
+ "parse_input_frame_type": str(frame_type),
1013
+ "yolo_device": str(YOLO_DEVICE),
1014
+ "yolo_imgsz": int(YOLO_IMGSZ),
1015
+ "ocr_use_gpu": bool(USE_GPU),
1016
+ "ocr_angle_cls": False,
1017
+ "ocr_crop_max_regions": int(OCR_CROP_MAX_REGIONS),
1018
+ "ocr_crop_scale_used": float(OCR_CROP_SCALE_BY_TYPE.get(frame_type, 0.80)),
1019
+ "parse_max_w_used": int(max_w),
1020
+ "parse_mode": str(parse_mode),
1021
+ }
1022
+
1023
+ return {
1024
+ "keyframe_idx": kidx,
1025
+ "on_screen_text": text_lines,
1026
+ "screen_parse": screen_parse,
1027
+ "parse_timings_ms": {
1028
+ "full_yolo_ms": float(t_yolo_ms),
1029
+ "full_ocr_ms": float(t_ocr_ms),
1030
+ "attach_text_ms": float(t_attach_ms),
1031
+ }
1032
+ }
1033
+
1034
+
1035
+ # ----------------------------
1036
+ # Main
1037
+ # ----------------------------
1038
+
1039
+ def main():
1040
+ load_dotenv()
1041
+
1042
+ ap = argparse.ArgumentParser()
1043
+ ap.add_argument("--video", required=True, help="Path to meeting.mp4")
1044
+ ap.add_argument("--out", required=True, help="Output folder")
1045
+ ap.add_argument("--force", action="store_true")
1046
+ ap.add_argument(
1047
+ "--no-yolo-for-non-demo",
1048
+ action="store_true",
1049
+ help="Use OCR-only parsing for non-demo frames (slides/code/none).",
1050
+ )
1051
+ args = ap.parse_args()
1052
+
1053
+ if not ENABLE_LOCAL_SCREEN_PARSE:
1054
+ raise RuntimeError("ENABLE_LOCAL_SCREEN_PARSE must be True. YOLO and PaddleOCR are required.")
1055
+
1056
+ if not Path(LAYOUT_YOLO_WEIGHTS).exists():
1057
+ raise FileNotFoundError(f"Layout YOLO weights not found at: {LAYOUT_YOLO_WEIGHTS}")
1058
+
1059
+ try:
1060
+ _ = YOLO(LAYOUT_YOLO_WEIGHTS)
1061
+ except Exception as e:
1062
+ raise RuntimeError(f"YOLO init failed: {type(e).__name__}: {e}") from e
1063
+
1064
+ # NOTE: this tries GPU; if your Paddle is CPU-only, this may error.
1065
+ # In that case install paddlepaddle-gpu, or set USE_GPU=False.
1066
+ try:
1067
+ _ = PaddleOCR(
1068
+ use_angle_cls=False,
1069
+ lang=OCR_LANG,
1070
+ use_gpu=USE_GPU,
1071
+ show_log=False,
1072
+ enable_mkldnn=False,
1073
+ ir_optim=False,
1074
+ )
1075
+ except Exception as e:
1076
+ raise RuntimeError(f"PaddleOCR init failed: {type(e).__name__}: {e}") from e
1077
+
1078
+ try:
1079
+ clip_model, clip_preprocess, clip_text_features, clip_device = init_clip_classifier()
1080
+ except Exception as e:
1081
+ raise RuntimeError(f"CLIP classifier init failed: {type(e).__name__}: {e}") from e
1082
+
1083
+ video_path = Path(args.video).resolve()
1084
+ out_dir = Path(args.out).resolve()
1085
+ out_dir.mkdir(parents=True, exist_ok=True)
1086
+
1087
+ frames_dir = out_dir / "frames_selected"
1088
+ frames_dir.mkdir(parents=True, exist_ok=True)
1089
+
1090
+ enriched_json = out_dir / "keyframes_parsed.json"
1091
+ timing_json = out_dir / "timing_summary.json"
1092
+ classified_dir = out_dir / "classified"
1093
+ classified_dir.mkdir(parents=True, exist_ok=True)
1094
+
1095
+ out_paths = {
1096
+ "slides": classified_dir / "slides_keyframes.json",
1097
+ "code": classified_dir / "code_keyframes.json",
1098
+ "demo": classified_dir / "demo_keyframes.json",
1099
+ "none": classified_dir / "none_keyframes.json",
1100
+ }
1101
+
1102
+ t_total0 = time.perf_counter()
1103
+ timing_totals = {
1104
+ "candidate_detection_ms": 0.0,
1105
+ "candidate_loop_ms": 0.0,
1106
+ "read_frame_ms": 0.0,
1107
+ "gray_diff_ms": 0.0,
1108
+ "clip_ms": 0.0,
1109
+ "keep_logic_ms": 0.0,
1110
+ "save_frame_ms": 0.0,
1111
+ "parse_concurrent_ms": 0.0,
1112
+ "json_write_ms": 0.0,
1113
+ }
1114
+
1115
+ all_selected: List[dict] = []
1116
+ processed_times: set = set()
1117
+
1118
+ last_kept_t = -1e9
1119
+ last_kept_gray: Optional[np.ndarray] = None
1120
+
1121
+ if (not args.force) and enriched_json.exists():
1122
+ try:
1123
+ all_selected = safe_read_json(enriched_json)
1124
+ if isinstance(all_selected, list) and all_selected:
1125
+ processed_times = {round(float(x.get("t_sec", -1.0)), 2) for x in all_selected if "t_sec" in x}
1126
+ last = all_selected[-1]
1127
+ last_kept_t = float(last.get("t_sec", last_kept_t))
1128
+
1129
+ last_img = Path(last.get("image_path", ""))
1130
+ if last_img.exists():
1131
+ img = cv2.imread(str(last_img))
1132
+ if img is not None:
1133
+ last_kept_gray = _downscale_gray(img, RESIZE_W)
1134
+
1135
+ print(f"Resuming: already selected {len(all_selected)} keyframes (last at {fmt_hhmmss(last_kept_t)}).")
1136
+ except Exception:
1137
+ all_selected = []
1138
+ processed_times = set()
1139
+ last_kept_t = -1e9
1140
+ last_kept_gray = None
1141
+
1142
+ if args.force:
1143
+ all_selected = []
1144
+ processed_times = set()
1145
+ last_kept_t = -1e9
1146
+ last_kept_gray = None
1147
+
1148
+ print("1) Finding candidate change points locally (no API)...")
1149
+ print(" [step1] starting... (this can take time on long videos)")
1150
+ t0 = time.perf_counter()
1151
+ candidates, base_thr = find_candidates_diff(
1152
+ video_path=video_path,
1153
+ sample_fps=SAMPLE_FPS,
1154
+ resize_w=RESIZE_W,
1155
+ candidate_percentile=CANDIDATE_PERCENTILE,
1156
+ max_candidates=MAX_CANDIDATES,
1157
+ )
1158
+ t1_ms = (time.perf_counter() - t0) * 1000.0
1159
+ timing_totals["candidate_detection_ms"] += t1_ms
1160
+ print(f" [step1] done in {t1_ms/1000.0:.2f}s")
1161
+
1162
+ print(f"Candidates: {len(candidates)}, base diff threshold ~ {base_thr:.2f}")
1163
+ print("Sensitivity config (edit in code):", SENS)
1164
+ print("Layout model:", LAYOUT_YOLO_WEIGHTS)
1165
+ print("YOLO device:", YOLO_DEVICE, "| imgsz:", YOLO_IMGSZ)
1166
+ print("OCR lang:", OCR_LANG, "| OCR_MIN_CONF:", OCR_MIN_CONF, "| OCR GPU:", USE_GPU, "| angle_cls:", False)
1167
+ print("CLIP model:", CLIP_MODEL_NAME, "| device:", clip_device)
1168
+ print("Parse workers:", PARSE_WORKERS)
1169
+ print(f"Global min gap override (seconds since last keyframe): {MIN_KEYFRAME_GAP_SEC:.2f}s")
1170
+
1171
+ kept_count = len(all_selected)
1172
+ reader = VideoReader(video_path)
1173
+
1174
+ try:
1175
+ print("2) Selecting keyframes (VISUAL ONLY: time gap + diff; no OCR in loop)...")
1176
+ t_loop0 = time.perf_counter()
1177
+
1178
+ for i, cand in enumerate(candidates, start=1):
1179
+ if kept_count >= int(MAX_FRAMES):
1180
+ break
1181
+
1182
+ t_key = round(float(cand.t_sec), 2)
1183
+ if t_key in processed_times:
1184
+ continue
1185
+ if cand.t_sec <= (last_kept_t + 1e-6) and last_kept_t > -1e8:
1186
+ continue
1187
+
1188
+ gap = float(cand.t_sec - last_kept_t) if last_kept_t > -1e8 else 9999.0
1189
+
1190
+ if last_kept_t > -1e8 and gap < float(MIN_KEYFRAME_GAP_SEC):
1191
+ continue
1192
+
1193
+ t0 = time.perf_counter()
1194
+ frame = reader.read_at_frame(cand.frame_idx)
1195
+ timing_totals["read_frame_ms"] += (time.perf_counter() - t0) * 1000.0
1196
+ if frame is None:
1197
+ continue
1198
+
1199
+ t0 = time.perf_counter()
1200
+ gray_now = _downscale_gray(frame, RESIZE_W)
1201
+ diff_to_last_keep = 999.0 if last_kept_gray is None else _mad_diff(gray_now, last_kept_gray)
1202
+ timing_totals["gray_diff_ms"] += (time.perf_counter() - t0) * 1000.0
1203
+
1204
+ print(
1205
+ f"[{i}/{len(candidates)}] t={fmt_hhmmss(cand.t_sec)} "
1206
+ f"gap_since_last_keep={gap:.2f}s cand_diff={cand.diff_score:.2f} keep_diff={diff_to_last_keep:.2f} ..."
1207
+ )
1208
+
1209
+ frame_fast = _resize_frame_max_w(frame, FAST_FRAME_MAX_W)
1210
+
1211
+ t0 = time.perf_counter()
1212
+ frame_type, clip_probs = classify_frame_clip(
1213
+ frame_bgr=frame_fast,
1214
+ clip_model=clip_model,
1215
+ clip_preprocess=clip_preprocess,
1216
+ clip_text_features=clip_text_features,
1217
+ clip_device=clip_device,
1218
+ )
1219
+ t_clip_ms = (time.perf_counter() - t0) * 1000.0
1220
+ timing_totals["clip_ms"] += t_clip_ms
1221
+
1222
+ t0 = time.perf_counter()
1223
+ keep, dbg = should_keep_visual_only(
1224
+ frame_type=frame_type,
1225
+ t_sec=float(cand.t_sec),
1226
+ diff_to_last_keep=float(diff_to_last_keep),
1227
+ base_thr=float(base_thr),
1228
+ last_kept_t=float(last_kept_t),
1229
+ )
1230
+ t_keep_ms = (time.perf_counter() - t0) * 1000.0
1231
+ timing_totals["keep_logic_ms"] += t_keep_ms
1232
+
1233
+ print(
1234
+ f" timings: clip={t_clip_ms:.0f}ms keep_logic={t_keep_ms:.0f}ms "
1235
+ f"| type={frame_type} keep={keep} | diff={diff_to_last_keep:.2f} thr_eff={dbg['thr_effective']:.2f} "
1236
+ f"| min_gap_used={dbg.get('min_gap_sec_used', MIN_KEYFRAME_GAP_SEC):.2f}s"
1237
+ )
1238
+
1239
+ if not keep:
1240
+ if BASE_SLEEP_SEC > 0:
1241
+ time.sleep(BASE_SLEEP_SEC)
1242
+ continue
1243
+
1244
+ t0 = time.perf_counter()
1245
+ out_img = frames_dir / f"frame_{kept_count:04d}_{cand.t_sec:.2f}s_{frame_type}.jpg"
1246
+ cv2.imwrite(str(out_img), frame)
1247
+ t_save_ms = (time.perf_counter() - t0) * 1000.0
1248
+ timing_totals["save_frame_ms"] += t_save_ms
1249
+
1250
+ item = {
1251
+ "keyframe_idx": int(kept_count),
1252
+ "t_sec": float(cand.t_sec),
1253
+ "timestamp": fmt_hhmmss(cand.t_sec),
1254
+ "image_path": str(out_img),
1255
+
1256
+ "frame_type": frame_type,
1257
+ "on_screen_text": [],
1258
+ "screen_parse": None,
1259
+
1260
+ "candidate_diff_score": float(cand.diff_score),
1261
+ "diff_to_last_keep": float(diff_to_last_keep),
1262
+ "base_diff_threshold": float(base_thr),
1263
+ "thr_effective": float(dbg.get("thr_effective", 0.0)),
1264
+ "gap_since_last_keep_sec": float(gap),
1265
+
1266
+ "clip_probs": {k: float(v) for k, v in clip_probs.items()},
1267
+ "clip_prompt_map": dict(zip(CLIP_CLASS_LABELS, CLIP_CLASS_PROMPTS)),
1268
+ "clip_model_name": str(CLIP_MODEL_NAME),
1269
+
1270
+ "timings_ms": {
1271
+ "clip_ms": float(t_clip_ms),
1272
+ "keep_logic_ms": float(t_keep_ms),
1273
+ "save_frame_ms": float(t_save_ms),
1274
+ },
1275
+ }
1276
+
1277
+ all_selected.append(item)
1278
+ processed_times.add(t_key)
1279
+ kept_count += 1
1280
+
1281
+ last_kept_t = float(cand.t_sec)
1282
+ last_kept_gray = gray_now
1283
+
1284
+ t0 = time.perf_counter()
1285
+ safe_write_json(enriched_json, all_selected)
1286
+ timing_totals["json_write_ms"] += (time.perf_counter() - t0) * 1000.0
1287
+
1288
+ if BASE_SLEEP_SEC > 0:
1289
+ time.sleep(BASE_SLEEP_SEC)
1290
+
1291
+ timing_totals["candidate_loop_ms"] += (time.perf_counter() - t_loop0) * 1000.0
1292
+
1293
+ finally:
1294
+ reader.close()
1295
+
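
`_downscale_gray` and `_mad_diff` are defined earlier in this file. As a reading aid, here is a behavior-level sketch of what the diff gate plausibly computes, assuming a mean absolute difference over downscaled grayscale frames; it is not the exact implementation:

```python
# Sketch of the assumed frame-diff gate (names mirror the helpers above).
import cv2
import numpy as np

def downscale_gray(frame_bgr: np.ndarray, resize_w: int) -> np.ndarray:
    h, w = frame_bgr.shape[:2]
    new_h = max(1, int(h * (resize_w / float(w))))
    small = cv2.resize(frame_bgr, (resize_w, new_h))
    return cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)

def mad_diff(a: np.ndarray, b: np.ndarray) -> float:
    # Mean absolute pixel difference in [0, 255]; larger = more visual change.
    return float(np.mean(cv2.absdiff(a, b)))
```
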
1296
+ # Phase 3: YOLO + OCR concurrently on keyframes that need parsing
1297
+ to_parse = []
1298
+ for it in all_selected:
1299
+ if (not args.force) and isinstance(it.get("screen_parse"), dict) and it.get("on_screen_text"):
1300
+ continue
1301
+ if it.get("image_path"):
1302
+ frame_type = str(it.get("frame_type", "none"))
1303
+ parse_mode = "yolo_ocr"
1304
+ if args.no_yolo_for_non_demo and frame_type != "demo":
1305
+ parse_mode = "ocr_only"
1306
+ to_parse.append({
1307
+ "keyframe_idx": int(it["keyframe_idx"]),
1308
+ "t_sec": float(it["t_sec"]),
1309
+ "frame_type": frame_type,
1310
+ "image_path": str(it["image_path"]),
1311
+ "parse_mode": parse_mode,
1312
+ })
1313
+
1314
+ print(f"3) Parsing kept keyframes with YOLO+OCR concurrently... to_parse={len(to_parse)}")
1315
+
1316
+ if to_parse:
1317
+ yolo_jobs = sum(1 for j in to_parse if j.get("parse_mode") == "yolo_ocr")
1318
+ ocr_only_jobs = len(to_parse) - yolo_jobs
1319
+ enable_yolo = yolo_jobs > 0
1320
+
1321
+ print(" [step3] starting ProcessPoolExecutor...")
1322
+ print(f" [step3] PARSE_WORKERS={PARSE_WORKERS} (each worker loads YOLO + PaddleOCR once)")
1323
+ print(f" [step3] YOLO_DEVICE={YOLO_DEVICE} YOLO_IMGSZ={YOLO_IMGSZ} | OCR_GPU={USE_GPU} angle_cls=False")
1324
+ print(f" [step3] OCR crops: max_regions={OCR_CROP_MAX_REGIONS} scale_by_type={OCR_CROP_SCALE_BY_TYPE}")
1325
+ print(f" [step3] Parse resize max_w_by_type={PARSE_MAX_W_BY_TYPE}")
1326
+ print(f" [step3] parse_mode split: yolo_ocr={yolo_jobs}, ocr_only={ocr_only_jobs}")
1327
+
1328
+ t0 = time.perf_counter()
1329
+
1330
+ with cf.ProcessPoolExecutor(
1331
+ max_workers=max(1, PARSE_WORKERS),
1332
+ initializer=_worker_init,
1333
+ initargs=(str(LAYOUT_YOLO_WEIGHTS), str(OCR_LANG), bool(enable_yolo)),
1334
+ ) as ex:
1335
+ fut_to_job = {ex.submit(_parse_one_keyframe, job): job for job in to_parse}
1336
+
1337
+ done_count = 0
1338
+ err_count = 0
1339
+ t_last_report = time.perf_counter()
1340
+
1341
+ for fut in cf.as_completed(fut_to_job):
1342
+ job = fut_to_job[fut]
1343
+ job_kidx = int(job.get("keyframe_idx", -1))
1344
+ done_count += 1
1345
+
1346
+ try:
1347
+ res = fut.result()
1348
+ except Exception as e:
1349
+ err_count += 1
1350
+ if 0 <= job_kidx < len(all_selected):
1351
+ all_selected[job_kidx]["screen_parse_error"] = f"worker_exception: {type(e).__name__}: {e}"
1352
+ now = time.perf_counter()
1353
+ if (now - t_last_report) >= 1.0 or done_count == len(fut_to_job):
1354
+ print(f" [step3] progress {done_count}/{len(fut_to_job)} parsed (errors={err_count})")
1355
+ t_last_report = now
1356
+ continue
1357
+
1358
+ kidx = int(res.get("keyframe_idx", job_kidx))
1359
+ if kidx < 0 or kidx >= len(all_selected):
1360
+ now = time.perf_counter()
1361
+ if (now - t_last_report) >= 1.0 or done_count == len(fut_to_job):
1362
+ print(f" [step3] progress {done_count}/{len(fut_to_job)} parsed (errors={err_count})")
1363
+ t_last_report = now
1364
+ continue
1365
+
1366
+ # Track explicit worker-level error payloads.
1367
+ if "error" in res:
1368
+ err_count += 1
1369
+ all_selected[kidx]["screen_parse_error"] = res["error"]
1370
+ else:
1373
+ all_selected[kidx]["on_screen_text"] = res.get("on_screen_text", [])[:MAX_OCR_LINES]
1374
+ all_selected[kidx]["screen_parse"] = res.get("screen_parse")
1375
+ tm = all_selected[kidx].get("timings_ms", {}) or {}
1376
+ tm.update(res.get("parse_timings_ms", {}) or {})
1377
+ all_selected[kidx]["timings_ms"] = tm
1378
+
1379
+ now = time.perf_counter()
1380
+ if (now - t_last_report) >= 1.0 or done_count == len(fut_to_job):
1381
+ print(f" [step3] progress {done_count}/{len(fut_to_job)} parsed (errors={err_count})")
1382
+ t_last_report = now
1383
+
1384
+ t3_ms = (time.perf_counter() - t0) * 1000.0
1385
+ timing_totals["parse_concurrent_ms"] += t3_ms
1386
+ print(f" [step3] done in {t3_ms/1000.0:.2f}s (errors={err_count})")
1387
+
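
`_worker_init` is defined earlier in the file; per the log line above, each pool worker loads YOLO and PaddleOCR once at startup. A sketch of that initializer pattern (the globals and exact body are assumptions):

```python
# Sketch of the per-process initializer used with ProcessPoolExecutor:
# heavy models load once per worker and are reused across submitted jobs.
_WORKER_YOLO = None
_WORKER_OCR = None

def worker_init(weights_path: str, ocr_lang: str, enable_yolo: bool) -> None:
    global _WORKER_YOLO, _WORKER_OCR
    if enable_yolo:
        from ultralytics import YOLO
        _WORKER_YOLO = YOLO(weights_path)
    from paddleocr import PaddleOCR
    _WORKER_OCR = PaddleOCR(use_angle_cls=False, lang=ocr_lang, show_log=False)
```
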
1388
+ # Rebuild buckets from final frame_type
1389
+ buckets: Dict[str, List[dict]] = {k: [] for k in out_paths.keys()}
1390
+ for it in all_selected:
1391
+ ft = it.get("frame_type", "none")
1392
+ if ft not in buckets:
1393
+ ft = "none"
1394
+ it["frame_type"] = "none"
1395
+ buckets[ft].append(it)
1396
+
1397
+ # Final writes
1398
+ t0 = time.perf_counter()
1399
+ safe_write_json(enriched_json, all_selected)
1400
+ for ft, p in out_paths.items():
1401
+ safe_write_json(p, buckets[ft])
1402
+ timing_totals["json_write_ms"] += (time.perf_counter() - t0) * 1000.0
1403
+
1404
+ total_ms = (time.perf_counter() - t_total0) * 1000.0
1405
+
1406
+ timing_summary = {
1407
+ "timing_totals_ms": {k: float(v) for k, v in timing_totals.items()},
1408
+ "total_ms": float(total_ms),
1409
+ "candidates": int(len(candidates)),
1410
+ "selected_frames": int(len(all_selected)),
1411
+ "parsed_frames": int(sum(1 for x in all_selected if isinstance(x.get("screen_parse"), dict))),
1412
+ "parse_workers": int(PARSE_WORKERS),
1413
+ "min_keyframe_gap_sec": float(MIN_KEYFRAME_GAP_SEC),
1414
+ "yolo_device": str(YOLO_DEVICE),
1415
+ "yolo_imgsz": int(YOLO_IMGSZ),
1416
+ "ocr_use_gpu": bool(USE_GPU),
1417
+ "ocr_angle_cls": False,
1418
+ "ocr_crop_max_regions": int(OCR_CROP_MAX_REGIONS),
1419
+ "ocr_crop_scale_by_type": dict(OCR_CROP_SCALE_BY_TYPE),
1420
+ "parse_max_w_by_type": dict(PARSE_MAX_W_BY_TYPE),
1421
+ }
1422
+ safe_write_json(timing_json, timing_summary)
1423
+
1424
+ print("\nDone.")
1425
+ print("Selected frames:", len(all_selected))
1426
+ print("Frames folder:", frames_dir)
1427
+ print("Parsed JSON:", enriched_json)
1428
+ print("Timing JSON:", timing_json)
1429
+ for ft, p in out_paths.items():
1430
+ print(ft, "->", p)
1431
+
1432
+ print("\nTiming summary (ms):")
1433
+ for k, v in timing_totals.items():
1434
+ print(f" {k}: {v:.0f}")
1435
+ print(f" total_ms: {total_ms:.0f}")
1436
+
1437
+
1438
+ if __name__ == "__main__":
1439
+ try:
1440
+ main()
1441
+ except Exception as e:
1442
+ print(f"[ERROR] {type(e).__name__}: {e}")
1443
+ raise
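
For debugging, the stage can be invoked standalone with the flags registered in `main()`; a sketch, with the script filename under `pipelines/` assumed:

```python
# Hypothetical standalone invocation; run_manager normally drives this stage
# through the pipeline wrapper scripts.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "-u", "pipelines/keyframes_parse.py",  # filename assumed
        "--video", "meeting.mp4",
        "--out", "out_run",
        "--no-yolo-for-non-demo",
    ],
    check=True,
)
```
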
requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ gradio>=5.0.0
2
+ fastapi==0.116.1
3
+ uvicorn==0.34.3
4
+ python-multipart==0.0.20
5
+ setuptools==70.0.0
6
+ wheel==0.45.1
7
+ python-dotenv==1.2.1
8
+ deepgram-sdk==4.8.0
9
+ httpx==0.28.1
10
+ google-genai==1.60.0
11
+ pydantic==2.12.5
12
+ opencv-python-headless==4.11.0.86
13
+ numpy==1.26.4
14
+ ultralytics==8.4.12
15
+ paddleocr==2.7.3
16
+ paddlepaddle==2.6.2
17
+ torch==2.5.1
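
A quick way to confirm the pinned stack resolved in a fresh environment (a hedged sanity check; not every package exposes `__version__`):

```python
# Import sanity check for the pinned dependencies above.
import importlib

for mod in ("gradio", "fastapi", "cv2", "numpy", "ultralytics", "paddleocr", "torch"):
    m = importlib.import_module(mod)
    print(mod, getattr(m, "__version__", "(no __version__)"))
```
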
run_manager.py ADDED
@@ -0,0 +1,581 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ import threading
11
+ import time
12
+ import uuid
13
+ from html import unescape
14
+ from pathlib import Path
15
+ from typing import Any, Dict, Optional
16
+ from urllib.parse import parse_qs, urljoin, urlparse
17
+
18
+ import httpx
19
+
20
+
21
+ BASE_DIR = Path(__file__).resolve().parent
22
+ PIPELINES_DIR = BASE_DIR / "pipelines"
23
+ DEFAULT_WORKDIR = Path(os.getenv("PIPELINE_WORKDIR", tempfile.gettempdir())) / "deployed-meet-runs"
24
+ DEFAULT_WORKDIR.mkdir(parents=True, exist_ok=True)
25
+ RUNS_DIR = DEFAULT_WORKDIR / "runs"
26
+ RUNS_DIR.mkdir(parents=True, exist_ok=True)
27
+
28
+
29
+ def _tail(text: str, max_lines: int = 220) -> str:
30
+ lines = (text or "").splitlines()
31
+ if len(lines) <= max_lines:
32
+ return "\n".join(lines)
33
+ return "\n".join(lines[-max_lines:])
34
+
35
+
36
+ def _run_dir(run_id: str) -> Path:
37
+ return RUNS_DIR / run_id
38
+
39
+
40
+ def _meta_path(run_id: str) -> Path:
41
+ return _run_dir(run_id) / "run_meta.json"
42
+
43
+
44
+ def _logs_path(run_id: str) -> Path:
45
+ return _run_dir(run_id) / "pipeline.log"
46
+
47
+
48
+ def _write_json(path: Path, data: Dict[str, Any]) -> None:
49
+ path.parent.mkdir(parents=True, exist_ok=True)
50
+ tmp = path.with_suffix(path.suffix + ".tmp")
51
+ tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
52
+ tmp.replace(path)
53
+
54
+
55
+ def _read_json(path: Path) -> Dict[str, Any]:
56
+ return json.loads(path.read_text(encoding="utf-8"))
57
+
58
+
59
+ def _extract_gdrive_file_id(url: str) -> Optional[str]:
60
+ parsed = urlparse(url)
61
+ host = (parsed.netloc or "").lower()
62
+ if "drive.google.com" not in host:
63
+ return None
64
+
65
+ m = re.search(r"/file/d/([a-zA-Z0-9_-]+)", parsed.path or "")
66
+ if m:
67
+ return m.group(1)
68
+
69
+ qs = parse_qs(parsed.query or "")
70
+ if "id" in qs and qs["id"]:
71
+ return qs["id"][0]
72
+
73
+ return None
74
+
75
+
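
The two Google Drive URL shapes this parser handles, with an illustrative file id:

```python
from run_manager import _extract_gdrive_file_id

# Share-link form: /file/d/<id>/...
assert _extract_gdrive_file_id(
    "https://drive.google.com/file/d/abc123_-XYZ/view?usp=sharing"
) == "abc123_-XYZ"
# Direct-download form: ?id=<id>
assert _extract_gdrive_file_id(
    "https://drive.google.com/uc?export=download&id=abc123_-XYZ"
) == "abc123_-XYZ"
# Non-Drive URLs fall through to the generic downloader.
assert _extract_gdrive_file_id("https://example.com/talk.mp4") is None
```
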
76
+ def _download_google_drive(url: str, out_path: Path) -> None:
77
+ file_id = _extract_gdrive_file_id(url)
78
+ if not file_id:
79
+ raise ValueError("Could not parse Google Drive file id from video_url.")
80
+
81
+ direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
82
+
83
+ def _is_html_response(resp: httpx.Response) -> bool:
84
+ ctype = (resp.headers.get("content-type") or "").lower()
85
+ if "html" in ctype or "text/plain" in ctype:
86
+ return True
87
+ head = (resp.content[:256] or b"").lower()
88
+ return b"<html" in head or b"<!doctype html" in head
89
+
90
+ def _write_if_file(resp: httpx.Response) -> bool:
91
+ if _is_html_response(resp):
92
+ return False
93
+ if not resp.content or len(resp.content) < 1024:
94
+ return False
95
+ out_path.write_bytes(resp.content)
96
+ return True
97
+
98
+ with httpx.Client(timeout=120.0, follow_redirects=True) as client:
99
+ candidates = [
100
+ direct_url,
101
+ f"https://drive.usercontent.google.com/download?id={file_id}&export=download&confirm=t",
102
+ ]
103
+ for c in candidates:
104
+ rr = client.get(c)
105
+ rr.raise_for_status()
106
+ if _write_if_file(rr):
107
+ return
108
+
109
+ page = client.get(f"https://drive.google.com/file/d/{file_id}/view")
110
+ page.raise_for_status()
111
+ html = page.text or ""
112
+
113
+ form_action_match = re.search(r'id="download-form"[^>]*action="([^"]+)"', html)
114
+ if form_action_match:
115
+ action = unescape(form_action_match.group(1))
116
+ action_url = urljoin("https://drive.google.com", action)
117
+ params = {k: v for k, v in re.findall(r'<input[^>]+name="([^"]+)"[^>]+value="([^"]*)"', html)}
118
+ form_resp = client.get(action_url, params=params)
119
+ form_resp.raise_for_status()
120
+ if _write_if_file(form_resp):
121
+ return
122
+
123
+ link_match = re.search(r'href="(/uc\?export=download[^"]+)"', html)
124
+ if link_match:
125
+ href = unescape(link_match.group(1)).replace("&amp;", "&")
126
+ link_url = urljoin("https://drive.google.com", href)
127
+ link_resp = client.get(link_url)
128
+ link_resp.raise_for_status()
129
+ if _write_if_file(link_resp):
130
+ return
131
+
132
+ cookie_confirm = None
133
+ for k, v in page.cookies.items():
134
+ if str(k).startswith("download_warning"):
135
+ cookie_confirm = v
136
+ break
137
+ if cookie_confirm:
138
+ confirm_url = f"https://drive.google.com/uc?export=download&confirm={cookie_confirm}&id={file_id}"
139
+ confirm_resp = client.get(confirm_url)
140
+ confirm_resp.raise_for_status()
141
+ if _write_if_file(confirm_resp):
142
+ return
143
+
144
+ msg = "Google Drive link did not provide a downloadable file."
145
+ low = html.lower()
146
+ if "you need access" in low or "request access" in low:
147
+ msg += " File is not publicly accessible."
148
+ elif "quota exceeded" in low or "too many users have viewed or downloaded" in low:
149
+ msg += " File appears to be quota-limited by Google Drive."
150
+ else:
151
+ msg += " Use a publicly accessible direct file link or local video file upload."
152
+ raise ValueError(msg)
153
+
154
+
155
+ def _validate_video_file(path: Path) -> None:
156
+ if not path.exists() or not path.is_file():
157
+ raise ValueError(f"Input video file not found: {path}")
158
+
159
+ size = path.stat().st_size
160
+ if size < 1024:
161
+ raise ValueError(f"Input file is too small to be valid media: {path} ({size} bytes)")
162
+
163
+ try:
164
+ # Read only the first 4 KiB; read_bytes() would load the whole video into memory.
+ with path.open("rb") as fh:
+ head = fh.read(4096).lower()
165
+ if b"<html" in head or b"<!doctype html" in head or b"{\"error\"" in head:
166
+ raise ValueError(
167
+ "Downloaded input is not a media file (looks like HTML/JSON response). "
168
+ "Use a direct video URL or upload a file."
169
+ )
170
+ except ValueError:
171
+ raise
172
+ except Exception:
173
+ pass
174
+
175
+ try:
176
+ import cv2
177
+
178
+ cap = cv2.VideoCapture(str(path))
179
+ ok = cap.isOpened()
180
+ frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
181
+ cap.release()
182
+ if (not ok) or frame_count <= 0:
183
+ raise ValueError(
184
+ "Input file is not a decodable video for this runtime. "
185
+ "Provide a valid MP4 (H.264/AAC recommended)."
186
+ )
187
+ except ValueError:
188
+ raise
189
+ except Exception:
190
+ pass
191
+
192
+
193
+ def _resolve_python_executable(python_bin: Optional[str]) -> str:
194
+ if python_bin:
195
+ p = Path(python_bin).expanduser()
196
+ if not p.exists():
197
+ raise ValueError(f"python_bin does not exist: {p}")
198
+ return str(p.resolve())
199
+
200
+ candidates = [
201
+ BASE_DIR.parent / ".venv" / "Scripts" / "python.exe",
202
+ BASE_DIR / ".venv" / "Scripts" / "python.exe",
203
+ BASE_DIR.parent / ".venv" / "bin" / "python",
204
+ BASE_DIR / ".venv" / "bin" / "python",
205
+ ]
206
+ for c in candidates:
207
+ if c.exists():
208
+ return str(c.resolve())
209
+
210
+ return sys.executable or os.getenv("PYTHON_BIN") or "python"
211
+
212
+
213
+ def _resolve_out_dir(out_dir: Optional[str], run_id: str) -> Path:
214
+ if out_dir:
215
+ p = Path(out_dir)
216
+ if not p.is_absolute():
217
+ p = DEFAULT_WORKDIR / p
218
+ else:
219
+ p = DEFAULT_WORKDIR / f"run_{run_id}"
220
+ p.mkdir(parents=True, exist_ok=True)
221
+ return p.resolve()
222
+
223
+
224
+ def _build_common_args(
225
+ *,
226
+ video_path: Path,
227
+ out_dir: Path,
228
+ deepgram_model: str,
229
+ deepgram_language: Optional[str],
230
+ deepgram_request_timeout_sec: float,
231
+ deepgram_connect_timeout_sec: float,
232
+ deepgram_retries: int,
233
+ deepgram_retry_backoff_sec: float,
234
+ force_deepgram: bool,
235
+ force_keyframes: bool,
236
+ pre_roll_sec: float,
237
+ gemini_model: str,
238
+ similarity_threshold: float,
239
+ temperature: float,
240
+ ) -> list[str]:
241
+ args = [
242
+ "--video",
243
+ str(video_path),
244
+ "--out",
245
+ str(out_dir),
246
+ "--deepgram-model",
247
+ deepgram_model,
248
+ "--deepgram-request-timeout-sec",
249
+ str(deepgram_request_timeout_sec),
250
+ "--deepgram-connect-timeout-sec",
251
+ str(deepgram_connect_timeout_sec),
252
+ "--deepgram-retries",
253
+ str(deepgram_retries),
254
+ "--deepgram-retry-backoff-sec",
255
+ str(deepgram_retry_backoff_sec),
256
+ "--pre-roll-sec",
257
+ str(pre_roll_sec),
258
+ "--gemini-model",
259
+ gemini_model,
260
+ "--similarity-threshold",
261
+ str(similarity_threshold),
262
+ "--temperature",
263
+ str(temperature),
264
+ ]
265
+ if deepgram_language:
266
+ args.extend(["--deepgram-language", deepgram_language])
267
+ if force_deepgram:
268
+ args.append("--force-deepgram")
269
+ if force_keyframes:
270
+ args.append("--force-keyframes")
271
+ return args
272
+
273
+
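
For reference, a sketch of the flag list this assembles for one hypothetical configuration (all values below are examples, not defaults):

```python
from pathlib import Path
from run_manager import _build_common_args

args = _build_common_args(
    video_path=Path("/data/run/input.mp4"),
    out_dir=Path("/data/run/out"),
    deepgram_model="nova-2",              # example value
    deepgram_language="en",
    deepgram_request_timeout_sec=600.0,
    deepgram_connect_timeout_sec=30.0,
    deepgram_retries=2,
    deepgram_retry_backoff_sec=5.0,
    force_deepgram=False,
    force_keyframes=True,
    pre_roll_sec=2.0,
    gemini_model="gemini-2.0-flash",      # example value
    similarity_threshold=0.55,
    temperature=0.2,
)
# args -> ["--video", "/data/run/input.mp4", "--out", "/data/run/out", ...,
#          "--deepgram-language", "en", "--force-keyframes"]
```
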
274
+ def _build_output_files(out_dir: Path, variant: str) -> Dict[str, str]:
275
+ return {
276
+ "utterances": str(out_dir / "utterances.json"),
277
+ "keyframes_parsed": str(out_dir / "keyframes_parsed.json"),
278
+ "keyframes_with_utterances": str(out_dir / "keyframes_with_utterances.json"),
279
+ "final_output": str(
280
+ out_dir / ("final_output.json" if variant == "full" else "final_output_demo_code.json")
281
+ ),
282
+ "final_output_condensed": str(
283
+ out_dir / ("final_output_condensed.json" if variant == "full" else "final_output_demo_code_condensed.json")
284
+ ),
285
+ }
286
+
287
+
288
+ def _artifact_state(output_files: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
289
+ state: Dict[str, Dict[str, Any]] = {}
290
+ for key, p in output_files.items():
291
+ path = Path(p)
292
+ if path.exists():
293
+ try:
294
+ st = path.stat()
295
+ state[key] = {
296
+ "size_bytes": int(st.st_size),
297
+ "mtime": float(st.st_mtime),
298
+ }
299
+ except Exception:
300
+ state[key] = {"size_bytes": -1, "mtime": -1.0}
301
+ return state
302
+
303
+
304
+ def _format_artifact_compact(state: Dict[str, Dict[str, Any]]) -> str:
305
+ if not state:
306
+ return "none"
307
+ parts = []
308
+ for k in sorted(state.keys()):
309
+ sz = float(state[k].get("size_bytes", 0))
310
+ parts.append(f"{k}:{sz/1024.0:.1f}KB")
311
+ return ", ".join(parts)
312
+
313
+
314
+ def _watch_run(run_id: str, proc: subprocess.Popen, started_at: float, log_fh, heartbeat_sec: float) -> None:
315
+ heartbeat_sec = max(2.0, float(heartbeat_sec))
316
+ last_hb = 0.0
317
+ last_artifact_change = started_at
318
+ last_state: Dict[str, Dict[str, Any]] = {}
319
+
320
+ while True:
321
+ now = time.time()
322
+ rc = proc.poll()
323
+
324
+ if (now - last_hb) >= heartbeat_sec:
325
+ try:
326
+ meta_file = _meta_path(run_id)
327
+ meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
328
+ out_files = meta.get("output_files", {}) or {}
329
+ cur_state = _artifact_state(out_files)
330
+ changed = cur_state != last_state
331
+ if changed:
332
+ last_artifact_change = now
333
+ unchanged_for = now - last_artifact_change
334
+ elapsed = now - started_at
335
+
336
+ log_fh.write(
337
+ "[runner] heartbeat "
338
+ f"elapsed={elapsed:.1f}s pid={proc.pid} "
339
+ f"artifacts={len(cur_state)}/{len(out_files)} "
340
+ f"changed={'yes' if changed else 'no'} "
341
+ f"unchanged_for={unchanged_for:.1f}s "
342
+ f"[{_format_artifact_compact(cur_state)}]\n"
343
+ )
344
+ log_fh.flush()
345
+
346
+ meta["last_heartbeat_epoch"] = now
347
+ meta["last_heartbeat_elapsed_sec"] = round(elapsed, 3)
348
+ meta["artifacts_ready_count"] = len(cur_state)
349
+ meta["artifacts_total_count"] = len(out_files)
350
+ meta["artifacts_unchanged_for_sec"] = round(unchanged_for, 3)
351
+ _write_json(meta_file, meta)
352
+ last_state = cur_state
353
+ except Exception as e:
354
+ try:
355
+ log_fh.write(f"[runner] heartbeat_error: {type(e).__name__}: {e}\n")
356
+ log_fh.flush()
357
+ except Exception:
358
+ pass
359
+ last_hb = now
360
+
361
+ if rc is not None:
362
+ return_code = int(rc)
363
+ break
364
+
365
+ time.sleep(1.0)
366
+
367
+ finished_at = time.time()
368
+ try:
369
+ meta_file = _meta_path(run_id)
370
+ meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
371
+ meta["status"] = "succeeded" if return_code == 0 else "failed"
372
+ meta["exit_code"] = int(return_code)
373
+ meta["finished_at_epoch"] = finished_at
374
+ meta["duration_sec"] = round(finished_at - started_at, 3)
375
+ _write_json(meta_file, meta)
376
+ except Exception as e:
377
+ try:
378
+ log_fh.write(f"\n[runner] failed to update metadata: {type(e).__name__}: {e}\n")
379
+ log_fh.flush()
380
+ except Exception:
381
+ pass
382
+
383
+ try:
384
+ log_fh.write(f"\n[runner] process finished with exit_code={return_code}\n")
385
+ log_fh.flush()
386
+ except Exception:
387
+ pass
388
+ finally:
389
+ try:
390
+ log_fh.close()
391
+ except Exception:
392
+ pass
393
+
394
+
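
One way a caller might use the heartbeat fields `_watch_run` maintains in `run_meta.json` to detect a stalled run (the ten-minute threshold is an assumption, not a pipeline default):

```python
# Caller-side stall check built on the heartbeat metadata written above.
import json
from pathlib import Path

def looks_stalled(meta_path: Path, max_quiet_sec: float = 600.0) -> bool:
    """True if a running pipeline's artifacts have not changed recently."""
    meta = json.loads(meta_path.read_text(encoding="utf-8"))
    if meta.get("status") != "running":
        return False
    return float(meta.get("artifacts_unchanged_for_sec", 0.0)) > max_quiet_sec
```
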
395
+ def start_run(
396
+ *,
397
+ variant: str,
398
+ video_file_path: Optional[str],
399
+ video_url: Optional[str],
400
+ out_dir: Optional[str],
401
+ python_bin: Optional[str],
402
+ deepgram_model: str,
403
+ deepgram_language: Optional[str],
404
+ deepgram_request_timeout_sec: float,
405
+ deepgram_connect_timeout_sec: float,
406
+ deepgram_retries: int,
407
+ deepgram_retry_backoff_sec: float,
408
+ force_deepgram: bool,
409
+ force_keyframes: bool,
410
+ pre_roll_sec: float,
411
+ gemini_model: str,
412
+ similarity_threshold: float,
413
+ temperature: float,
414
+ log_heartbeat_sec: float = 10.0,
415
+ ) -> Dict[str, Any]:
416
+ script_name = {
417
+ "full": "run_pipeline_all.py",
418
+ "demo-code": "run_pipeline_demo_code.py",
419
+ }.get(variant)
420
+ if not script_name:
421
+ raise ValueError("variant must be one of: full, demo-code")
422
+
423
+ pipeline_script = PIPELINES_DIR / script_name
424
+ if not pipeline_script.exists():
425
+ raise FileNotFoundError(f"Missing pipeline script: {pipeline_script}")
426
+
427
+ run_id = uuid.uuid4().hex[:12]
428
+ run_dir = _run_dir(run_id)
429
+ run_dir.mkdir(parents=True, exist_ok=True)
430
+
431
+ if video_file_path:
432
+ src = Path(video_file_path).expanduser().resolve()
433
+ if not src.exists():
434
+ raise ValueError(f"Uploaded/local video file not found: {src}")
435
+ dst = run_dir / f"input_{run_id}{src.suffix or '.mp4'}"
436
+ shutil.copy2(src, dst)
437
+ video_path = dst
438
+ elif video_url:
439
+ # Take the suffix from the URL path so query strings don't leak into the filename.
+ suffix = Path(urlparse(video_url).path).suffix or ".mp4"
440
+ video_path = run_dir / f"input_{run_id}{suffix}"
441
+ if _extract_gdrive_file_id(video_url):
442
+ _download_google_drive(video_url, video_path)
443
+ else:
444
+ with httpx.stream("GET", video_url, timeout=120.0, follow_redirects=True) as r:
445
+ r.raise_for_status()
446
+ with open(video_path, "wb") as f:
447
+ for chunk in r.iter_bytes():
448
+ f.write(chunk)
449
+ else:
450
+ raise ValueError("Provide one of: video_file_path or video_url")
451
+
452
+ _validate_video_file(video_path)
453
+ out_path = _resolve_out_dir(out_dir, run_id)
454
+ python_exe = _resolve_python_executable(python_bin)
455
+
456
+ cmd = [
457
+ python_exe,
458
+ "-u",
459
+ str(pipeline_script),
460
+ "--python",
461
+ python_exe,
462
+ *_build_common_args(
463
+ video_path=video_path,
464
+ out_dir=out_path,
465
+ deepgram_model=deepgram_model,
466
+ deepgram_language=deepgram_language,
467
+ deepgram_request_timeout_sec=deepgram_request_timeout_sec,
468
+ deepgram_connect_timeout_sec=deepgram_connect_timeout_sec,
469
+ deepgram_retries=deepgram_retries,
470
+ deepgram_retry_backoff_sec=deepgram_retry_backoff_sec,
471
+ force_deepgram=force_deepgram,
472
+ force_keyframes=force_keyframes,
473
+ pre_roll_sec=pre_roll_sec,
474
+ gemini_model=gemini_model,
475
+ similarity_threshold=similarity_threshold,
476
+ temperature=temperature,
477
+ ),
478
+ ]
479
+
480
+ started = time.time()
481
+ logs_path = _logs_path(run_id)
482
+ log_fh = open(logs_path, "a", encoding="utf-8", buffering=1)
483
+ log_fh.write(
484
+ f"[runner] run_id={run_id} variant={variant} started_at_epoch={started}\n"
485
+ f"[runner] command={' '.join(cmd)}\n"
486
+ f"[runner] cwd={PIPELINES_DIR}\n\n"
487
+ f"[runner] heartbeat_interval_sec={log_heartbeat_sec}\n"
488
+ f"[runner] python_unbuffered=1\n\n"
489
+ )
490
+ log_fh.flush()
491
+
492
+ child_env = os.environ.copy()
493
+ child_env["PYTHONUNBUFFERED"] = "1"
494
+ child_env.setdefault("PYTHONIOENCODING", "utf-8")
495
+
496
+ proc = subprocess.Popen(
497
+ cmd,
498
+ cwd=str(PIPELINES_DIR),
499
+ stdout=log_fh,
500
+ stderr=subprocess.STDOUT,
501
+ text=True,
502
+ env=child_env,
503
+ )
504
+
505
+ meta = {
506
+ "variant": variant,
507
+ "run_id": run_id,
508
+ "python_executable": python_exe,
509
+ "command": cmd,
510
+ "status": "running",
511
+ "exit_code": None,
512
+ "pid": proc.pid,
513
+ "started_at_epoch": started,
514
+ "finished_at_epoch": None,
515
+ "duration_sec": None,
516
+ "out_dir": str(out_path),
517
+ "logs_path": str(logs_path),
518
+ "heartbeat_interval_sec": float(log_heartbeat_sec),
519
+ "output_files": _build_output_files(out_path, variant),
520
+ }
521
+ _write_json(_meta_path(run_id), meta)
522
+
523
+ watcher = threading.Thread(
524
+ target=_watch_run,
525
+ args=(run_id, proc, started, log_fh, float(log_heartbeat_sec)),
526
+ daemon=True,
527
+ )
528
+ watcher.start()
529
+
530
+ return {
531
+ "run_id": run_id,
532
+ "variant": variant,
533
+ "status": "running",
534
+ "python_executable": python_exe,
535
+ "status_path": f"runs/{run_id}",
536
+ "logs_path": f"runs/{run_id}/logs",
537
+ "final_output_path": f"runs/{run_id}/final-output",
538
+ "final_output_condensed_path": f"runs/{run_id}/final-output/condensed",
539
+ "out_dir": str(out_path),
540
+ }
541
+
542
+
543
+ def get_status(run_id: str) -> Dict[str, Any]:
544
+ p = _meta_path(run_id)
545
+ if not p.exists():
546
+ raise FileNotFoundError(f"Unknown run_id: {run_id}")
547
+ return _read_json(p)
548
+
549
+
550
+ def get_logs(run_id: str, tail_lines: int = 300) -> str:
551
+ meta = get_status(run_id)
552
+ p = Path(meta.get("logs_path", ""))
553
+ if not p.exists():
554
+ return ""
555
+ txt = p.read_text(encoding="utf-8", errors="replace")
556
+ limit = max(1, min(int(tail_lines), 5000))
557
+ return _tail(txt, max_lines=limit)
558
+
559
+
560
+ def get_final_output(run_id: str, condensed: bool = False) -> Dict[str, Any]:
561
+ meta = get_status(run_id)
562
+ status = meta.get("status")
563
+ key = "final_output_condensed" if condensed else "final_output"
564
+ out_file = Path(meta["output_files"][key])
565
+
566
+ if status == "running":
567
+ return {
568
+ "run_id": run_id,
569
+ "status": status,
570
+ "message": "Pipeline is still running. Check logs.",
571
+ }
572
+ if status == "failed":
573
+ return {
574
+ "run_id": run_id,
575
+ "status": status,
576
+ "message": "Pipeline failed. Check logs.",
577
+ }
578
+ if not out_file.exists():
579
+ raise FileNotFoundError(f"Output not found: {out_file}")
580
+ return _read_json(out_file)
581
+
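
Putting the module together, a minimal polling client; the model names and numeric values are illustrative:

```python
# Minimal sketch of driving run_manager end to end.
import time
from run_manager import start_run, get_status, get_logs, get_final_output

info = start_run(
    variant="demo-code",
    video_file_path="meeting.mp4",
    video_url=None,
    out_dir=None,
    python_bin=None,
    deepgram_model="nova-2",          # example value
    deepgram_language=None,
    deepgram_request_timeout_sec=600.0,
    deepgram_connect_timeout_sec=30.0,
    deepgram_retries=2,
    deepgram_retry_backoff_sec=5.0,
    force_deepgram=False,
    force_keyframes=False,
    pre_roll_sec=2.0,
    gemini_model="gemini-2.0-flash",  # example value
    similarity_threshold=0.55,
    temperature=0.2,
)
run_id = info["run_id"]
while get_status(run_id)["status"] == "running":
    time.sleep(10)
print(get_logs(run_id, tail_lines=50))
result = get_final_output(run_id, condensed=True)
```
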
vercel.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "version": 2,
3
+ "rewrites": [
4
+ { "source": "/(.*)", "destination": "/api/index" }
5
+ ],
6
+ "functions": {
7
+ "api/index.py": {
8
+ "maxDuration": 900
9
+ }
10
+ }
11
+ }
12
+