Commit 2db58d0
Parent(s): b903d41

Deploy deployed-meet Gradio app

Files changed:
- .dockerignore +12 -0
- .gitattributes +1 -0
- .gitignore +7 -0
- Dockerfile +30 -0
- HF_SPACE_DEPLOY.md +65 -0
- README.md +44 -6
- api/index.py +672 -0
- app.py +279 -0
- pipelines/assign_utterances_to_keyframes.py +249 -0
- pipelines/build_final_output.py +758 -0
- pipelines/build_final_output_demo_code.py +549 -0
- pipelines/condense_final_output.py +145 -0
- pipelines/deepgram_extract_utterances.py +208 -0
- pipelines/models/yolov8x-doclaynet.pt +3 -0
- pipelines/run_pipeline_all.py +238 -0
- pipelines/run_pipeline_demo_code.py +239 -0
- pipelines/smart_keyframes_and_classify.py +1443 -0
- requirements.txt +17 -0
- run_manager.py +581 -0
- vercel.json +21 -0

.dockerignore ADDED
@@ -0,0 +1,12 @@
+.git
+.gitignore
+.venv
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.log
+out*
+tmp
+runs
+

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pipelines/models/*.pt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED
@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+.env
+.venv/
+out*/
+tmp/
+

Dockerfile ADDED
@@ -0,0 +1,30 @@
+FROM python:3.10-slim
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PIP_NO_CACHE_DIR=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONIOENCODING=utf-8 \
+    PIPELINE_WORKDIR=/data/deployed-meet-runs
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    ffmpeg \
+    curl \
+    libgl1 \
+    libglib2.0-0 \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+COPY . /app
+
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install -r requirements.txt && \
+    pip install --no-build-isolation "git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1"
+
+RUN mkdir -p /data/deployed-meet-runs
+
+EXPOSE 7860
+
+CMD ["python", "-m", "uvicorn", "api.index:app", "--host", "0.0.0.0", "--port", "7860"]
+
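
The image above is self-contained, so it can be smoke-tested locally before any Space deploy. A minimal sketch, assuming Docker is installed and the two API keys are already set in the current PowerShell session (the `deployed-meet` image tag is an arbitrary choice, not part of the commit):

```powershell
# Build the image from the repo root (where the Dockerfile lives).
docker build -t deployed-meet .

# Run the service on the port the image exposes, passing the keys
# the pipeline expects as environment variables.
docker run --rm -p 7860:7860 `
  -e GEMINI_API_KEY=$env:GEMINI_API_KEY `
  -e DEEPGRAM_API_KEY=$env:DEEPGRAM_API_KEY `
  deployed-meet
```

Note that the container serves the legacy FastAPI app (`api.index:app`); the Gradio entry point `app.py` is what Spaces itself runs.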

HF_SPACE_DEPLOY.md ADDED
@@ -0,0 +1,65 @@
+# Deploy to Hugging Face Spaces (Gradio SDK)
+
+This package is now Gradio-native (`app.py`) and does not require Docker on Spaces.
+The `demo-code` variant is configured as demo-only Gemini:
+- Gemini + YOLO only for `demo` frames.
+- `slides`/`code`/`none` use OCR + transcript output.
+
+## 1) Create the Space
+1. Go to `https://huggingface.co/new-space`.
+2. Choose:
+   - SDK: `Gradio`
+   - Space name: your choice (for example `deployed-meet`)
+   - Visibility: your choice
+3. Click **Create Space**.
+
+## 2) Clone the Space repo
+```powershell
+git clone https://huggingface.co/spaces/<YOUR_USER>/<YOUR_SPACE_NAME> hf-space-deployed-meet
+cd hf-space-deployed-meet
+```
+
+## 3) Copy this folder into the Space repo
+Copy everything from the local `deployed-meet/` folder into the root of the cloned Space repo (see the sketch after this document).
+
+Required root files after copy:
+- `app.py`
+- `run_manager.py`
+- `requirements.txt`
+- `README.md`
+- `pipelines/...`
+
+## 4) Track model weights with Git LFS
+```powershell
+git lfs install
+git lfs track "pipelines/models/*.pt"
+git add .gitattributes
+```
+
+## 5) Add secrets in Space Settings
+In **Settings -> Variables and secrets**, add:
+- `GEMINI_API_KEY`
+- `DEEPGRAM_API_KEY`
+
+Optional:
+- `PIPELINE_WORKDIR=/data/deployed-meet-runs`
+- `YOLO_DEVICE=cpu` (if your Space has no GPU)
+- `OCR_GPU=false` (if your Space has no GPU)
+
+## 6) Commit and push
+```powershell
+git add .
+git commit -m "Deploy deployed-meet Gradio app"
+git push
+```
+
+Wait for the build to complete.
+
+## 7) Open the app and run
+- App URL: `https://<YOUR_SPACE_NAME>.hf.space`
+- Start from the **Start Run** tab, then monitor from the **Track Run** tab.
+
+## ZeroGPU note
+- ZeroGPU only works with Gradio Spaces, which this repo now uses.
+- This pipeline is long-running and model-heavy, so ZeroGPU sessions may be unstable for long videos.
+- For reliable long jobs, upgraded CPU hardware or a dedicated GPU Space is recommended.
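
Step 3 of the guide above gives no copy command. A minimal PowerShell sketch, assuming the package sits in a sibling `deployed-meet/` folder next to the cloned Space repo (adjust the source path to your layout):

```powershell
# From inside hf-space-deployed-meet: copy the package contents into the repo root.
Copy-Item -Path ..\deployed-meet\* -Destination . -Recurse -Force
```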

README.md CHANGED
@@ -1,12 +1,50 @@
 ---
-title:
-emoji: 🚀
-colorFrom: red
-colorTo: yellow
+title: deployed-meet
 sdk: gradio
-sdk_version: 6.5.1
 app_file: app.py
 pinned: false
 ---
 
-
+# deployed-meet
+
+Gradio-based deployment package for the meeting pipeline.
+
+## Pipeline variants
+- `full`:
+  - Gemini is called for all keyframe types (`slides`, `code`, `demo`, `none` as applicable).
+- `demo-code`:
+  - Gemini is called only for `demo` keyframes.
+  - `slides`/`code`/`none` are built from OCR + transcript.
+  - `smart_keyframes_and_classify.py` runs with `--no-yolo-for-non-demo` in this variant.
+
+## Run locally (Gradio)
+```powershell
+cd deployed-meet
+C:/meet-agent/.venv/Scripts/activate
+pip install -r requirements.txt
+python app.py
+```
+
+Open: `http://127.0.0.1:7860`
+
+## How to use the UI
+1. Go to **Start Run**.
+2. Select a variant (`full` or `demo-code`).
+3. Choose an input mode (`Upload File` or `Video URL`).
+4. Click **Start Pipeline** and copy the generated `run_id`.
+5. Go to **Track Run**, paste the `run_id`, then use:
+   - **Refresh Status + Logs**
+   - **Watch Live**
+   - **Fetch Final Output**
+   - **Fetch Condensed Output**
+
+## Required environment variables
+Set these before starting:
+- `GEMINI_API_KEY`
+- `DEEPGRAM_API_KEY`
+
+Optional:
+- `PIPELINE_WORKDIR` (defaults to a temp directory)
+
+## Legacy FastAPI
+The existing FastAPI code is still in `api/index.py`, but Hugging Face Gradio Spaces will run `app.py`.
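
The README lists the required variables but no command for setting them. One way to set them for the current PowerShell session before running `python app.py` (the key values and the workdir path are placeholders):

```powershell
# Required by the pipeline (use your real keys).
$env:GEMINI_API_KEY = "<your-gemini-key>"
$env:DEEPGRAM_API_KEY = "<your-deepgram-key>"

# Optional: pin the working directory instead of the temp-directory default.
$env:PIPELINE_WORKDIR = "C:\deployed-meet-runs"
```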

api/index.py ADDED
@@ -0,0 +1,672 @@
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+import uuid
+from html import unescape
+from pathlib import Path
+from typing import Any, Dict, Optional
+from urllib.parse import parse_qs, urljoin, urlparse
+
+import httpx
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse, PlainTextResponse
+from pydantic import BaseModel, Field, HttpUrl
+
+
+BASE_DIR = Path(__file__).resolve().parents[1]
+PIPELINES_DIR = BASE_DIR / "pipelines"
+DEFAULT_WORKDIR = Path(os.getenv("PIPELINE_WORKDIR", tempfile.gettempdir())) / "deployed-meet-runs"
+DEFAULT_WORKDIR.mkdir(parents=True, exist_ok=True)
+RUNS_DIR = DEFAULT_WORKDIR / "runs"
+RUNS_DIR.mkdir(parents=True, exist_ok=True)
+
+
+class PipelineRequest(BaseModel):
+    video_path: Optional[str] = Field(default=None, description="Absolute or server-local path to input video.")
+    video_url: Optional[HttpUrl] = Field(default=None, description="Optional URL to download input video from.")
+    out_dir: Optional[str] = Field(default=None, description="Optional output directory. Defaults to /tmp run folder.")
+
+    deepgram_model: str = "nova-3"
+    deepgram_language: Optional[str] = None
+    deepgram_request_timeout_sec: float = 1200.0
+    deepgram_connect_timeout_sec: float = 30.0
+    deepgram_retries: int = 3
+    deepgram_retry_backoff_sec: float = 2.0
+    force_deepgram: bool = False
+
+    force_keyframes: bool = False
+    pre_roll_sec: float = 3.0
+    gemini_model: str = "gemini-2.5-flash"
+    similarity_threshold: float = 0.82
+    temperature: float = 0.2
+    python_bin: Optional[str] = Field(
+        default=None,
+        description="Optional Python executable path for running pipeline subprocesses.",
+    )
+    log_heartbeat_sec: float = Field(
+        default=10.0,
+        description="Seconds between heartbeat progress lines written to run logs.",
+    )
+
+
+app = FastAPI(title="deployed-meet", version="1.0.0")
+
+
+def _tail(text: str, max_lines: int = 220) -> str:
+    lines = (text or "").splitlines()
+    if len(lines) <= max_lines:
+        return "\n".join(lines)
+    return "\n".join(lines[-max_lines:])
+
+
+def _run_dir(run_id: str) -> Path:
+    return RUNS_DIR / run_id
+
+
+def _meta_path(run_id: str) -> Path:
+    return _run_dir(run_id) / "run_meta.json"
+
+
+def _logs_path(run_id: str) -> Path:
+    return _run_dir(run_id) / "pipeline.log"
+
+
+def _write_json(path: Path, data: Dict[str, Any]) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_suffix(path.suffix + ".tmp")
+    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+    tmp.replace(path)
+
+
+def _read_json(path: Path) -> Dict[str, Any]:
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _get_meta_or_404(run_id: str) -> Dict[str, Any]:
+    p = _meta_path(run_id)
+    if not p.exists():
+        raise HTTPException(status_code=404, detail=f"Unknown run_id: {run_id}")
+    try:
+        return _read_json(p)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to read run metadata: {type(e).__name__}: {e}") from e
+
+
+def _resolve_video_input(req: PipelineRequest, run_id: str, run_dir: Path) -> Path:
+    if req.video_path:
+        p = Path(req.video_path).expanduser().resolve()
+        if not p.exists():
+            raise HTTPException(status_code=400, detail=f"video_path does not exist: {p}")
+        return p
+
+    if req.video_url:
+        suffix = Path(str(req.video_url)).suffix or ".mp4"
+        local = run_dir / f"input_{run_id}{suffix}"
+        try:
+            url = str(req.video_url)
+            if _extract_gdrive_file_id(url):
+                _download_google_drive(url, local)
+            else:
+                with httpx.stream("GET", url, timeout=120.0, follow_redirects=True) as r:
+                    r.raise_for_status()
+                    with open(local, "wb") as f:
+                        for chunk in r.iter_bytes():
+                            f.write(chunk)
+        except HTTPException:
+            raise
+        except Exception as e:
+            raise HTTPException(status_code=400, detail=f"Failed to download video_url: {type(e).__name__}: {e}") from e
+        return local
+
+    raise HTTPException(status_code=400, detail="Provide one of: video_path or video_url.")
+
+
+def _extract_gdrive_file_id(url: str) -> Optional[str]:
+    parsed = urlparse(url)
+    host = (parsed.netloc or "").lower()
+    if "drive.google.com" not in host:
+        return None
+
+    m = re.search(r"/file/d/([a-zA-Z0-9_-]+)", parsed.path or "")
+    if m:
+        return m.group(1)
+
+    qs = parse_qs(parsed.query or "")
+    if "id" in qs and qs["id"]:
+        return qs["id"][0]
+
+    return None
+
+
+def _download_google_drive(url: str, out_path: Path) -> None:
+    file_id = _extract_gdrive_file_id(url)
+    if not file_id:
+        raise HTTPException(status_code=400, detail="Could not parse Google Drive file id from video_url.")
+
+    direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+
+    def _is_html_response(resp: httpx.Response) -> bool:
+        ctype = (resp.headers.get("content-type") or "").lower()
+        if "html" in ctype or "text/plain" in ctype:
+            return True
+        head = (resp.content[:256] or b"").lower()
+        return b"<html" in head or b"<!doctype html" in head
+
+    def _write_if_file(resp: httpx.Response) -> bool:
+        if _is_html_response(resp):
+            return False
+        if not resp.content or len(resp.content) < 1024:
+            return False
+        out_path.write_bytes(resp.content)
+        return True
+
+    try:
+        with httpx.Client(timeout=120.0, follow_redirects=True) as client:
+            # Try a couple of direct download endpoints first.
+            candidates = [
+                direct_url,
+                f"https://drive.usercontent.google.com/download?id={file_id}&export=download&confirm=t",
+            ]
+
+            for c in candidates:
+                rr = client.get(c)
+                rr.raise_for_status()
+                if _write_if_file(rr):
+                    return
+
+            # Parse Drive HTML interstitial page and submit download form if present.
+            page = client.get(f"https://drive.google.com/file/d/{file_id}/view")
+            page.raise_for_status()
+            html = page.text or ""
+
+            # Pattern A: explicit download form.
+            form_action_match = re.search(r'id="download-form"[^>]*action="([^"]+)"', html)
+            if form_action_match:
+                action = unescape(form_action_match.group(1))
+                action_url = urljoin("https://drive.google.com", action)
+                params = {k: v for k, v in re.findall(r'<input[^>]+name="([^"]+)"[^>]+value="([^"]*)"', html)}
+                form_resp = client.get(action_url, params=params)
+                form_resp.raise_for_status()
+                if _write_if_file(form_resp):
+                    return
+
+            # Pattern B: direct download link in page HTML.
+            link_match = re.search(r'href="(/uc\?export=download[^"]+)"', html)
+            if link_match:
+                href = unescape(link_match.group(1)).replace("&amp;", "&")
+                link_url = urljoin("https://drive.google.com", href)
+                link_resp = client.get(link_url)
+                link_resp.raise_for_status()
+                if _write_if_file(link_resp):
+                    return
+
+            # Pattern C: download_warning cookie + confirm token flow.
+            cookie_confirm = None
+            for k, v in page.cookies.items():
+                if str(k).startswith("download_warning"):
+                    cookie_confirm = v
+                    break
+            if cookie_confirm:
+                confirm_url = f"https://drive.google.com/uc?export=download&confirm={cookie_confirm}&id={file_id}"
+                confirm_resp = client.get(confirm_url)
+                confirm_resp.raise_for_status()
+                if _write_if_file(confirm_resp):
+                    return
+
+            msg = "Google Drive link did not provide a downloadable file."
+            low = html.lower()
+            if "you need access" in low or "request access" in low:
+                msg += " File is not publicly accessible."
+            elif "quota exceeded" in low or "too many users have viewed or downloaded" in low:
+                msg += " File appears to be quota-limited by Google Drive."
+            else:
+                msg += " Use a publicly accessible direct file link or local video_path."
+            raise HTTPException(status_code=400, detail=msg)
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Failed to download Google Drive file: {type(e).__name__}: {e}") from e
+
+
+def _validate_video_file(path: Path) -> None:
+    if not path.exists() or not path.is_file():
+        raise HTTPException(status_code=400, detail=f"Input video file not found: {path}")
+
+    size = path.stat().st_size
+    if size < 1024:
+        raise HTTPException(status_code=400, detail=f"Input file is too small to be valid media: {path} ({size} bytes)")
+
+    # Common case for bad video_url: downloaded HTML/JSON page saved as .mp4.
+    try:
+        head = path.read_bytes()[:4096].lower()
+        if b"<html" in head or b"<!doctype html" in head or b"{\"error\"" in head:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "Downloaded input is not a media file (looks like HTML/JSON response). "
+                    "Use a direct video file URL or provide video_path."
+                ),
+            )
+    except HTTPException:
+        raise
+    except Exception:
+        pass
+
+    # Lightweight decode check.
+    try:
+        import cv2  # local import to avoid import cost at startup
+
+        cap = cv2.VideoCapture(str(path))
+        ok = cap.isOpened()
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
+        cap.release()
+        if (not ok) or frame_count <= 0:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "Input file is not a decodable video for this runtime. "
+                    "Provide a valid MP4 (H.264/AAC recommended) or use a direct media URL."
+                ),
+            )
+    except HTTPException:
+        raise
+    except Exception:
+        # If cv2 probing fails unexpectedly, let pipeline attempt process rather than hard-fail.
+        pass
+
+
+def _resolve_python_executable(req: PipelineRequest) -> str:
+    if req.python_bin:
+        p = Path(req.python_bin).expanduser()
+        if not p.exists():
+            raise HTTPException(status_code=400, detail=f"python_bin does not exist: {p}")
+        return str(p.resolve())
+
+    # Prefer project virtualenv if available.
+    candidates = [
+        BASE_DIR.parent / ".venv" / "Scripts" / "python.exe",  # Windows, repo root venv
+        BASE_DIR / ".venv" / "Scripts" / "python.exe",  # Windows, deployed-meet local venv
+        BASE_DIR.parent / ".venv" / "bin" / "python",  # Unix, repo root venv
+        BASE_DIR / ".venv" / "bin" / "python",  # Unix, deployed-meet local venv
+    ]
+    for c in candidates:
+        if c.exists():
+            return str(c.resolve())
+
+    # Fallback to currently running interpreter.
+    return sys.executable or os.getenv("PYTHON_BIN") or "python"
+
+
+def _resolve_out_dir(req: PipelineRequest, run_id: str) -> Path:
+    if req.out_dir:
+        p = Path(req.out_dir)
+        if not p.is_absolute():
+            p = DEFAULT_WORKDIR / p
+    else:
+        p = DEFAULT_WORKDIR / f"run_{run_id}"
+    p.mkdir(parents=True, exist_ok=True)
+    return p.resolve()
+
+
+def _build_common_args(req: PipelineRequest, video_path: Path, out_dir: Path) -> list[str]:
+    args = [
+        "--video",
+        str(video_path),
+        "--out",
+        str(out_dir),
+        "--deepgram-model",
+        req.deepgram_model,
+        "--deepgram-request-timeout-sec",
+        str(req.deepgram_request_timeout_sec),
+        "--deepgram-connect-timeout-sec",
+        str(req.deepgram_connect_timeout_sec),
+        "--deepgram-retries",
+        str(req.deepgram_retries),
+        "--deepgram-retry-backoff-sec",
+        str(req.deepgram_retry_backoff_sec),
+        "--pre-roll-sec",
+        str(req.pre_roll_sec),
+        "--gemini-model",
+        req.gemini_model,
+        "--similarity-threshold",
+        str(req.similarity_threshold),
+        "--temperature",
+        str(req.temperature),
+    ]
+    if req.deepgram_language:
+        args.extend(["--deepgram-language", req.deepgram_language])
+    if req.force_deepgram:
+        args.append("--force-deepgram")
+    if req.force_keyframes:
+        args.append("--force-keyframes")
+    return args
+
+
+def _build_output_files(out_dir: Path, variant: str) -> Dict[str, str]:
+    return {
+        "utterances": str(out_dir / "utterances.json"),
+        "keyframes_parsed": str(out_dir / "keyframes_parsed.json"),
+        "keyframes_with_utterances": str(out_dir / "keyframes_with_utterances.json"),
+        "final_output": str(
+            out_dir / ("final_output.json" if variant == "full" else "final_output_demo_code.json")
+        ),
+        "final_output_condensed": str(
+            out_dir / ("final_output_condensed.json" if variant == "full" else "final_output_demo_code_condensed.json")
+        ),
+    }
+
+
+def _artifact_state(output_files: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
+    state: Dict[str, Dict[str, Any]] = {}
+    for key, p in output_files.items():
+        path = Path(p)
+        if path.exists():
+            try:
+                st = path.stat()
+                state[key] = {
+                    "size_bytes": int(st.st_size),
+                    "mtime": float(st.st_mtime),
+                }
+            except Exception:
+                state[key] = {"size_bytes": -1, "mtime": -1.0}
+    return state
+
+
+def _format_artifact_compact(state: Dict[str, Dict[str, Any]]) -> str:
+    if not state:
+        return "none"
+    parts = []
+    for k in sorted(state.keys()):
+        sz = float(state[k].get("size_bytes", 0))
+        parts.append(f"{k}:{sz/1024.0:.1f}KB")
+    return ", ".join(parts)
+
+
+def _watch_run(
+    run_id: str,
+    proc: subprocess.Popen,
+    started_at: float,
+    log_fh,
+    heartbeat_sec: float,
+) -> None:
+    heartbeat_sec = max(2.0, float(heartbeat_sec))
+    last_hb = 0.0
+    last_artifact_change = started_at
+    last_state: Dict[str, Dict[str, Any]] = {}
+
+    # Emit periodic progress so logs are not "stuck" during long calls.
+    while True:
+        now = time.time()
+        rc = proc.poll()
+
+        if (now - last_hb) >= heartbeat_sec:
+            try:
+                meta_file = _meta_path(run_id)
+                meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
+                out_files = meta.get("output_files", {}) or {}
+                cur_state = _artifact_state(out_files)
+                changed = cur_state != last_state
+                if changed:
+                    last_artifact_change = now
+                unchanged_for = now - last_artifact_change
+                elapsed = now - started_at
+
+                log_fh.write(
+                    "[runner] heartbeat "
+                    f"elapsed={elapsed:.1f}s pid={proc.pid} "
+                    f"artifacts={len(cur_state)}/{len(out_files)} "
+                    f"changed={'yes' if changed else 'no'} "
+                    f"unchanged_for={unchanged_for:.1f}s "
+                    f"[{_format_artifact_compact(cur_state)}]\n"
+                )
+                log_fh.flush()
+
+                meta["last_heartbeat_epoch"] = now
+                meta["last_heartbeat_elapsed_sec"] = round(elapsed, 3)
+                meta["artifacts_ready_count"] = len(cur_state)
+                meta["artifacts_total_count"] = len(out_files)
+                meta["artifacts_unchanged_for_sec"] = round(unchanged_for, 3)
+                _write_json(meta_file, meta)
+                last_state = cur_state
+            except Exception as e:
+                try:
+                    log_fh.write(f"[runner] heartbeat_error: {type(e).__name__}: {e}\n")
+                    log_fh.flush()
+                except Exception:
+                    pass
+            last_hb = now
+
+        if rc is not None:
+            return_code = int(rc)
+            break
+
+        time.sleep(1.0)
+
+    finished_at = time.time()
+    try:
+        meta_file = _meta_path(run_id)
+        meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
+        meta["status"] = "succeeded" if return_code == 0 else "failed"
+        meta["exit_code"] = int(return_code)
+        meta["finished_at_epoch"] = finished_at
+        meta["duration_sec"] = round(finished_at - started_at, 3)
+        _write_json(meta_file, meta)
+    except Exception as e:
+        try:
+            log_fh.write(f"\n[runner] failed to update metadata: {type(e).__name__}: {e}\n")
+            log_fh.flush()
+        except Exception:
+            pass
+
+    try:
+        log_fh.write(f"\n[runner] process finished with exit_code={return_code}\n")
+        log_fh.flush()
+    except Exception:
+        pass
+    finally:
+        try:
+            log_fh.close()
+        except Exception:
+            pass
+
+
+def _start_pipeline(pipeline_script: Path, req: PipelineRequest, variant: str) -> Dict[str, Any]:
+    if not pipeline_script.exists():
+        raise HTTPException(status_code=500, detail=f"Missing pipeline script: {pipeline_script}")
+
+    run_id = uuid.uuid4().hex[:12]
+    run_dir = _run_dir(run_id)
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    video_path = _resolve_video_input(req, run_id, run_dir)
+    _validate_video_file(video_path)
+    out_dir = _resolve_out_dir(req, run_id)
+    python_exe = _resolve_python_executable(req)
+
+    cmd = [
+        python_exe,
+        "-u",
+        str(pipeline_script),
+        "--python",
+        python_exe,
+        *_build_common_args(req, video_path, out_dir),
+    ]
+
+    started = time.time()
+    logs_path = _logs_path(run_id)
+    log_fh = open(logs_path, "a", encoding="utf-8", buffering=1)
+    log_fh.write(
+        f"[runner] run_id={run_id} variant={variant} started_at_epoch={started}\n"
+        f"[runner] command={' '.join(cmd)}\n"
+        f"[runner] cwd={PIPELINES_DIR}\n\n"
+        f"[runner] heartbeat_interval_sec={req.log_heartbeat_sec}\n"
+        f"[runner] python_unbuffered=1\n\n"
+    )
+    log_fh.flush()
+
+    child_env = os.environ.copy()
+    child_env["PYTHONUNBUFFERED"] = "1"
+    child_env.setdefault("PYTHONIOENCODING", "utf-8")
+
+    proc = subprocess.Popen(
+        cmd,
+        cwd=str(PIPELINES_DIR),
+        stdout=log_fh,
+        stderr=subprocess.STDOUT,
+        text=True,
+        env=child_env,
+    )
+
+    meta = {
+        "variant": variant,
+        "run_id": run_id,
+        "python_executable": python_exe,
+        "command": cmd,
+        "status": "running",
+        "exit_code": None,
+        "pid": proc.pid,
+        "started_at_epoch": started,
+        "finished_at_epoch": None,
+        "duration_sec": None,
+        "out_dir": str(out_dir),
+        "logs_path": str(logs_path),
+        "heartbeat_interval_sec": float(req.log_heartbeat_sec),
+        "output_files": _build_output_files(out_dir, variant),
+    }
+    _write_json(_meta_path(run_id), meta)
+
+    watcher = threading.Thread(
+        target=_watch_run,
+        args=(run_id, proc, started, log_fh, float(req.log_heartbeat_sec)),
+        daemon=True,
+    )
+    watcher.start()
+
+    return {
+        "run_id": run_id,
+        "variant": variant,
+        "status": "running",
+        "python_executable": python_exe,
+        "status_path": f"/runs/{run_id}",
+        "logs_path": f"/runs/{run_id}/logs",
+        "final_output_path": f"/runs/{run_id}/final-output",
+        "final_output_condensed_path": f"/runs/{run_id}/final-output/condensed",
+        "out_dir": str(out_dir),
+    }
+
+
+@app.get("/health")
+def health() -> Dict[str, str]:
+    return {"status": "ok"}
+
+
+@app.get("/")
+def root() -> Dict[str, Any]:
+    return {
+        "service": "deployed-meet",
+        "status": "ok",
+        "docs": "/docs",
+        "routes": [
+            "/pipeline/full",
+            "/pipeline/demo-code",
+            "/runs/{run_id}",
+            "/runs/{run_id}/logs",
+            "/runs/{run_id}/final-output",
+            "/runs/{run_id}/final-output/condensed",
+        ],
+    }
+
+
+@app.post("/pipeline/full")
+def pipeline_full(req: PipelineRequest) -> Dict[str, Any]:
+    return _start_pipeline(PIPELINES_DIR / "run_pipeline_all.py", req, variant="full")
+
+
+@app.post("/pipeline/demo-code")
+def pipeline_demo_code(req: PipelineRequest) -> Dict[str, Any]:
+    return _start_pipeline(PIPELINES_DIR / "run_pipeline_demo_code.py", req, variant="demo_code")
+
+
+@app.get("/runs/{run_id}")
+def run_status(run_id: str) -> Dict[str, Any]:
+    return _get_meta_or_404(run_id)
+
+
+@app.get("/runs/{run_id}/logs")
+def run_logs(run_id: str, tail_lines: int = 300) -> PlainTextResponse:
+    meta = _get_meta_or_404(run_id)
+    p = Path(meta.get("logs_path", ""))
+    if not p.exists():
+        return PlainTextResponse("")
+    txt = p.read_text(encoding="utf-8", errors="replace")
+    limit = max(1, min(int(tail_lines), 5000))
+    return PlainTextResponse(_tail(txt, max_lines=limit))
+
+
+@app.get("/runs/{run_id}/final-output")
+def run_final_output(run_id: str) -> Any:
+    meta = _get_meta_or_404(run_id)
+    status = meta.get("status")
+    out_file = Path(meta["output_files"]["final_output"])
+
+    if status == "running":
+        return JSONResponse(
+            status_code=202,
+            content={
+                "run_id": run_id,
+                "status": status,
+                "message": "Pipeline is still running. Check /runs/{run_id}/logs for live progress.",
+                "logs_path": f"/runs/{run_id}/logs",
+            },
+        )
+    if status == "failed":
+        raise HTTPException(
+            status_code=409,
+            detail={
+                "run_id": run_id,
+                "status": status,
+                "message": "Pipeline failed. Check logs for details.",
+                "logs_path": f"/runs/{run_id}/logs",
+            },
+        )
+    if not out_file.exists():
+        raise HTTPException(status_code=404, detail=f"Final output not found: {out_file}")
+    return _read_json(out_file)
+
+
+@app.get("/runs/{run_id}/final-output/condensed")
+def run_final_output_condensed(run_id: str) -> Any:
+    meta = _get_meta_or_404(run_id)
+    status = meta.get("status")
+    out_file = Path(meta["output_files"]["final_output_condensed"])
+
+    if status == "running":
+        return JSONResponse(
+            status_code=202,
+            content={
+                "run_id": run_id,
+                "status": status,
+                "message": "Pipeline is still running. Check /runs/{run_id}/logs for live progress.",
+                "logs_path": f"/runs/{run_id}/logs",
+            },
+        )
+    if status == "failed":
+        raise HTTPException(
+            status_code=409,
+            detail={
+                "run_id": run_id,
+                "status": status,
+                "message": "Pipeline failed. Check logs for details.",
+                "logs_path": f"/runs/{run_id}/logs",
+            },
+        )
+    if not out_file.exists():
+        raise HTTPException(status_code=404, detail=f"Condensed final output not found: {out_file}")
+    return _read_json(out_file)
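
The endpoints above add up to a simple start-then-poll workflow. A sketch using PowerShell's `Invoke-RestMethod` against a local instance (the host, port, and example video URL are assumptions, not part of the commit):

```powershell
# Start a demo-code run from a direct video URL.
$start = Invoke-RestMethod -Method Post -Uri "http://localhost:7860/pipeline/demo-code" `
  -ContentType "application/json" `
  -Body '{"video_url": "https://example.com/meeting.mp4"}'
$runId = $start.run_id

# Poll status and tail the logs while the run is in progress.
Invoke-RestMethod -Uri "http://localhost:7860/runs/$runId"
Invoke-RestMethod -Uri "http://localhost:7860/runs/$runId/logs?tail_lines=200"

# Returns HTTP 202 while running; the final JSON once status is "succeeded".
Invoke-RestMethod -Uri "http://localhost:7860/runs/$runId/final-output"
```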

app.py ADDED
@@ -0,0 +1,279 @@
+from __future__ import annotations
+
+import os
+import time
+from typing import Any, Dict, Optional, Tuple
+
+import gradio as gr
+
+from run_manager import get_final_output, get_logs, get_status, start_run
+
+
+def _clean_optional(value: Optional[str]) -> Optional[str]:
+    if value is None:
+        return None
+    text = str(value).strip()
+    return text or None
+
+
+def _err_payload(message: str) -> Dict[str, Any]:
+    return {"status": "error", "message": message}
+
+
+def start_pipeline(
+    variant: str,
+    input_mode: str,
+    video_file_path: Optional[str],
+    video_url: Optional[str],
+    out_dir: Optional[str],
+    python_bin: Optional[str],
+    deepgram_model: str,
+    deepgram_language: Optional[str],
+    deepgram_request_timeout_sec: float,
+    deepgram_connect_timeout_sec: float,
+    deepgram_retries: int,
+    deepgram_retry_backoff_sec: float,
+    force_deepgram: bool,
+    force_keyframes: bool,
+    pre_roll_sec: float,
+    gemini_model: str,
+    similarity_threshold: float,
+    temperature: float,
+    log_heartbeat_sec: float,
+) -> Tuple[str, Dict[str, Any], str, str]:
+    try:
+        chosen_video_file = None
+        chosen_video_url = None
+        mode = (input_mode or "").strip().lower()
+
+        if mode == "upload file":
+            chosen_video_file = _clean_optional(video_file_path)
+            if not chosen_video_file:
+                raise ValueError("Select a video file for Upload File mode.")
+        elif mode == "video url":
+            chosen_video_url = _clean_optional(video_url)
+            if not chosen_video_url:
+                raise ValueError("Provide video_url for Video URL mode.")
+        else:
+            raise ValueError("Invalid input mode.")
+
+        result = start_run(
+            variant=variant,
+            video_file_path=chosen_video_file,
+            video_url=chosen_video_url,
+            out_dir=_clean_optional(out_dir),
+            python_bin=_clean_optional(python_bin),
+            deepgram_model=deepgram_model,
+            deepgram_language=_clean_optional(deepgram_language),
+            deepgram_request_timeout_sec=float(deepgram_request_timeout_sec),
+            deepgram_connect_timeout_sec=float(deepgram_connect_timeout_sec),
+            deepgram_retries=int(deepgram_retries),
+            deepgram_retry_backoff_sec=float(deepgram_retry_backoff_sec),
+            force_deepgram=bool(force_deepgram),
+            force_keyframes=bool(force_keyframes),
+            pre_roll_sec=float(pre_roll_sec),
+            gemini_model=gemini_model,
+            similarity_threshold=float(similarity_threshold),
+            temperature=float(temperature),
+            log_heartbeat_sec=float(log_heartbeat_sec),
+        )
+        run_id = str(result["run_id"])
+        logs = get_logs(run_id, tail_lines=120)
+        return run_id, result, logs, run_id
+    except Exception as e:
+        msg = f"{type(e).__name__}: {e}"
+        return "", _err_payload(msg), msg, ""
+
+
+def refresh_status_logs(run_id: str, tail_lines: int) -> Tuple[Dict[str, Any], str]:
+    rid = _clean_optional(run_id)
+    if not rid:
+        return _err_payload("Enter a run_id."), ""
+    try:
+        status = get_status(rid)
+        logs = get_logs(rid, tail_lines=int(tail_lines))
+        return status, logs
+    except Exception as e:
+        return _err_payload(f"{type(e).__name__}: {e}"), ""
+
+
+def fetch_output(run_id: str, condensed: bool) -> Dict[str, Any]:
+    rid = _clean_optional(run_id)
+    if not rid:
+        return _err_payload("Enter a run_id.")
+    try:
+        return get_final_output(rid, condensed=condensed)
+    except Exception as e:
+        return _err_payload(f"{type(e).__name__}: {e}")
+
+
+def watch_run(
+    run_id: str,
+    tail_lines: int,
+    poll_sec: float,
+):
+    rid = _clean_optional(run_id)
+    if not rid:
+        yield _err_payload("Enter a run_id."), "", None, None
+        return
+
+    sleep_sec = max(1.0, float(poll_sec))
+    max_tail = max(10, min(int(tail_lines), 5000))
+
+    while True:
+        try:
+            status = get_status(rid)
+            logs = get_logs(rid, tail_lines=max_tail)
+        except Exception as e:
+            yield _err_payload(f"{type(e).__name__}: {e}"), "", None, None
+            return
+
+        state = str(status.get("status", "unknown")).lower()
+        if state in {"succeeded", "failed"}:
+            full_payload = None
+            condensed_payload = None
+            if state == "succeeded":
+                try:
+                    full_payload = get_final_output(rid, condensed=False)
+                except Exception as e:
+                    full_payload = _err_payload(f"{type(e).__name__}: {e}")
+                try:
+                    condensed_payload = get_final_output(rid, condensed=True)
+                except Exception as e:
+                    condensed_payload = _err_payload(f"{type(e).__name__}: {e}")
+            yield status, logs, full_payload, condensed_payload
+            return
+
+        yield status, logs, None, None
+        time.sleep(sleep_sec)
+
+
+with gr.Blocks(title="deployed-meet") as demo:
+    gr.Markdown(
+        """
+# deployed-meet (Gradio)
+Start either pipeline variant, then monitor logs and fetch final outputs by `run_id`.
+- `full`: Gemini on all keyframe types.
+- `demo-code`: Gemini only on demo keyframes, slides+code are OCR/transcript based.
+"""
+    )
+
+    with gr.Tab("Start Run"):
+        variant = gr.Dropdown(
+            choices=[
+                ("Full pipeline (Gemini on slides/code/demo)", "full"),
+                ("Demo-only Gemini pipeline (slides+code OCR)", "demo-code"),
+            ],
+            value="demo-code",
+            label="Pipeline Variant",
+        )
+        input_mode = gr.Radio(
+            choices=["Upload File", "Video URL"],
+            value="Upload File",
+            label="Input Mode",
+        )
+        video_file = gr.File(label="Video File", type="filepath")
+        video_url = gr.Textbox(label="Video URL", placeholder="https://.../meeting.mp4")
+
+        out_dir = gr.Textbox(
+            label="Output Directory (optional)",
+            placeholder="run_001",
+        )
+        python_bin = gr.Textbox(
+            label="Python Executable (optional)",
+            placeholder="Leave blank to auto-resolve",
+        )
+
+        with gr.Accordion("Advanced Settings", open=False):
+            deepgram_model = gr.Textbox(label="Deepgram Model", value="nova-3")
+            deepgram_language = gr.Textbox(label="Deepgram Language (optional)", value="")
+            deepgram_request_timeout_sec = gr.Number(label="Deepgram Request Timeout (sec)", value=1200.0)
+            deepgram_connect_timeout_sec = gr.Number(label="Deepgram Connect Timeout (sec)", value=30.0)
+            deepgram_retries = gr.Number(label="Deepgram Retries", value=3, precision=0)
+            deepgram_retry_backoff_sec = gr.Number(label="Deepgram Retry Backoff (sec)", value=2.0)
+            force_deepgram = gr.Checkbox(label="Force Deepgram Re-run", value=False)
+            force_keyframes = gr.Checkbox(label="Force Keyframe Re-run", value=False)
+            pre_roll_sec = gr.Number(label="Pre-roll Seconds", value=3.0)
+            gemini_model = gr.Textbox(label="Gemini Model", value="gemini-2.5-flash")
+            similarity_threshold = gr.Number(label="Similarity Threshold", value=0.82)
+            temperature = gr.Number(label="Temperature", value=0.2)
+            log_heartbeat_sec = gr.Number(label="Heartbeat Log Interval (sec)", value=10.0)
+
+        start_btn = gr.Button("Start Pipeline", variant="primary")
+        start_run_id = gr.Textbox(label="Run ID", interactive=False)
+        start_status = gr.JSON(label="Start Response / Error")
+        start_logs = gr.Textbox(label="Initial Logs", lines=14)
+
+    with gr.Tab("Track Run"):
+        track_run_id = gr.Textbox(label="Run ID", placeholder="Paste run_id from Start tab")
+        tail_lines = gr.Slider(label="Log Tail Lines", minimum=50, maximum=3000, value=300, step=50)
+        poll_sec = gr.Slider(label="Live Poll Interval (sec)", minimum=1, maximum=20, value=3, step=1)
+
+        with gr.Row():
+            refresh_btn = gr.Button("Refresh Status + Logs")
+            watch_btn = gr.Button("Watch Live")
+            full_btn = gr.Button("Fetch Final Output")
+            condensed_btn = gr.Button("Fetch Condensed Output")
+
+        track_status = gr.JSON(label="Run Status")
+        track_logs = gr.Textbox(label="Run Logs", lines=22)
+        track_full_output = gr.JSON(label="Final Output")
+        track_condensed_output = gr.JSON(label="Condensed Final Output")
+
+    start_btn.click(
+        fn=start_pipeline,
+        inputs=[
+            variant,
+            input_mode,
+            video_file,
+            video_url,
+            out_dir,
+            python_bin,
+            deepgram_model,
+            deepgram_language,
+            deepgram_request_timeout_sec,
+            deepgram_connect_timeout_sec,
+            deepgram_retries,
+            deepgram_retry_backoff_sec,
+            force_deepgram,
+            force_keyframes,
+            pre_roll_sec,
+            gemini_model,
+            similarity_threshold,
+            temperature,
+            log_heartbeat_sec,
+        ],
+        outputs=[start_run_id, start_status, start_logs, track_run_id],
+    )
+
+    refresh_btn.click(
+        fn=refresh_status_logs,
+        inputs=[track_run_id, tail_lines],
+        outputs=[track_status, track_logs],
+    )
+
+    watch_btn.click(
+        fn=watch_run,
+        inputs=[track_run_id, tail_lines, poll_sec],
+        outputs=[track_status, track_logs, track_full_output, track_condensed_output],
+    )
+
+    full_btn.click(
+        fn=lambda rid: fetch_output(rid, False),
+        inputs=[track_run_id],
+        outputs=[track_full_output],
+    )
+
+    condensed_btn.click(
+        fn=lambda rid: fetch_output(rid, True),
+        inputs=[track_run_id],
+        outputs=[track_condensed_output],
+    )
+
+
+if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=2).launch(
+        server_name="0.0.0.0",
+        server_port=int(os.getenv("PORT", "7860")),
+        show_error=True,
+    )
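
The `__main__` block above reads the listening port from the `PORT` environment variable (default 7860), so running the UI on another port is a one-line override; a minimal sketch:

```powershell
# Serve the Gradio UI on port 8080 instead of the default 7860.
$env:PORT = "8080"
python app.py
```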
pipelines/assign_utterances_to_keyframes.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import argparse
from typing import Any, Dict, List, Optional, Tuple


def safe_str(x: Any) -> str:
    return "" if x is None else str(x)


def extract_list(data: Any) -> List[Dict[str, Any]]:
    # Accept either a list of items, or a dict that contains a list under common keys.
    if isinstance(data, list):
        return [x for x in data if isinstance(x, dict)]
    if isinstance(data, dict):
        for k in ["utterances", "items", "segments", "results", "data"]:
            if k in data and isinstance(data[k], list):
                return [x for x in data[k] if isinstance(x, dict)]
    return []


def extract_keyframes(data: Any) -> List[Dict[str, Any]]:
    # Accept either a list of keyframes, or a dict that contains a list under common keys.
    if isinstance(data, list):
        return [x for x in data if isinstance(x, dict)]
    if isinstance(data, dict):
        for k in ["keyframes", "items", "results", "data"]:
            if k in data and isinstance(data[k], list):
                return [x for x in data[k] if isinstance(x, dict)]
    return []


def get_time_field(d: Dict[str, Any], keys: List[str]) -> Optional[float]:
    for k in keys:
        if k in d:
            try:
                v = d[k]
                if v is None:
                    continue
                return float(v)
            except Exception:
                continue
    return None


def get_utterance_times(u: Dict[str, Any]) -> Tuple[Optional[float], Optional[float]]:
    # Try common fields for start/end times
    start = get_time_field(u, ["start_sec", "start_s", "start", "start_time", "t_start", "begin", "from"])
    end = get_time_field(u, ["end_sec", "end_s", "end", "end_time", "t_end", "finish", "to"])

    # If only one is present, treat utterance as a point-in-time
    if start is not None and end is None:
        end = start
    if end is not None and start is None:
        start = end

    return start, end


def get_utterance_text(u: Dict[str, Any]) -> str:
    for k in ["text", "utterance", "content", "transcript", "sentence"]:
        if k in u and safe_str(u[k]).strip():
            return safe_str(u[k]).strip()

    # Some formats store a words list instead of flat text
    if "words" in u and isinstance(u["words"], list):
        parts = []
        for w in u["words"]:
            if isinstance(w, dict):
                t = w.get("word") or w.get("text")
                if t:
                    parts.append(str(t))
            elif isinstance(w, str):
                parts.append(w)
        if parts:
            return " ".join(parts).strip()

    return ""


def overlaps(a0: float, a1: float, b0: float, b1: float) -> bool:
    # Closed-open overlap check: [a0, a1) overlaps [b0, b1) iff max(starts) < min(ends)
    return max(a0, b0) < min(a1, b1)


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("keyframes_json", help="Path to keyframes JSON (e.g. keyframes_parsed.json)")
    ap.add_argument("utterances_json", help="Path to utterances.json")
    ap.add_argument("-o", "--out", default="keyframes_with_utterances.json", help="Output JSON path")
    ap.add_argument(
        "--pre-roll-sec",
        type=float,
        default=3.0,
        help="Seconds before each keyframe start that should also belong to that keyframe.",
    )
    args = ap.parse_args()

    # Load keyframes
    with open(args.keyframes_json, "r", encoding="utf-8") as f:
        kf_raw = json.load(f)

    keyframes_list = extract_keyframes(kf_raw)
    if not keyframes_list:
        raise ValueError(
            "No keyframes found. Expected a list, or an object containing keyframes under one of: "
            "keyframes/items/results/data."
        )

    # Sort keyframes by time
    keyframes = sorted(
        keyframes_list,
        key=lambda k: (
            float(k.get("t_sec", 0.0) or 0.0),
            int(k.get("keyframe_idx", 0) or 0),
        ),
    )
    if not keyframes:
        raise ValueError("No keyframes found in keyframes JSON")

    pre_roll_sec = max(0.0, float(args.pre_roll_sec))

    # Precompute keyframe times and windows.
    # window i:
    #   - first keyframe: [t_0, t_1)
    #   - others: [max(t_i - pre_roll_sec, t_{i-1}), t_{i+1})
    # This makes [t_i - pre_roll_sec, t_i) belong to BOTH keyframe i and keyframe i-1.
    t = [float(kf.get("t_sec", 0.0) or 0.0) for kf in keyframes]
    n = len(t)
    windows: List[Tuple[float, float]] = []
    for i in range(n):
        if i == 0:
            start = t[i]
        else:
            start = max(t[i] - pre_roll_sec, t[i - 1])
        end = t[i + 1] if i < n - 1 else float("inf")
        windows.append((start, end))

    # Prepare output keyframes (copy + add assigned_utterances)
    out_keyframes: List[Dict[str, Any]] = []
    for kf in keyframes:
        kf_out = dict(kf)
        kf_out["assigned_utterances"] = []
        out_keyframes.append(kf_out)

    # Load utterances
    with open(args.utterances_json, "r", encoding="utf-8") as f:
        u_raw = json.load(f)

    utterances = extract_list(u_raw)
    if not utterances:
        raise ValueError(
            "No utterances found. Expected utterances.json to be a list, or a dict containing a list under "
            "one of: utterances/items/segments/results/data."
        )

    unassigned = []
    multi_assigned = 0
    assigned_total = 0

    for u in utterances:
        text = get_utterance_text(u).strip()
        u_start, u_end = get_utterance_times(u)

        if u_start is None or u_end is None or not text:
            unassigned.append({"reason": "missing_text_or_time", "utterance": u})
            continue

        u_start = float(u_start)
        u_end = float(u_end)
        if u_end < u_start:
            u_start, u_end = u_end, u_start

        # Make point-in-time utterances half-open with tiny duration
        if u_end == u_start:
            u_end = u_start + 1e-6

        matched_indexes = []
        for i, (w0, w1) in enumerate(windows):
            if overlaps(u_start, u_end, w0, w1):
                matched_indexes.append(i)

        if not matched_indexes:
            # Fallback for degenerate boundary conditions.
            for i, (w0, w1) in enumerate(windows):
                eps = 1e-9
                if overlaps(u_start - eps, u_end + eps, w0, w1):
                    matched_indexes.append(i)

        if not matched_indexes:
            unassigned.append({"reason": "no_overlapping_keyframe_window", "utterance": u})
            continue

        # Keep indexes sorted and unique.
        matched_indexes = sorted(set(matched_indexes))

        if len(matched_indexes) > 1:
            multi_assigned += 1

        payload = dict(u)
        payload["_text"] = text
        payload["_start_sec"] = u_start
        payload["_end_sec"] = u_end
        payload["_overlaps_sorted_indexes"] = matched_indexes

        for idx in matched_indexes:
            payload2 = dict(payload)
            payload2["_assigned_sorted_index"] = idx
            payload2["_assigned_keyframe_idx"] = out_keyframes[idx].get("keyframe_idx")
            payload2["_assigned_t_sec"] = out_keyframes[idx].get("t_sec")
            out_keyframes[idx]["assigned_utterances"].append(payload2)
            assigned_total += 1

    # Sort utterances inside each keyframe by start time
    for kf in out_keyframes:
        kf["assigned_utterances"].sort(key=lambda x: float(x.get("_start_sec", 0.0) or 0.0))

    out = {
        "meta": {
            "keyframes_file": args.keyframes_json,
            "utterances_file": args.utterances_json,
            "keyframes_count": len(out_keyframes),
            "utterances_count": len(utterances),
            "assigned_total": assigned_total,  # counts duplicates if an utterance overlaps multiple keyframes
            "multi_assigned_utterances": multi_assigned,
            "unassigned_count": len(unassigned),
            "pre_roll_sec": pre_roll_sec,
            "window_strategy": (
                "pre-roll overlap windows: "
                "first [t_0, t_1), others [max(t_i-pre_roll_sec, t_{i-1}), t_{i+1}), "
                "last ends at +inf"
            ),
        },
        "keyframes": out_keyframes,
        "unassigned_utterances": unassigned,
    }

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"Done. Wrote: {args.out}")
    print(f"Keyframes: {len(out_keyframes)}")
    print(f"Utterances: {len(utterances)}")
    print(f"Assigned total (including duplicates): {assigned_total}")
    print(f"Utterances that overlapped multiple keyframes: {multi_assigned}")
    print(f"Unassigned utterances: {len(unassigned)}")


if __name__ == "__main__":
    main()
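The windowing above is easiest to see with concrete numbers. Below is a minimal, self-contained sketch (toy timestamps, not part of the committed file) showing how a 3-second pre-roll makes an utterance near a keyframe boundary belong to both neighboring keyframes:

# Toy illustration of the pre-roll window strategy (values are made up).
# Keyframes at t = 0, 60, 120 with pre_roll = 3 produce windows:
#   window 0: [0, 60)
#   window 1: [57, 120)   since max(60 - 3, 0) = 57
#   window 2: [117, inf)  since max(120 - 3, 60) = 117

def overlaps(a0, a1, b0, b1):
    # Same closed-open overlap predicate as in the script above.
    return max(a0, b0) < min(a1, b1)

t = [0.0, 60.0, 120.0]
pre_roll = 3.0
windows = []
for i, ti in enumerate(t):
    start = ti if i == 0 else max(ti - pre_roll, t[i - 1])
    end = t[i + 1] if i < len(t) - 1 else float("inf")
    windows.append((start, end))

# An utterance spanning [58, 62) overlaps BOTH window 0 and window 1,
# so it is assigned to both keyframes and counted as multi_assigned.
u_start, u_end = 58.0, 62.0
matched = [i for i, (w0, w1) in enumerate(windows) if overlaps(u_start, u_end, w0, w1)]
print(windows)  # [(0.0, 60.0), (57.0, 120.0), (117.0, inf)]
print(matched)  # [0, 1]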
pipelines/build_final_output.py
ADDED
@@ -0,0 +1,758 @@
# build_final_output.py
# Usage:
#   pip install google-genai pydantic python-dotenv
#   set GEMINI_API_KEY=...
#   python build_final_output.py ^
#     --keyframes "C:\meet-agent\out_folder\keyframes_with_utterances.json" ^
#     --out "C:\meet-agent\out_folder\final_output.json" ^
#     --model "gemini-2.5-flash"

import argparse
import json
import os
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from google import genai
from google.genai import types


# -----------------------------
# Helpers
# -----------------------------
def log(msg: str) -> None:
    print(msg, flush=True)


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(path: str, obj: Any) -> None:
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


def sec_to_hhmmss(t: float) -> str:
    t = max(0.0, float(t))
    hh = int(t // 3600)
    mm = int((t % 3600) // 60)
    ss = int(t % 60)
    return f"{hh:02d}:{mm:02d}:{ss:02d}"


def tokenize(s: str) -> List[str]:
    s = s.lower()
    s = re.sub(r"[^a-z0-9_]+", " ", s)
    toks = [t for t in s.split() if t]
    return toks


def jaccard_similarity(a: str, b: str) -> float:
    sa, sb = set(tokenize(a)), set(tokenize(b))
    if not sa and not sb:
        return 1.0
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / max(1, len(sa | sb))


def safe_join_text(lines: List[str], max_chars: int = 8000) -> str:
    """Join lines but prevent prompt bloat."""
    out = []
    total = 0
    for ln in lines:
        if total + len(ln) + 1 > max_chars:
            break
        out.append(ln)
        total += len(ln) + 1
    return "\n".join(out)


def frame_signature(frame: Optional[Dict[str, Any]]) -> str:
    """Build a signature string for similarity comparison to previous keyframe."""
    if not frame:
        return ""
    on_screen = frame.get("on_screen_text") or []
    screen_parse = frame.get("screen_parse") or {}
    screen_parse_text = summarize_screen_parse(screen_parse, max_regions=3, max_region_lines=6, max_ocr_lines=30, max_chars=2500)
    on_screen_small = safe_join_text(on_screen[:80], max_chars=2500)
    return f"{on_screen_small}\n{screen_parse_text}"


def diff_lists(prev: List[str], cur: List[str], max_items: int = 25) -> Tuple[List[str], List[str]]:
    prev_set, cur_set = set(prev), set(cur)
    added = [x for x in cur if x not in prev_set][:max_items]
    removed = [x for x in prev if x not in cur_set][:max_items]
    return added, removed


def summarize_screen_parse(
    screen_parse: Optional[Dict[str, Any]],
    max_regions: int = 8,
    max_region_lines: int = 12,
    max_ocr_lines: int = 120,
    max_chars: int = 9000,
) -> str:
    if not isinstance(screen_parse, dict) or not screen_parse:
        return "unknown"

    parts: List[str] = []
    frame_w = screen_parse.get("frame_w")
    frame_h = screen_parse.get("frame_h")
    if frame_w is not None and frame_h is not None:
        parts.append(f"frame_size: {frame_w}x{frame_h}")

    regions = screen_parse.get("layout_regions") or []
    if regions:
        region_lines: List[str] = []
        for i, region in enumerate(regions[:max_regions]):
            label = region.get("label", "unknown")
            conf = region.get("conf", "unknown")
            box = region.get("box", [])
            text_lines = region.get("text_lines") or []
            text_lines_clean = [str(x).strip() for x in text_lines if str(x).strip()][:max_region_lines]
            text_preview = " | ".join(text_lines_clean)
            region_lines.append(
                f"region[{i}] label={label}, conf={conf}, box={box}, text_lines={text_preview}"
            )
        parts.append("layout_regions:\n" + "\n".join(region_lines))

    ocr_lines = screen_parse.get("ocr_lines") or []
    if ocr_lines:
        ocr_text: List[str] = []
        for item in ocr_lines[:max_ocr_lines]:
            txt = str(item.get("text", "")).strip()
            if txt:
                ocr_text.append(txt)
        if ocr_text:
            parts.append("ocr_lines:\n" + safe_join_text(ocr_text, max_chars=max_chars))

    merged = "\n\n".join(parts).strip()
    if not merged:
        return "unknown"
    return merged[:max_chars]


def split_sentences(text: str) -> List[str]:
    if not text:
        return []
    parts = re.split(r"(?<=[.!?])\s+", str(text).strip())
    out = []
    for p in parts:
        p = p.strip()
        if p:
            out.append(p)
    return out


def build_content_change_summary(
    prev_content_summary: Optional[str],
    cur_content_summary: Optional[str],
    max_items: int = 6,
) -> str:
    prev = (prev_content_summary or "").strip()
    cur = (cur_content_summary or "").strip()
    if not prev:
        return "Initial keyframe in sequence; no previous content summary to diff against."
    if not cur:
        return "Current content summary is empty or unknown; unable to compute precise content diff."
    if prev == cur:
        return "No material content-summary change from the previous keyframe."

    prev_sentences = split_sentences(prev)
    cur_sentences = split_sentences(cur)
    prev_set = set(prev_sentences)
    cur_set = set(cur_sentences)

    added = [s for s in cur_sentences if s not in prev_set][:max_items]
    removed = [s for s in prev_sentences if s not in cur_set][:max_items]

    # If sentence-level diff fails (e.g., heavy rewrites), use token-level fallback.
    if not added and not removed:
        prev_tokens = set(tokenize(prev))
        cur_tokens = set(tokenize(cur))
        added_tokens = sorted(list(cur_tokens - prev_tokens))[:12]
        removed_tokens = sorted(list(prev_tokens - cur_tokens))[:12]
        if not added_tokens and not removed_tokens:
            return "Content summary wording changed but underlying content differences are unclear."
        out = []
        if added_tokens:
            out.append("Added/updated terms: " + ", ".join(added_tokens))
        if removed_tokens:
            out.append("Removed/de-emphasized terms: " + ", ".join(removed_tokens))
        return " ".join(out)

    chunks = []
    if added:
        chunks.append(
            "Added/updated in current content summary: "
            + " ; ".join(a[:240] for a in added)
        )
    if removed:
        chunks.append(
            "Removed/de-emphasized vs previous content summary: "
            + " ; ".join(r[:240] for r in removed)
        )
    return " ".join(chunks).strip()


def extract_speakers_from_utterances(utterances: List[Dict[str, Any]]) -> List[str]:
    """Unique speakers in order of first appearance."""
    seen = set()
    out = []
    for u in utterances or []:
        spk = str(u.get("speaker", "")).strip()
        if not spk:
            spk = "unknown"
        if spk not in seen:
            seen.add(spk)
            out.append(spk)
    return out


# -----------------------------
# Pydantic schema for Gemini
# -----------------------------
class FrameChange(BaseModel):
    changed_summary: str = Field(
        ...,
        description="Only the content-summary diff from previous keyframe to current keyframe.",
    )
    possible_reason: str = Field(
        ...,
        description="Why it could have happened (grounded in utterances/on-screen info; if unknown say unknown).",
    )
    added_elements: List[str] = Field(
        default_factory=list,
        description="Notable on-screen text elements that appeared (from diff).",
    )
    removed_elements: List[str] = Field(
        default_factory=list,
        description="Notable on-screen text elements that disappeared (from diff).",
    )


class FrameSummary(BaseModel):
    keyframe_idx: int
    frame_type: str
    t_sec: float
    timestamp: str
    image_path: str

    on_screen_text: List[str] = Field(default_factory=list)

    # NEW: all speakers present in this keyframe's utterances
    speakers: List[str] = Field(
        default_factory=list,
        description="Unique list of speakers who spoke during this keyframe (from assigned utterances).",
    )

    utterance_time_start: Optional[str] = None
    utterance_time_end: Optional[str] = None

    # UPDATED requirements: must explicitly mention speakers
    utterance_summary: str = Field(
        ...,
        description="Summary of utterances during this keyframe; must explicitly attribute statements to speakers.",
    )

    # More detailed
    content_summary: str = Field(
        ...,
        description="Detailed frame content summary grounded in frame_type, timestamp, on_screen_text, and screen_parse.",
    )

    # Combined synthesis
    combined_summary: str = Field(
        ...,
        description="Summary that combines utterance_summary and content_summary.",
    )

    # NEW: change summary for every keyframe transition (prev -> current). null for first keyframe.
    frame_change: Optional[FrameChange] = None

    similarity_to_prev: float = 0.0
    reused_prev_content: bool = False
    notes: List[str] = Field(default_factory=list)


class FinalOutput(BaseModel):
    meta: Dict[str, Any]
    keyframes: List[FrameSummary]


# -----------------------------
# History manager (diminishing returns)
# -----------------------------
@dataclass
class HistoryState:
    recent_frames: List[Dict[str, Any]]
    long_memory: str
    long_memory_max_chars: int = 4500

    def __init__(self):
        self.recent_frames = []
        self.long_memory = ""

    def add_frame(self, frame_summary_obj: Dict[str, Any], keep_recent: int = 4):
        self.recent_frames.append(frame_summary_obj)
        if len(self.recent_frames) > keep_recent:
            to_compress = self.recent_frames[:-keep_recent]
            self.recent_frames = self.recent_frames[-keep_recent:]
            return to_compress
        return []

    def build_history_context(self) -> str:
        parts = []
        if self.long_memory.strip():
            parts.append("LONG_MEMORY (old history, low weight):\n" + self.long_memory.strip())

        if self.recent_frames:
            parts.append("RECENT_HISTORY (high weight, most recent first):")
            for fr in reversed(self.recent_frames):
                parts.append(
                    f"- [{fr.get('timestamp','??')}] {fr.get('frame_type','?').upper()} "
                    f"combined_summary: {fr.get('combined_summary','')[:900]}"
                )
        return "\n".join(parts).strip()


# -----------------------------
# Gemini calls
# -----------------------------
def gemini_client() -> genai.Client:
    load_dotenv()

    api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Missing GEMINI_API_KEY in environment (.env not loaded or key not set).")

    return genai.Client(api_key=api_key)


def call_gemini_structured(
    client: genai.Client,
    model: str,
    system_instruction: str,
    user_prompt: str,
    schema_model: Any,
    temperature: float = 0.2,
    max_retries: int = 3,
) -> Any:
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.models.generate_content(
                model=model,
                contents=user_prompt,
                config=types.GenerateContentConfig(
                    system_instruction=system_instruction,
                    response_mime_type="application/json",
                    response_schema=schema_model,
                    temperature=temperature,
                ),
            )
            if getattr(resp, "parsed", None) is not None:
                return resp.parsed

            txt = getattr(resp, "text", None)
            if not txt:
                raise ValueError("Gemini returned no text/parsed output.")
            return json.loads(txt)
        except Exception as e:
            last_err = e
            time.sleep(0.7 * attempt)

    raise RuntimeError(f"Gemini structured call failed after retries: {last_err}")


def compress_into_long_memory(
    client: genai.Client,
    model: str,
    existing_long_memory: str,
    frames_to_compress: List[Dict[str, Any]],
    max_chars: int,
) -> str:
    if not frames_to_compress:
        return existing_long_memory

    bullets = []
    for fr in frames_to_compress:
        bullets.append(
            f"[{fr.get('timestamp','??')}][{fr.get('frame_type','?')}] "
            f"{fr.get('combined_summary','')[:500]}"
        )
    chunk = "\n".join(bullets)

    system = (
        "You compress meeting history. Output must be short, factual, and useful.\n"
        "Do not invent details. Prefer concrete technical points and transitions.\n"
        "Keep it under the requested character budget."
    )
    prompt = (
        f"Existing LONG_MEMORY (may be empty):\n{existing_long_memory}\n\n"
        f"New older frames to merge (older history):\n{chunk}\n\n"
        f"Task:\n"
        f"1) Merge them into LONG_MEMORY.\n"
        f"2) Keep the result <= {max_chars} characters.\n"
        f"3) Use bullet points.\n"
        f"Return ONLY plain text."
    )

    resp = client.models.generate_content(
        model=model,
        contents=prompt,
        config=types.GenerateContentConfig(
            system_instruction=system,
            temperature=0.2,
            max_output_tokens=800,
        ),
    )
    text = (getattr(resp, "text", "") or "").strip()
    if not text:
        merged = (existing_long_memory + "\n" + chunk).strip()
        return merged[:max_chars]
    return text[:max_chars]


# -----------------------------
# Core processing logic
# -----------------------------
def build_prompt_for_frame(
    frame: Dict[str, Any],
    history_context: str,
    prev_frame: Optional[Dict[str, Any]],
    prev_content_summary: Optional[str],
    similarity_to_prev: float,
    is_similar: bool,
    transition_diff: Optional[Dict[str, Any]],
) -> Tuple[str, str]:
    frame_type = (frame.get("frame_type") or "").lower()
    timestamp = frame.get("timestamp") or sec_to_hhmmss(frame.get("t_sec", 0.0))
    t_sec = float(frame.get("t_sec", 0.0))

    on_screen_text = frame.get("on_screen_text") or []
    screen_parse_summary = summarize_screen_parse(
        frame.get("screen_parse") or {},
        max_regions=8,
        max_region_lines=14,
        max_ocr_lines=140,
        max_chars=12000,
    )
    assigned_utterances = frame.get("assigned_utterances") or []
    speakers = extract_speakers_from_utterances(assigned_utterances)

    u_start_ts = None
    u_end_ts = None
    if assigned_utterances:
        u_start = min(float(u.get("_start_sec", u.get("start", t_sec))) for u in assigned_utterances)
        u_end = max(float(u.get("_end_sec", u.get("end", t_sec))) for u in assigned_utterances)
        u_start_ts = sec_to_hhmmss(u_start)
        u_end_ts = sec_to_hhmmss(u_end)

    utt_lines = []
    for u in assigned_utterances[:60]:
        s = float(u.get("_start_sec", u.get("start", 0.0)))
        e = float(u.get("_end_sec", u.get("end", 0.0)))
        spk = str(u.get("speaker", "unknown")).strip() or "unknown"
        txt = (u.get("text", "") or "").strip()
        utt_lines.append(f"[{sec_to_hhmmss(s)}-{sec_to_hhmmss(e)}][{spk}] {txt}")
    utterances_block = safe_join_text(utt_lines, max_chars=12000)

    reuse_instruction = ""
    if is_similar:
        reuse_instruction = (
            "IMPORTANT: This frame content is very similar to the previous keyframe.\n"
            "Do NOT repeat the entire explanation.\n"
            "Reuse prior context and focus on what is new.\n"
            "frame_change must still be filled if a previous keyframe exists.\n"
        )

    prev_block = ""
    prev_content_summary_block = "PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\nnone\n\n"
    if prev_frame is not None:
        prev_idx = prev_frame.get("keyframe_idx", -1)
        prev_ts = prev_frame.get("timestamp") or sec_to_hhmmss(prev_frame.get("t_sec", 0.0))
        prev_type = (prev_frame.get("frame_type") or "unknown").lower()
        prev_block = (
            "PREVIOUS_KEYFRAME:\n"
            f"- keyframe_idx: {prev_idx}\n"
            f"- frame_type: {prev_type}\n"
            f"- timestamp: {prev_ts}\n\n"
        )
        prev_content_summary_block = (
            "PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\n"
            f"{(prev_content_summary or 'unknown').strip()}\n\n"
        )

    transition_diff_block = ""
    if transition_diff is not None:
        transition_diff_block = (
            "KEYFRAME_TRANSITION_DIFF (computed from on_screen_text):\n"
            f"added_elements: {transition_diff.get('added_elements', [])}\n"
            f"removed_elements: {transition_diff.get('removed_elements', [])}\n\n"
        )

    system_instruction = (
        "You are generating time-aware meeting notes per keyframe.\n"
        "You must follow the provided schema exactly and return JSON only.\n"
        "Do not invent facts not present in the inputs.\n"
        "If something is unknown, say unknown.\n"
        "History has diminishing importance: RECENT_HISTORY is high weight, LONG_MEMORY is low weight.\n"
        "Speaker attribution is required for utterance summary.\n"
    )

    on_screen_capped = on_screen_text[:350]

    if frame_type == "slides":
        content_task = (
            "For slides:\n"
            "- content_summary must use frame_type + timestamp + on_screen_text + screen_parse.\n"
            "  Cover headings, bullets, numbers, claims, and relationships visible on screen.\n"
            "- combined_summary must combine utterance_summary + content_summary.\n"
        )
    elif frame_type == "code":
        content_task = (
            "For code:\n"
            "- content_summary must use frame_type + timestamp + on_screen_text + screen_parse.\n"
            "  Cover files/modules, functions/classes, logic, inputs/outputs, and config if visible.\n"
            "- combined_summary must combine utterance_summary + content_summary.\n"
        )
    else:
        content_task = (
            "For demo:\n"
            "- content_summary must use frame_type + timestamp + on_screen_text + screen_parse.\n"
            "  Cover screens, controls, state transitions, and resulting behavior.\n"
            "- combined_summary must combine utterance_summary + content_summary.\n"
        )

    output_rules = (
        "OUTPUT_RULES (must follow exactly):\n"
        "- Always populate: on_screen_text, speakers, utterance_summary, content_summary, combined_summary.\n"
        "- utterance_summary must use utterance timestamps + speaker + text provided.\n"
        "- content_summary must be grounded in frame_type + timestamp + on_screen_text + screen_parse.\n"
        "- combined_summary must summarize utterance_summary and content_summary.\n"
        "- If previous keyframe exists, frame_change must be present.\n"
        "  - changed_summary must be only the difference between previous and current content_summary.\n"
        "  - possible_reason remains grounded in utterances/on-screen evidence; else unknown.\n"
        "  - added_elements and removed_elements must use provided diff lists.\n"
        "- If no previous keyframe exists, frame_change must be null.\n"
    )

    user_prompt = (
        f"{prev_block}"
        f"CURRENT_KEYFRAME:\n"
        f"- keyframe_idx: {frame.get('keyframe_idx')}\n"
        f"- frame_type: {frame_type}\n"
        f"- t_sec: {t_sec}\n"
        f"- timestamp: {timestamp}\n"
        f"- image_path: {frame.get('image_path')}\n"
        f"- similarity_to_prev: {similarity_to_prev:.3f}\n"
        f"- detected_speakers: {speakers}\n"
        f"- utterance_time_range: {u_start_ts}-{u_end_ts}\n\n"
        f"ON_SCREEN_TEXT (list):\n{on_screen_capped}\n\n"
        f"SCREEN_PARSE (structured parse of current frame):\n{screen_parse_summary}\n\n"
        f"ASSIGNED_UTTERANCES (time-stamped, includes speaker):\n{utterances_block}\n\n"
        f"{transition_diff_block}"
        f"{prev_content_summary_block}"
        f"HISTORY_CONTEXT:\n{history_context}\n\n"
        f"{output_rules}\n\n"
        f"{reuse_instruction}\n"
        f"{content_task}\n"
        f"Now produce the JSON output for this keyframe following the schema."
    )

    return system_instruction, user_prompt


def keyframe_items(keyframes_data: Any) -> List[Dict[str, Any]]:
    if isinstance(keyframes_data, dict):
        return keyframes_data.get("keyframes", []) or []
    if isinstance(keyframes_data, list):
        return keyframes_data
    return []


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--keyframes", required=True, help="Path to keyframes_with_utterances.json")
    ap.add_argument("--out", required=True, help="Output path for final JSON")
    ap.add_argument("--model", default="gemini-2.5-flash", help="Gemini model id")
    ap.add_argument("--similarity_threshold", type=float, default=0.82, help="Similarity threshold for 'reuse prev content'")
    ap.add_argument("--temperature", type=float, default=0.2)
    args = ap.parse_args()

    log("Starting build_final_output.py ...")
    log(f"Keyframes file: {args.keyframes}")
    log(f"Output file: {args.out}")
    log(f"Model: {args.model}")

    keyframes_data = load_json(args.keyframes)
    keyframes_list = keyframe_items(keyframes_data)
    if not keyframes_list:
        raise ValueError("No keyframes found in input keyframes file.")

    # Process keyframes in chronological order.
    keyframes_list = sorted(
        keyframes_list,
        key=lambda x: (
            float(x.get("t_sec", 0.0)),
            int(x.get("keyframe_idx", 0)),
        ),
    )

    log(f"Loaded keyframes: {len(keyframes_list)}")

    log("Initializing Gemini client (loading .env + API key)...")
    client = gemini_client()
    log("Gemini client ready.")

    output = {
        "meta": {
            "keyframes_file": args.keyframes,
            "model": args.model,
            "generated_at_epoch": time.time(),
            "rules": {
                "process_order": "keyframes in chronological order",
                "history": "recent detailed + long_memory compressed (diminishing returns)",
                "similarity_threshold": args.similarity_threshold,
                "transition_change_each_keyframe": True,
                "speakers_per_keyframe": True,
                "utterance_summary_requires_speaker_attribution": True,
                "content_summary_uses_screen_parse": True,
                "combined_summary_synthesizes_utterance_and_content": True,
                "change_summary_is_content_diff": True,
            },
        },
        "keyframes": [],
    }

    history_state = HistoryState()

    prev_frame_obj: Optional[Dict[str, Any]] = None
    prev_frame_summary: Optional[Dict[str, Any]] = None

    global_kf_done = 0
    global_kf_total = len(keyframes_list)
    log(f"Total keyframes to process: {global_kf_total}")

    for frame in keyframes_list:
        global_kf_done += 1
        kf_idx = frame.get("keyframe_idx")
        kf_ts = frame.get("timestamp") or sec_to_hhmmss(frame.get("t_sec", 0.0))
        kf_type = (frame.get("frame_type") or "unknown").lower()
        utt_count = len(frame.get("assigned_utterances") or [])
        log(f"[{global_kf_done}/{global_kf_total}] Keyframe {kf_idx} @ {kf_ts} | type={kf_type} | utterances={utt_count}")

        sig_cur = frame_signature(frame)
        sig_prev = frame_signature(prev_frame_obj)
        sim = jaccard_similarity(sig_prev, sig_cur) if prev_frame_obj else 0.0
        is_similar = (prev_frame_obj is not None) and (sim >= args.similarity_threshold)
        log(f"  similarity_to_prev={sim:.3f} | reused_prev_content={is_similar}")

        transition_diff = None
        if prev_frame_obj is not None:
            prev_text = (prev_frame_obj.get("on_screen_text") or [])
            cur_text = (frame.get("on_screen_text") or [])
            added, removed = diff_lists(prev_text, cur_text, max_items=40)
            transition_diff = {"added_elements": added, "removed_elements": removed}

        history_context = history_state.build_history_context()

        system_instruction, user_prompt = build_prompt_for_frame(
            frame=frame,
            history_context=history_context,
            prev_frame=prev_frame_obj,
            prev_content_summary=(prev_frame_summary or {}).get("content_summary"),
            similarity_to_prev=sim,
            is_similar=is_similar,
            transition_diff=transition_diff,
        )

        log("  -> Calling Gemini ...")
        t_call = time.time()
        parsed = call_gemini_structured(
            client=client,
            model=args.model,
            system_instruction=system_instruction,
            user_prompt=user_prompt,
            schema_model=FrameSummary,
            temperature=args.temperature,
            max_retries=3,
        )
        log(f"  <- Gemini done in {time.time() - t_call:.1f}s")

        if isinstance(parsed, BaseModel):
            parsed_dict = parsed.model_dump()
        else:
            parsed_dict = dict(parsed)

        parsed_dict["similarity_to_prev"] = float(sim)
        parsed_dict["reused_prev_content"] = bool(is_similar)
        if "notes" not in parsed_dict:
            parsed_dict["notes"] = []
        if is_similar:
            parsed_dict["notes"].append("High similarity to previous keyframe; instructed incremental update.")
        if prev_frame_summary is not None:
            parsed_dict["notes"].append("Keyframe-to-keyframe transition diff computed and provided (frame_change required).")

        # Enforce change summary as strict diff of previous vs current content_summary.
        if prev_frame_summary is None:
            parsed_dict["frame_change"] = None
        else:
            prev_content_summary = (prev_frame_summary or {}).get("content_summary")
            current_content_summary = parsed_dict.get("content_summary")
            existing_change = parsed_dict.get("frame_change") or {}
            if not isinstance(existing_change, dict):
                existing_change = {}
            existing_change["changed_summary"] = build_content_change_summary(
                prev_content_summary=prev_content_summary,
                cur_content_summary=current_content_summary,
            )
            existing_change["possible_reason"] = str(existing_change.get("possible_reason", "")).strip() or "unknown"
            existing_change["added_elements"] = (transition_diff or {}).get("added_elements", [])
            existing_change["removed_elements"] = (transition_diff or {}).get("removed_elements", [])
            parsed_dict["frame_change"] = existing_change

        output["keyframes"].append(parsed_dict)

        to_compress = history_state.add_frame(
            frame_summary_obj={
                "timestamp": parsed_dict.get("timestamp"),
                "frame_type": parsed_dict.get("frame_type"),
                "combined_summary": parsed_dict.get("combined_summary", ""),
            },
            keep_recent=4,
        )
        if to_compress:
            log(f"  -> Compressing {len(to_compress)} older frame(s) into LONG_MEMORY ...")
            history_state.long_memory = compress_into_long_memory(
                client=client,
                model=args.model,
                existing_long_memory=history_state.long_memory,
                frames_to_compress=to_compress,
                max_chars=history_state.long_memory_max_chars,
            )
            log("  <- LONG_MEMORY updated.")

        prev_frame_obj = frame
        prev_frame_summary = parsed_dict

    log("\nAll keyframes processed. Writing output JSON ...")
    save_json(args.out, output)
    log(f"Done. Wrote: {args.out}")


if __name__ == "__main__":
    main()
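The "reuse previous content" gate in build_final_output.py is a token-level Jaccard comparison of consecutive frame signatures against --similarity_threshold (default 0.82). A minimal, self-contained sketch of that decision (the OCR strings below are invented for illustration and are not from a real run):

import re

def tokenize(s):
    # Same normalization as the script: lowercase, keep [a-z0-9_] runs.
    return [t for t in re.sub(r"[^a-z0-9_]+", " ", s.lower()).split() if t]

def jaccard_similarity(a, b):
    sa, sb = set(tokenize(a)), set(tokenize(b))
    if not sa and not sb:
        return 1.0
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / max(1, len(sa | sb))

# Hypothetical signatures from two consecutive keyframes.
prev_sig = "Q3 Roadmap | latency targets | deploy pipeline overview"
cur_sig = "Q3 Roadmap | latency targets | deploy pipeline overview | rollout"

sim = jaccard_similarity(prev_sig, cur_sig)  # 7 shared tokens / 8 total = 0.875
is_similar = sim >= 0.82  # same default as --similarity_threshold
print(f"{sim:.3f} reuse={is_similar}")  # 0.875 reuse=True -> prompt asks for an incremental update only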
pipelines/build_final_output_demo_code.py
ADDED
@@ -0,0 +1,549 @@
#!/usr/bin/env python3
"""
Demo-only Gemini build stage (kept in demo-code route for compatibility).

Behavior:
- `demo` keyframes: summarized with Gemini.
- `slides`, `code`, and `none` keyframes: NO Gemini call; output is built from OCR + utterances.
"""

from __future__ import annotations

import argparse
import json
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from google import genai
from google.genai import types


def log(msg: str) -> None:
    print(msg, flush=True)


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(path: str, obj: Any) -> None:
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


def sec_to_hhmmss(t: float) -> str:
    t = max(0.0, float(t))
    hh = int(t // 3600)
    mm = int((t % 3600) // 60)
    ss = int(t % 60)
    return f"{hh:02d}:{mm:02d}:{ss:02d}"


def tokenize(s: str) -> List[str]:
    s = s.lower()
    s = re.sub(r"[^a-z0-9_]+", " ", s)
    return [t for t in s.split() if t]


def jaccard_similarity(a: str, b: str) -> float:
    sa, sb = set(tokenize(a)), set(tokenize(b))
    if not sa and not sb:
        return 1.0
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / max(1, len(sa | sb))


def safe_join_text(lines: List[str], max_chars: int = 8000) -> str:
    out = []
    total = 0
    for ln in lines:
        if total + len(ln) + 1 > max_chars:
            break
        out.append(ln)
        total += len(ln) + 1
    return "\n".join(out)


def split_sentences(text: str) -> List[str]:
    if not text:
        return []
    parts = re.split(r"(?<=[.!?])\s+", str(text).strip())
    return [p.strip() for p in parts if p.strip()]


def build_content_change_summary(
    prev_content_summary: Optional[str],
    cur_content_summary: Optional[str],
    max_items: int = 6,
) -> str:
    prev = (prev_content_summary or "").strip()
    cur = (cur_content_summary or "").strip()
    if not prev:
        return "Initial keyframe in sequence; no previous content summary to diff against."
    if not cur:
        return "Current content summary is empty or unknown; unable to compute precise content diff."
    if prev == cur:
        return "No material content-summary change from the previous keyframe."

    prev_sentences = split_sentences(prev)
    cur_sentences = split_sentences(cur)
    prev_set = set(prev_sentences)
    cur_set = set(cur_sentences)

    added = [s for s in cur_sentences if s not in prev_set][:max_items]
    removed = [s for s in prev_sentences if s not in cur_set][:max_items]

    if not added and not removed:
        prev_tokens = set(tokenize(prev))
        cur_tokens = set(tokenize(cur))
        added_tokens = sorted(list(cur_tokens - prev_tokens))[:12]
        removed_tokens = sorted(list(prev_tokens - cur_tokens))[:12]
        if not added_tokens and not removed_tokens:
            return "Content summary wording changed but underlying content differences are unclear."
        out = []
        if added_tokens:
            out.append("Added/updated terms: " + ", ".join(added_tokens))
        if removed_tokens:
            out.append("Removed/de-emphasized terms: " + ", ".join(removed_tokens))
        return " ".join(out)

    chunks = []
    if added:
        chunks.append(
            "Added/updated in current content summary: "
            + " ; ".join(a[:240] for a in added)
        )
    if removed:
        chunks.append(
            "Removed/de-emphasized vs previous content summary: "
            + " ; ".join(r[:240] for r in removed)
        )
    return " ".join(chunks).strip()


def frame_signature(frame: Optional[Dict[str, Any]]) -> str:
    if not frame:
        return ""
    on_screen = frame.get("on_screen_text") or []
    return safe_join_text([str(x) for x in on_screen[:120]], max_chars=3000)


def diff_lists(prev: List[str], cur: List[str], max_items: int = 25) -> Tuple[List[str], List[str]]:
    prev_set, cur_set = set(prev), set(cur)
    added = [x for x in cur if x not in prev_set][:max_items]
    removed = [x for x in prev if x not in cur_set][:max_items]
    return added, removed


def summarize_screen_parse(
    screen_parse: Optional[Dict[str, Any]],
    max_regions: int = 8,
    max_region_lines: int = 12,
    max_ocr_lines: int = 120,
    max_chars: int = 9000,
) -> str:
    if not isinstance(screen_parse, dict) or not screen_parse:
        return "unknown"

    parts: List[str] = []
    frame_w = screen_parse.get("frame_w")
    frame_h = screen_parse.get("frame_h")
    if frame_w is not None and frame_h is not None:
        parts.append(f"frame_size: {frame_w}x{frame_h}")

    regions = screen_parse.get("layout_regions") or []
    if regions:
        region_lines: List[str] = []
        for i, region in enumerate(regions[:max_regions]):
            label = region.get("label", "unknown")
            conf = region.get("conf", "unknown")
            box = region.get("box", [])
            text_lines = region.get("text_lines") or []
            text_lines_clean = [str(x).strip() for x in text_lines if str(x).strip()][:max_region_lines]
            text_preview = " | ".join(text_lines_clean)
            region_lines.append(
                f"region[{i}] label={label}, conf={conf}, box={box}, text_lines={text_preview}"
            )
        parts.append("layout_regions:\n" + "\n".join(region_lines))

    ocr_lines = screen_parse.get("ocr_lines") or []
    if ocr_lines:
        ocr_text: List[str] = []
        for item in ocr_lines[:max_ocr_lines]:
            txt = str(item.get("text", "")).strip()
            if txt:
                ocr_text.append(txt)
        if ocr_text:
            parts.append("ocr_lines:\n" + safe_join_text(ocr_text, max_chars=max_chars))

    merged = "\n\n".join(parts).strip()
    if not merged:
        return "unknown"
    return merged[:max_chars]


def extract_speakers_from_utterances(utterances: List[Dict[str, Any]]) -> List[str]:
    seen = set()
    out = []
    for u in utterances or []:
        spk = str(u.get("speaker", "")).strip() or "unknown"
        if spk not in seen:
            seen.add(spk)
            out.append(spk)
    return out


def utterance_time_bounds(utterances: List[Dict[str, Any]], default_t: float) -> Tuple[Optional[str], Optional[str]]:
    if not utterances:
        return None, None
    starts = []
    ends = []
    for u in utterances:
        try:
            starts.append(float(u.get("_start_sec", u.get("start", default_t))))
            ends.append(float(u.get("_end_sec", u.get("end", default_t))))
        except Exception:
            continue
    if not starts or not ends:
        return None, None
    return sec_to_hhmmss(min(starts)), sec_to_hhmmss(max(ends))


def build_utterance_lines(utterances: List[Dict[str, Any]], max_lines: int = 80) -> List[str]:
    lines: List[str] = []
    for u in utterances[:max_lines]:
        try:
            s = float(u.get("_start_sec", u.get("start", 0.0)))
            e = float(u.get("_end_sec", u.get("end", 0.0)))
        except Exception:
            s, e = 0.0, 0.0
        spk = str(u.get("speaker", "unknown")).strip() or "unknown"
        txt = (u.get("text", "") or "").strip()
        if not txt:
            continue
        lines.append(f"[{sec_to_hhmmss(s)}-{sec_to_hhmmss(e)}][{spk}] {txt}")
    return lines


def local_summary_for_non_demo(frame: Dict[str, Any]) -> Dict[str, str]:
    frame_type = str(frame.get("frame_type", "unknown")).lower()
    ocr_lines = [str(x).strip() for x in (frame.get("on_screen_text") or []) if str(x).strip()]
    utter_lines = build_utterance_lines(frame.get("assigned_utterances") or [], max_lines=20)

    if utter_lines:
        utterance_summary = " | ".join(utter_lines[:8])
    else:
        utterance_summary = "No assigned utterances for this keyframe."

    if ocr_lines:
        content_summary = (
            f"{frame_type.upper()} keyframe. OCR extracted on-screen text (top lines): "
            + " | ".join(ocr_lines[:25])
        )
    else:
        content_summary = f"{frame_type.upper()} keyframe. OCR text not available."

    combined_summary = (
        f"Local (no Gemini) summary for {frame_type} frame. "
        f"Utterances: {utterance_summary} "
        f"Content: {content_summary}"
    )

    return {
        "utterance_summary": utterance_summary,
        "content_summary": content_summary,
        "combined_summary": combined_summary,
    }


class DemoGeminiSummary(BaseModel):
    utterance_summary: str = Field(
        ...,
        description="Summary of utterances for this frame with explicit speaker attribution where available.",
    )
    content_summary: str = Field(
        ...,
        description="Detailed description of what changed or is shown in this demo frame.",
    )
    combined_summary: str = Field(
        ...,
        description="Combined summary merging utterances and visual content.",
    )


def gemini_client() -> genai.Client:
    load_dotenv()
    api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("Missing GEMINI_API_KEY in environment (.env not loaded or key not set).")
    return genai.Client(api_key=api_key)


def call_gemini_structured(
    client: genai.Client,
    model: str,
    system_instruction: str,
    user_prompt: str,
    schema_model: Any,
    temperature: float = 0.2,
    max_retries: int = 3,
) -> Any:
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.models.generate_content(
                model=model,
                contents=user_prompt,
                config=types.GenerateContentConfig(
                    system_instruction=system_instruction,
                    response_mime_type="application/json",
                    response_schema=schema_model,
                    temperature=temperature,
                ),
            )
            if getattr(resp, "parsed", None) is not None:
                return resp.parsed
            txt = getattr(resp, "text", None)
            if not txt:
                raise ValueError("Gemini returned no text/parsed output.")
            return json.loads(txt)
        except Exception as e:
            last_err = e
            time.sleep(0.7 * attempt)
    raise RuntimeError(f"Gemini structured call failed after retries: {last_err}")


def build_demo_prompt(
    frame: Dict[str, Any],
    prev_content_summary: Optional[str],
    similarity_to_prev: float,
    is_similar: bool,
) -> Tuple[str, str]:
    frame_type = str(frame.get("frame_type", "unknown")).lower()
    timestamp = frame.get("timestamp") or sec_to_hhmmss(frame.get("t_sec", 0.0))
    t_sec = float(frame.get("t_sec", 0.0))
    on_screen_text = frame.get("on_screen_text") or []
    screen_parse_summary = summarize_screen_parse(frame.get("screen_parse") or {})
    utterances_block = safe_join_text(
|
| 337 |
+
build_utterance_lines(frame.get("assigned_utterances") or [], max_lines=80),
|
| 338 |
+
max_chars=12000,
|
| 339 |
+
)
|
| 340 |
+
reuse_instruction = ""
|
| 341 |
+
if is_similar:
|
| 342 |
+
reuse_instruction = (
|
| 343 |
+
"Frame is highly similar to previous keyframe. Reuse context and focus on what changed.\n"
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
prev_block = "PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\nnone\n"
|
| 347 |
+
if prev_content_summary:
|
| 348 |
+
prev_block = f"PREVIOUS_KEYFRAME_CONTENT_SUMMARY:\n{prev_content_summary}\n"
|
| 349 |
+
|
| 350 |
+
system_instruction = (
|
| 351 |
+
"You generate keyframe-level meeting notes for demo screens only.\n"
|
| 352 |
+
"Ground all claims in provided utterances and OCR/screen parse.\n"
|
| 353 |
+
"Do not invent facts.\n"
|
| 354 |
+
"Return strict JSON only following schema."
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
user_prompt = (
|
| 358 |
+
f"CURRENT_KEYFRAME:\n"
|
| 359 |
+
f"- frame_type: {frame_type}\n"
|
| 360 |
+
f"- keyframe_idx: {frame.get('keyframe_idx')}\n"
|
| 361 |
+
f"- t_sec: {t_sec}\n"
|
| 362 |
+
f"- timestamp: {timestamp}\n"
|
| 363 |
+
f"- image_path: {frame.get('image_path')}\n"
|
| 364 |
+
f"- similarity_to_prev: {similarity_to_prev:.3f}\n\n"
|
| 365 |
+
f"ON_SCREEN_TEXT:\n{on_screen_text[:350]}\n\n"
|
| 366 |
+
f"SCREEN_PARSE:\n{screen_parse_summary}\n\n"
|
| 367 |
+
f"ASSIGNED_UTTERANCES:\n{utterances_block}\n\n"
|
| 368 |
+
f"{prev_block}\n"
|
| 369 |
+
f"{reuse_instruction}\n"
|
| 370 |
+
f"Requirements:\n"
|
| 371 |
+
f"- utterance_summary: attribute statements to speakers when present.\n"
|
| 372 |
+
f"- content_summary: describe what is visible/changed in this frame.\n"
|
| 373 |
+
f"- combined_summary: merge utterance + visual context.\n"
|
| 374 |
+
)
|
| 375 |
+
return system_instruction, user_prompt
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def keyframe_items(keyframes_data: Any) -> List[Dict[str, Any]]:
|
| 379 |
+
if isinstance(keyframes_data, dict):
|
| 380 |
+
return keyframes_data.get("keyframes", []) or []
|
| 381 |
+
if isinstance(keyframes_data, list):
|
| 382 |
+
return keyframes_data
|
| 383 |
+
return []
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def main() -> None:
|
| 387 |
+
ap = argparse.ArgumentParser()
|
| 388 |
+
ap.add_argument("--keyframes", required=True, help="Path to keyframes_with_utterances.json")
|
| 389 |
+
ap.add_argument("--out", required=True, help="Output path for final JSON")
|
| 390 |
+
ap.add_argument("--model", default="gemini-2.5-flash", help="Gemini model id")
|
| 391 |
+
ap.add_argument("--similarity-threshold", type=float, default=0.82)
|
| 392 |
+
ap.add_argument("--temperature", type=float, default=0.2)
|
| 393 |
+
args = ap.parse_args()
|
| 394 |
+
|
| 395 |
+
keyframes_data = load_json(args.keyframes)
|
| 396 |
+
keyframes_list = keyframe_items(keyframes_data)
|
| 397 |
+
if not keyframes_list:
|
| 398 |
+
raise ValueError("No keyframes found in input keyframes file.")
|
| 399 |
+
|
| 400 |
+
keyframes_list = sorted(
|
| 401 |
+
keyframes_list,
|
| 402 |
+
key=lambda x: (float(x.get("t_sec", 0.0)), int(x.get("keyframe_idx", 0))),
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
demo_count = sum(1 for kf in keyframes_list if str(kf.get("frame_type", "")).lower() == "demo")
|
| 406 |
+
code_count = sum(1 for kf in keyframes_list if str(kf.get("frame_type", "")).lower() == "code")
|
| 407 |
+
gemini_target_count = demo_count
|
| 408 |
+
local_only_count = len(keyframes_list) - gemini_target_count
|
| 409 |
+
log(
|
| 410 |
+
f"Loaded keyframes: total={len(keyframes_list)} demo={demo_count} "
|
| 411 |
+
f"code={code_count} local_only={local_only_count}"
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
client: Optional[genai.Client] = None
|
| 415 |
+
if gemini_target_count > 0:
|
| 416 |
+
log("Initializing Gemini client (demo frames only)...")
|
| 417 |
+
client = gemini_client()
|
| 418 |
+
log("Gemini client ready.")
|
| 419 |
+
|
| 420 |
+
output: Dict[str, Any] = {
|
| 421 |
+
"meta": {
|
| 422 |
+
"keyframes_file": args.keyframes,
|
| 423 |
+
"model": args.model,
|
| 424 |
+
"generated_at_epoch": time.time(),
|
| 425 |
+
"rules": {
|
| 426 |
+
"demo_frames_use_gemini": True,
|
| 427 |
+
"slides_code_none_use_local_ocr_only": True,
|
| 428 |
+
"similarity_threshold": args.similarity_threshold,
|
| 429 |
+
"frame_change_is_deterministic_content_diff": True,
|
| 430 |
+
},
|
| 431 |
+
"counts": {
|
| 432 |
+
"total_keyframes": len(keyframes_list),
|
| 433 |
+
"demo_keyframes": demo_count,
|
| 434 |
+
"code_keyframes": code_count,
|
| 435 |
+
"gemini_keyframes": gemini_target_count,
|
| 436 |
+
"local_only_keyframes": local_only_count,
|
| 437 |
+
"gemini_calls": 0,
|
| 438 |
+
},
|
| 439 |
+
},
|
| 440 |
+
"keyframes": [],
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
prev_frame_obj: Optional[Dict[str, Any]] = None
|
| 444 |
+
prev_content_summary: Optional[str] = None
|
| 445 |
+
|
| 446 |
+
for idx, frame in enumerate(keyframes_list, start=1):
|
| 447 |
+
frame_type = str(frame.get("frame_type", "unknown")).lower()
|
| 448 |
+
t_sec = float(frame.get("t_sec", 0.0))
|
| 449 |
+
timestamp = frame.get("timestamp") or sec_to_hhmmss(t_sec)
|
| 450 |
+
on_screen_text = [str(x).strip() for x in (frame.get("on_screen_text") or []) if str(x).strip()]
|
| 451 |
+
assigned_utterances = frame.get("assigned_utterances") or []
|
| 452 |
+
speakers = extract_speakers_from_utterances(assigned_utterances)
|
| 453 |
+
utt_start_ts, utt_end_ts = utterance_time_bounds(assigned_utterances, default_t=t_sec)
|
| 454 |
+
|
| 455 |
+
sim = 0.0
|
| 456 |
+
is_similar = False
|
| 457 |
+
if prev_frame_obj is not None:
|
| 458 |
+
sim = jaccard_similarity(frame_signature(prev_frame_obj), frame_signature(frame))
|
| 459 |
+
is_similar = sim >= float(args.similarity_threshold)
|
| 460 |
+
|
| 461 |
+
log(
|
| 462 |
+
f"[{idx}/{len(keyframes_list)}] keyframe={frame.get('keyframe_idx')} "
|
| 463 |
+
f"type={frame_type} time={timestamp} similarity={sim:.3f}"
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
if frame_type == "demo":
|
| 467 |
+
if client is None:
|
| 468 |
+
raise RuntimeError("Internal error: demo frame encountered but Gemini client is not initialized.")
|
| 469 |
+
system_instruction, user_prompt = build_demo_prompt(
|
| 470 |
+
frame=frame,
|
| 471 |
+
prev_content_summary=prev_content_summary,
|
| 472 |
+
similarity_to_prev=sim,
|
| 473 |
+
is_similar=is_similar,
|
| 474 |
+
)
|
| 475 |
+
t0 = time.time()
|
| 476 |
+
parsed = call_gemini_structured(
|
| 477 |
+
client=client,
|
| 478 |
+
model=args.model,
|
| 479 |
+
system_instruction=system_instruction,
|
| 480 |
+
user_prompt=user_prompt,
|
| 481 |
+
schema_model=DemoGeminiSummary,
|
| 482 |
+
temperature=args.temperature,
|
| 483 |
+
max_retries=3,
|
| 484 |
+
)
|
| 485 |
+
log(f" Gemini done in {time.time() - t0:.1f}s")
|
| 486 |
+
output["meta"]["counts"]["gemini_calls"] += 1
|
| 487 |
+
if isinstance(parsed, BaseModel):
|
| 488 |
+
summary_payload = parsed.model_dump()
|
| 489 |
+
else:
|
| 490 |
+
summary_payload = dict(parsed)
|
| 491 |
+
summary_source = "gemini_demo_only"
|
| 492 |
+
else:
|
| 493 |
+
summary_payload = local_summary_for_non_demo(frame)
|
| 494 |
+
summary_source = "local_ocr_only"
|
| 495 |
+
|
| 496 |
+
transition_diff = {"added_elements": [], "removed_elements": []}
|
| 497 |
+
if prev_frame_obj is not None:
|
| 498 |
+
prev_text = [str(x).strip() for x in (prev_frame_obj.get("on_screen_text") or []) if str(x).strip()]
|
| 499 |
+
cur_text = on_screen_text
|
| 500 |
+
added, removed = diff_lists(prev_text, cur_text, max_items=40)
|
| 501 |
+
transition_diff = {"added_elements": added, "removed_elements": removed}
|
| 502 |
+
|
| 503 |
+
frame_change = None
|
| 504 |
+
if prev_content_summary is not None:
|
| 505 |
+
frame_change = {
|
| 506 |
+
"changed_summary": build_content_change_summary(
|
| 507 |
+
prev_content_summary=prev_content_summary,
|
| 508 |
+
cur_content_summary=summary_payload.get("content_summary"),
|
| 509 |
+
),
|
| 510 |
+
"possible_reason": (
|
| 511 |
+
"Computed from keyframe OCR and utterance differences; no transition LLM call used."
|
| 512 |
+
),
|
| 513 |
+
"added_elements": transition_diff["added_elements"],
|
| 514 |
+
"removed_elements": transition_diff["removed_elements"],
|
| 515 |
+
}
|
| 516 |
+
|
| 517 |
+
out_frame = {
|
| 518 |
+
"keyframe_idx": int(frame.get("keyframe_idx", idx - 1)),
|
| 519 |
+
"frame_type": frame_type,
|
| 520 |
+
"t_sec": t_sec,
|
| 521 |
+
"timestamp": timestamp,
|
| 522 |
+
"image_path": str(frame.get("image_path", "")),
|
| 523 |
+
"on_screen_text": on_screen_text[:400],
|
| 524 |
+
"speakers": speakers,
|
| 525 |
+
"utterance_time_start": utt_start_ts,
|
| 526 |
+
"utterance_time_end": utt_end_ts,
|
| 527 |
+
"utterance_summary": str(summary_payload.get("utterance_summary", "")).strip(),
|
| 528 |
+
"content_summary": str(summary_payload.get("content_summary", "")).strip(),
|
| 529 |
+
"combined_summary": str(summary_payload.get("combined_summary", "")).strip(),
|
| 530 |
+
"frame_change": frame_change,
|
| 531 |
+
"similarity_to_prev": float(sim),
|
| 532 |
+
"reused_prev_content": bool(is_similar and frame_type == "demo"),
|
| 533 |
+
"notes": [
|
| 534 |
+
f"summary_source={summary_source}",
|
| 535 |
+
"Only demo keyframes are sent to Gemini in this pipeline.",
|
| 536 |
+
],
|
| 537 |
+
}
|
| 538 |
+
|
| 539 |
+
output["keyframes"].append(out_frame)
|
| 540 |
+
prev_frame_obj = frame
|
| 541 |
+
prev_content_summary = out_frame.get("content_summary")
|
| 542 |
+
|
| 543 |
+
save_json(args.out, output)
|
| 544 |
+
log(f"Done. Wrote: {args.out}")
|
| 545 |
+
log(f"Gemini calls made: {output['meta']['counts']['gemini_calls']}")
|
| 546 |
+
|
| 547 |
+
|
| 548 |
+
if __name__ == "__main__":
|
| 549 |
+
main()
|
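The argparse block in main() above fully defines this script's CLI. As a quick orientation, a standalone run of the build step looks like the following (the out_folder paths are illustrative, not taken from this commit):

python pipelines/build_final_output_demo_code.py \
    --keyframes out_folder/keyframes_with_utterances.json \
    --out out_folder/final_output_demo_code.json \
    --model gemini-2.5-flash --similarity-threshold 0.82 --temperature 0.2

run_pipeline_demo_code.py, later in this commit, issues the same invocation with the artifact paths it manages.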
pipelines/condense_final_output.py
ADDED
@@ -0,0 +1,145 @@
# condense_final_output.py
# Usage:
#   python condense_final_output.py --in "C:\meet-agent\out_folder\final_output.json" --out "C:\meet-agent\out_folder\final_output_condensed.json"
#
# What it does:
# - Reads the "final_output.json" produced by your build script
# - Produces a condensed version with only:
#     - keyframe (idx, timestamp, type, t_sec, image_path)
#     - combined_summary
#     - changed_summary (from transition_change/frame_change/demo_change if present)
# - Supports both input schemas:
#     1) new: {"meta": ..., "keyframes": [...]}
#     2) old: {"meta": ..., "topics": [{"keyframes": [...]}]}

import argparse
import json
import os
from typing import Any, Dict, Optional


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_json(path: str, obj: Any) -> None:
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)


def pick_changed_summary(kf: Dict[str, Any]) -> Optional[str]:
    """
    Tries multiple locations, because your schema may store change summaries under different keys
    depending on how you implemented transitions.

    Priority order:
      1) transition_change.changed_summary
      2) frame_change.changed_summary
      3) demo_change.changed_summary
      4) changed_summary at root (fallback)
    """
    for container_key in ("transition_change", "frame_change", "demo_change"):
        container = kf.get(container_key)
        if isinstance(container, dict):
            cs = container.get("changed_summary")
            if isinstance(cs, str) and cs.strip():
                return cs.strip()

    cs_root = kf.get("changed_summary")
    if isinstance(cs_root, str) and cs_root.strip():
        return cs_root.strip()

    return None


def condense_keyframe(kf: Dict[str, Any]) -> Dict[str, Any]:
    return {
        "keyframe": {
            "keyframe_idx": kf.get("keyframe_idx"),
            "timestamp": kf.get("timestamp"),
            "frame_type": kf.get("frame_type"),
            "t_sec": kf.get("t_sec"),
            "image_path": kf.get("image_path"),
        },
        "combined_summary": kf.get("combined_summary"),
        "changed_summary": pick_changed_summary(kf),
    }


def condense(final_obj: Dict[str, Any]) -> Dict[str, Any]:
    out_meta: Dict[str, Any] = {
        "source": final_obj.get("meta", {}),
        "notes": "Condensed output: keyframe + combined_summary + changed_summary",
    }

    # New schema: root keyframes list.
    # Note: default to None (not []) so old-schema inputs without a "keyframes"
    # key fall through to the topics branch below instead of producing an
    # empty condensed output.
    root_keyframes = final_obj.get("keyframes")
    if isinstance(root_keyframes, list):
        out: Dict[str, Any] = {
            "meta": {**out_meta, "input_schema": "root_keyframes"},
            "keyframes": [],
        }
        for kf in root_keyframes:
            if not isinstance(kf, dict):
                continue
            out["keyframes"].append(condense_keyframe(kf))
        return out

    # Old schema: topics[] with keyframes[]
    out = {
        "meta": {**out_meta, "input_schema": "topics"},
        "topics": [],
    }

    topics = final_obj.get("topics", [])
    if not isinstance(topics, list):
        topics = []

    for t in topics:
        if not isinstance(t, dict):
            continue

        topic_out = {
            "topic": t.get("topic"),
            "start": t.get("start"),
            "end": t.get("end"),
            "start_ts": t.get("start_ts"),
            "end_ts": t.get("end_ts"),
            "keyframes": [],
        }

        keyframes = t.get("keyframes", [])
        if not isinstance(keyframes, list):
            keyframes = []

        for kf in keyframes:
            if not isinstance(kf, dict):
                continue
            topic_out["keyframes"].append(condense_keyframe(kf))

        out["topics"].append(topic_out)

    return out


def main() -> None:
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="inp", required=True, help="Path to final_output.json")
    ap.add_argument("--out", dest="out", required=True, help="Path to write condensed JSON")
    args = ap.parse_args()

    final_obj = load_json(args.inp)
    if not isinstance(final_obj, dict):
        raise ValueError("Input JSON root must be an object/dict (expected FinalOutput-like structure).")

    condensed = condense(final_obj)
    save_json(args.out, condensed)
    print(f"Wrote condensed JSON: {args.out}")


if __name__ == "__main__":
    main()
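For a concrete picture of what condense_keyframe() emits, one condensed entry has the shape below; every field value here is an illustrative placeholder, not real pipeline output:

{
  "keyframe": {
    "keyframe_idx": 12,
    "timestamp": "00:14:05",
    "frame_type": "demo",
    "t_sec": 845.0,
    "image_path": "out_folder/keyframes/keyframe_0012.jpg"
  },
  "combined_summary": "...",
  "changed_summary": "..."
}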
pipelines/deepgram_extract_utterances.py
ADDED
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
deepgram_extract_utterances.py

Extract speaker-attributed utterances (start, end, speaker, text)
from a meeting MP4 using Deepgram.
"""

from __future__ import annotations

import argparse
import json
import mimetypes
import os
import sys
import time
from typing import Any, Dict, List, Optional

import httpx
from dotenv import load_dotenv
from deepgram import DeepgramClient, PrerecordedOptions, FileSource


# load .env at startup
load_dotenv()


def _die(msg: str, code: int = 1) -> None:
    print(f"Error: {msg}", file=sys.stderr)
    sys.exit(code)


def _load_file_source(path: str):
    if not os.path.isfile(path):
        _die(f"File not found: {path}")

    with open(path, "rb") as f:
        data = f.read()

    mime, _ = mimetypes.guess_type(path)
    if not mime:
        mime = "application/octet-stream"

    # IMPORTANT: return a dict, NOT FileSource()
    return {
        "buffer": data,
        "mimetype": mime,
    }


def _extract_utterances(result: Dict[str, Any]) -> List[Dict[str, Any]]:
    utterances = result.get("results", {}).get("utterances", [])
    out: List[Dict[str, Any]] = []

    for u in utterances:
        out.append(
            {
                "start": float(u.get("start", 0.0)),
                "end": float(u.get("end", 0.0)),
                "speaker": u.get("speaker"),
                "text": (u.get("transcript") or "").strip(),
            }
        )

    return out


def _is_non_retryable_error(exc: Exception) -> bool:
    code = getattr(exc, "status_code", None)
    if isinstance(code, int) and 400 <= code < 500:
        return True
    status = getattr(exc, "status", None)
    if isinstance(status, int) and 400 <= status < 500:
        return True
    msg = str(exc).lower()
    # Deepgram SDK exceptions often encode status in message text.
    if "status: 4" in msg or "bad request" in msg or "unsupported data" in msg:
        return True
    return False


def transcribe_and_extract(
    path: str,
    model: str = "nova-3",
    language: Optional[str] = None,
    request_timeout_sec: float = 1200.0,
    connect_timeout_sec: float = 30.0,
    retries: int = 3,
    retry_backoff_sec: float = 2.0,
) -> tuple[Dict[str, Any], Dict[str, Any]]:
    api_key = os.getenv("DEEPGRAM_API_KEY")
    if not api_key:
        _die("DEEPGRAM_API_KEY not found in environment or .env")

    client = DeepgramClient(api_key=api_key)

    source = _load_file_source(path)

    options_kwargs: Dict[str, Any] = {
        "model": model,
        "smart_format": True,
        "punctuate": True,
        "utterances": True,
        "diarize": True,
    }
    if language:
        options_kwargs["language"] = language

    options = PrerecordedOptions(**options_kwargs)

    # Deepgram SDK default HTTP timeout is 30s; long recordings often exceed that.
    timeout = httpx.Timeout(float(request_timeout_sec), connect=float(connect_timeout_sec))
    retries = max(1, int(retries))

    last_err: Optional[Exception] = None
    response = None
    for attempt in range(1, retries + 1):
        try:
            response = client.listen.rest.v("1").transcribe_file(
                source,
                options,
                timeout=timeout,
            )
            break
        except Exception as e:
            last_err = e
            if _is_non_retryable_error(e):
                # Client/input errors won't succeed on retry.
                raise
            if attempt >= retries:
                raise
            wait_sec = float(retry_backoff_sec) * attempt
            print(
                f"Deepgram request failed (attempt {attempt}/{retries}): {type(e).__name__}: {e}. "
                f"Retrying in {wait_sec:.1f}s..."
            )
            time.sleep(wait_sec)

    if response is None:
        raise RuntimeError(f"Deepgram transcription failed after {retries} attempts: {last_err}")

    result_dict = response.to_dict() if hasattr(response, "to_dict") else dict(response)

    return {
        "input_file": os.path.abspath(path),
        "model": model,
        "utterances": _extract_utterances(result_dict),
    }, result_dict


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Path to meeting file (.mp4, .wav, .mp3)")
    parser.add_argument("-o", "--output", default="utterances.json")
    parser.add_argument("--raw", help="Optional raw Deepgram response JSON")
    parser.add_argument("--model", default="nova-3")
    parser.add_argument("--language", help="Optional language code (e.g. en, en-US)")
    parser.add_argument(
        "--request-timeout-sec",
        type=float,
        default=1200.0,
        help="HTTP request timeout for Deepgram API call (default: 1200s).",
    )
    parser.add_argument(
        "--connect-timeout-sec",
        type=float,
        default=30.0,
        help="HTTP connect timeout for Deepgram API call (default: 30s).",
    )
    parser.add_argument(
        "--retries",
        type=int,
        default=3,
        help="Number of retry attempts for Deepgram call (default: 3).",
    )
    parser.add_argument(
        "--retry-backoff-sec",
        type=float,
        default=2.0,
        help="Base retry backoff seconds; actual sleep is base * attempt (default: 2.0).",
    )
    args = parser.parse_args()

    extracted, raw = transcribe_and_extract(
        args.input,
        model=args.model,
        language=args.language,
        request_timeout_sec=float(args.request_timeout_sec),
        connect_timeout_sec=float(args.connect_timeout_sec),
        retries=int(args.retries),
        retry_backoff_sec=float(args.retry_backoff_sec),
    )

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(extracted, f, ensure_ascii=False, indent=2)

    if args.raw:
        with open(args.raw, "w", encoding="utf-8") as f:
            json.dump(raw, f, ensure_ascii=False, indent=2)

    print(f"Saved utterances to {args.output}")
    if args.raw:
        print(f"Saved raw response to {args.raw}")


if __name__ == "__main__":
    main()
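The utterances JSON this script saves mirrors the first element returned by transcribe_and_extract(); a minimal illustrative sample (path, timings, speaker id, and text are invented for the example; Deepgram diarization typically yields integer speaker ids):

{
  "input_file": "/abs/path/meeting.mp4",
  "model": "nova-3",
  "utterances": [
    {"start": 12.4, "end": 15.9, "speaker": 0, "text": "Let me share my screen."}
  ]
}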
pipelines/models/yolov8x-doclaynet.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7fd403628e5377fc08105df49489fc4a8997d1376589470865d874f1ee918317
size 136821929
pipelines/run_pipeline_all.py
ADDED
@@ -0,0 +1,238 @@
#!/usr/bin/env python3
"""
Pipeline orchestrator.

Runs:
  1) deepgram_extract_utterances.py   (parallel)
  2) smart_keyframes_and_classify.py  (parallel)
  3) assign_utterances_to_keyframes.py (after 1+2)
  4) build_final_output.py             (after 3)
  5) condense_final_output.py          (after 4)
"""

from __future__ import annotations

import argparse
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Sequence, Tuple


def run_command(name: str, cmd: Sequence[str], cwd: Path) -> None:
    start = time.perf_counter()
    print(f"\n[{name}] START")
    print(f"[{name}] CMD: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=str(cwd))
    dur = time.perf_counter() - start
    if result.returncode != 0:
        raise RuntimeError(f"[{name}] failed with exit code {result.returncode}")
    print(f"[{name}] DONE in {dur:.2f}s")


def run_parallel(commands: List[Tuple[str, List[str]]], cwd: Path) -> None:
    if not commands:
        return
    with ThreadPoolExecutor(max_workers=len(commands)) as ex:
        futures = {
            ex.submit(run_command, name, cmd, cwd): name
            for name, cmd in commands
        }
        for fut in as_completed(futures):
            fut.result()


def require_file(path: Path, step_name: str) -> None:
    if not path.exists():
        raise FileNotFoundError(f"[{step_name}] expected output not found: {path}")


def main() -> None:
    ap = argparse.ArgumentParser(description="Run full meeting summarization pipeline.")
    ap.add_argument("--video", required=True, help="Path to meeting video/audio input.")
    ap.add_argument("--out", required=True, help="Output directory for pipeline artifacts.")

    ap.add_argument("--python", default=sys.executable, help="Python executable to use.")

    ap.add_argument("--deepgram-model", default="nova-3", help="Deepgram model.")
    ap.add_argument("--deepgram-language", default=None, help="Deepgram language (optional).")
    ap.add_argument(
        "--deepgram-raw-out",
        default=None,
        help="Optional path for raw Deepgram response JSON.",
    )
    ap.add_argument(
        "--deepgram-request-timeout-sec",
        type=float,
        default=1200.0,
        help="HTTP request timeout for Deepgram call.",
    )
    ap.add_argument(
        "--deepgram-connect-timeout-sec",
        type=float,
        default=30.0,
        help="HTTP connect timeout for Deepgram call.",
    )
    ap.add_argument(
        "--deepgram-retries",
        type=int,
        default=3,
        help="Retry attempts for Deepgram call.",
    )
    ap.add_argument(
        "--deepgram-retry-backoff-sec",
        type=float,
        default=2.0,
        help="Base retry backoff seconds for Deepgram call.",
    )
    ap.add_argument(
        "--force-deepgram",
        action="store_true",
        help="Re-run Deepgram even if utterances.json already exists.",
    )

    ap.add_argument("--force-keyframes", action="store_true", help="Pass --force to smart keyframe script.")
    ap.add_argument("--pre-roll-sec", type=float, default=3.0, help="Pre-roll seconds for utterance assignment.")

    ap.add_argument("--gemini-model", default="gemini-2.5-flash", help="Gemini model id.")
    ap.add_argument("--similarity-threshold", type=float, default=0.82, help="Similarity threshold for build step.")
    ap.add_argument("--temperature", type=float, default=0.2, help="Gemini temperature for build step.")
    args = ap.parse_args()

    repo_dir = Path(__file__).resolve().parent
    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    video_path = Path(args.video).resolve()
    if not video_path.exists():
        raise FileNotFoundError(f"Input video not found: {video_path}")

    deepgram_script = repo_dir / "deepgram_extract_utterances.py"
    smart_kf_script = repo_dir / "smart_keyframes_and_classify.py"
    assign_script = repo_dir / "assign_utterances_to_keyframes.py"
    build_script = repo_dir / "build_final_output.py"
    condense_script = repo_dir / "condense_final_output.py"

    for s in [deepgram_script, smart_kf_script, assign_script, build_script, condense_script]:
        if not s.exists():
            raise FileNotFoundError(f"Script not found: {s}")

    utterances_json = out_dir / "utterances.json"
    keyframes_parsed_json = out_dir / "keyframes_parsed.json"
    keyframes_with_utterances_json = out_dir / "keyframes_with_utterances.json"
    final_output_json = out_dir / "final_output.json"
    final_output_condensed_json = out_dir / "final_output_condensed.json"
    deepgram_raw_json = Path(args.deepgram_raw_out).resolve() if args.deepgram_raw_out else None

    python_exe = str(Path(args.python))

    # 1 + 2 in parallel
    deepgram_cmd = [
        python_exe,
        str(deepgram_script),
        str(video_path),
        "-o",
        str(utterances_json),
        "--model",
        str(args.deepgram_model),
        "--request-timeout-sec",
        str(args.deepgram_request_timeout_sec),
        "--connect-timeout-sec",
        str(args.deepgram_connect_timeout_sec),
        "--retries",
        str(args.deepgram_retries),
        "--retry-backoff-sec",
        str(args.deepgram_retry_backoff_sec),
    ]
    if args.deepgram_language:
        deepgram_cmd.extend(["--language", str(args.deepgram_language)])
    if deepgram_raw_json is not None:
        deepgram_cmd.extend(["--raw", str(deepgram_raw_json)])

    smart_kf_cmd = [
        python_exe,
        str(smart_kf_script),
        "--video",
        str(video_path),
        "--out",
        str(out_dir),
    ]
    if args.force_keyframes:
        smart_kf_cmd.append("--force")

    parallel_commands: List[Tuple[str, List[str]]] = []
    if args.force_deepgram or (not utterances_json.exists()):
        parallel_commands.append(("deepgram_extract_utterances", deepgram_cmd))
    else:
        print(f"[deepgram_extract_utterances] SKIP (exists): {utterances_json}")

    if args.force_keyframes or (not keyframes_parsed_json.exists()):
        parallel_commands.append(("smart_keyframes_and_classify", smart_kf_cmd))
    else:
        print(f"[smart_keyframes_and_classify] SKIP (exists): {keyframes_parsed_json}")

    if parallel_commands:
        print("Running Step 1+2 in parallel...")
        run_parallel(parallel_commands, cwd=repo_dir)
    else:
        print("Skipping Step 1+2 (all required artifacts already exist).")

    require_file(utterances_json, "deepgram_extract_utterances")
    require_file(keyframes_parsed_json, "smart_keyframes_and_classify")

    # 3 assign
    assign_cmd = [
        python_exe,
        str(assign_script),
        str(keyframes_parsed_json),
        str(utterances_json),
        "-o",
        str(keyframes_with_utterances_json),
        "--pre-roll-sec",
        str(args.pre_roll_sec),
    ]
    run_command("assign_utterances_to_keyframes", assign_cmd, cwd=repo_dir)
    require_file(keyframes_with_utterances_json, "assign_utterances_to_keyframes")

    # 4 build
    build_cmd = [
        python_exe,
        str(build_script),
        "--keyframes",
        str(keyframes_with_utterances_json),
        "--out",
        str(final_output_json),
        "--model",
        str(args.gemini_model),
        "--similarity_threshold",
        str(args.similarity_threshold),
        "--temperature",
        str(args.temperature),
    ]
    run_command("build_final_output", build_cmd, cwd=repo_dir)
    require_file(final_output_json, "build_final_output")

    # 5 condense
    condense_cmd = [
        python_exe,
        str(condense_script),
        "--in",
        str(final_output_json),
        "--out",
        str(final_output_condensed_json),
    ]
    run_command("condense_final_output", condense_cmd, cwd=repo_dir)
    require_file(final_output_condensed_json, "condense_final_output")

    print("\nPipeline completed successfully.")
    print(f"Utterances: {utterances_json}")
    print(f"Keyframes parsed: {keyframes_parsed_json}")
    print(f"Keyframes+utterances: {keyframes_with_utterances_json}")
    print(f"Final output: {final_output_json}")
    print(f"Condensed output: {final_output_condensed_json}")


if __name__ == "__main__":
    main()
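A typical end-to-end run of this orchestrator (input path illustrative) is:

python pipelines/run_pipeline_all.py --video meeting.mp4 --out out_folder

Steps 1 and 2 are skipped when their artifacts already exist under --out; pass --force-deepgram and/or --force-keyframes to re-run them.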
pipelines/run_pipeline_demo_code.py
ADDED
@@ -0,0 +1,239 @@
#!/usr/bin/env python3
"""
Demo-only Gemini pipeline orchestrator (kept in demo-code route for compatibility).

Pipeline steps:
  1) deepgram_extract_utterances.py   (parallel)
  2) smart_keyframes_and_classify.py  (parallel)
  3) assign_utterances_to_keyframes.py
  4) build_final_output_demo_code.py  (Gemini for demo only; slides+code local OCR/transcript)
  5) condense_final_output.py
"""

from __future__ import annotations

import argparse
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List, Sequence, Tuple


def run_command(name: str, cmd: Sequence[str], cwd: Path) -> None:
    start = time.perf_counter()
    print(f"\n[{name}] START")
    print(f"[{name}] CMD: {' '.join(cmd)}")
    result = subprocess.run(cmd, cwd=str(cwd))
    dur = time.perf_counter() - start
    if result.returncode != 0:
        raise RuntimeError(f"[{name}] failed with exit code {result.returncode}")
    print(f"[{name}] DONE in {dur:.2f}s")


def run_parallel(commands: List[Tuple[str, List[str]]], cwd: Path) -> None:
    if not commands:
        return
    with ThreadPoolExecutor(max_workers=len(commands)) as ex:
        futures = {ex.submit(run_command, name, cmd, cwd): name for name, cmd in commands}
        for fut in as_completed(futures):
            fut.result()


def require_file(path: Path, step_name: str) -> None:
    if not path.exists():
        raise FileNotFoundError(f"[{step_name}] expected output not found: {path}")


def main() -> None:
    ap = argparse.ArgumentParser(description="Run demo-only Gemini meeting pipeline (demo-code route alias).")
    ap.add_argument("--video", required=True, help="Path to meeting video/audio input.")
    ap.add_argument("--out", required=True, help="Output directory for pipeline artifacts.")

    ap.add_argument("--python", default=sys.executable, help="Python executable to use.")

    ap.add_argument("--deepgram-model", default="nova-3", help="Deepgram model.")
    ap.add_argument("--deepgram-language", default=None, help="Deepgram language (optional).")
    ap.add_argument(
        "--deepgram-raw-out",
        default=None,
        help="Optional path for raw Deepgram response JSON.",
    )
    ap.add_argument(
        "--deepgram-request-timeout-sec",
        type=float,
        default=1200.0,
        help="HTTP request timeout for Deepgram call.",
    )
    ap.add_argument(
        "--deepgram-connect-timeout-sec",
        type=float,
        default=30.0,
        help="HTTP connect timeout for Deepgram call.",
    )
    ap.add_argument(
        "--deepgram-retries",
        type=int,
        default=3,
        help="Retry attempts for Deepgram call.",
    )
    ap.add_argument(
        "--deepgram-retry-backoff-sec",
        type=float,
        default=2.0,
        help="Base retry backoff seconds for Deepgram call.",
    )
    ap.add_argument(
        "--force-deepgram",
        action="store_true",
        help="Re-run Deepgram even if utterances.json already exists.",
    )

    ap.add_argument("--force-keyframes", action="store_true", help="Pass --force to smart keyframe script.")
    ap.add_argument("--pre-roll-sec", type=float, default=3.0, help="Pre-roll seconds for utterance assignment.")

    ap.add_argument("--gemini-model", default="gemini-2.5-flash", help="Gemini model id.")
    ap.add_argument(
        "--similarity-threshold",
        type=float,
        default=0.82,
        help="Similarity threshold for demo prompt reuse logic.",
    )
    ap.add_argument("--temperature", type=float, default=0.2, help="Gemini temperature for demo keyframes.")
    args = ap.parse_args()

    pipeline_dir = Path(__file__).resolve().parent
    repo_dir = pipeline_dir

    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    video_path = Path(args.video).resolve()
    if not video_path.exists():
        raise FileNotFoundError(f"Input video not found: {video_path}")

    deepgram_script = repo_dir / "deepgram_extract_utterances.py"
    smart_kf_script = repo_dir / "smart_keyframes_and_classify.py"
    assign_script = repo_dir / "assign_utterances_to_keyframes.py"
    build_demo_script = pipeline_dir / "build_final_output_demo_code.py"
    condense_script = repo_dir / "condense_final_output.py"

    for s in [deepgram_script, smart_kf_script, assign_script, build_demo_script, condense_script]:
        if not s.exists():
            raise FileNotFoundError(f"Script not found: {s}")

    utterances_json = out_dir / "utterances.json"
    keyframes_parsed_json = out_dir / "keyframes_parsed.json"
    keyframes_with_utterances_json = out_dir / "keyframes_with_utterances.json"
    final_output_json = out_dir / "final_output_demo_code.json"
    final_output_condensed_json = out_dir / "final_output_demo_code_condensed.json"
    deepgram_raw_json = Path(args.deepgram_raw_out).resolve() if args.deepgram_raw_out else None

    python_exe = str(Path(args.python))

    deepgram_cmd = [
        python_exe,
        str(deepgram_script),
        str(video_path),
        "-o",
        str(utterances_json),
        "--model",
        str(args.deepgram_model),
        "--request-timeout-sec",
        str(args.deepgram_request_timeout_sec),
        "--connect-timeout-sec",
        str(args.deepgram_connect_timeout_sec),
        "--retries",
        str(args.deepgram_retries),
        "--retry-backoff-sec",
        str(args.deepgram_retry_backoff_sec),
    ]
    if args.deepgram_language:
        deepgram_cmd.extend(["--language", str(args.deepgram_language)])
    if deepgram_raw_json is not None:
        deepgram_cmd.extend(["--raw", str(deepgram_raw_json)])

    smart_kf_cmd = [
        python_exe,
        str(smart_kf_script),
        "--video",
        str(video_path),
        "--out",
        str(out_dir),
        "--no-yolo-for-non-demo",
    ]
    if args.force_keyframes:
        smart_kf_cmd.append("--force")

    parallel_commands: List[Tuple[str, List[str]]] = []
    if args.force_deepgram or (not utterances_json.exists()):
        parallel_commands.append(("deepgram_extract_utterances", deepgram_cmd))
    else:
        print(f"[deepgram_extract_utterances] SKIP (exists): {utterances_json}")

    if args.force_keyframes or (not keyframes_parsed_json.exists()):
        parallel_commands.append(("smart_keyframes_and_classify", smart_kf_cmd))
    else:
        print(f"[smart_keyframes_and_classify] SKIP (exists): {keyframes_parsed_json}")

    if parallel_commands:
        print("Running Step 1+2 in parallel...")
        run_parallel(parallel_commands, cwd=repo_dir)
    else:
        print("Skipping Step 1+2 (all required artifacts already exist).")

    require_file(utterances_json, "deepgram_extract_utterances")
    require_file(keyframes_parsed_json, "smart_keyframes_and_classify")

    assign_cmd = [
        python_exe,
        str(assign_script),
        str(keyframes_parsed_json),
        str(utterances_json),
        "-o",
        str(keyframes_with_utterances_json),
        "--pre-roll-sec",
        str(args.pre_roll_sec),
    ]
    run_command("assign_utterances_to_keyframes", assign_cmd, cwd=repo_dir)
    require_file(keyframes_with_utterances_json, "assign_utterances_to_keyframes")

    build_cmd = [
        python_exe,
        str(build_demo_script),
        "--keyframes",
        str(keyframes_with_utterances_json),
        "--out",
        str(final_output_json),
        "--model",
        str(args.gemini_model),
        "--similarity-threshold",
        str(args.similarity_threshold),
        "--temperature",
        str(args.temperature),
    ]
    run_command("build_final_output_demo_code", build_cmd, cwd=repo_dir)
    require_file(final_output_json, "build_final_output_demo_code")

    condense_cmd = [
        python_exe,
        str(condense_script),
        "--in",
        str(final_output_json),
        "--out",
        str(final_output_condensed_json),
    ]
    run_command("condense_final_output", condense_cmd, cwd=repo_dir)
    require_file(final_output_condensed_json, "condense_final_output")

    print("\nDemo-only Gemini pipeline completed successfully.")
    print(f"Utterances: {utterances_json}")
    print(f"Keyframes parsed: {keyframes_parsed_json}")
    print(f"Keyframes+utterances: {keyframes_with_utterances_json}")
    print(f"Final output (demo-only Gemini): {final_output_json}")
    print(f"Condensed output (demo-only Gemini): {final_output_condensed_json}")


if __name__ == "__main__":
    main()
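Invocation mirrors run_pipeline_all.py above, except that only demo keyframes go through Gemini and the final artifacts are written as final_output_demo_code.json and final_output_demo_code_condensed.json (input path illustrative):

python pipelines/run_pipeline_demo_code.py --video meeting.mp4 --out out_folder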
pipelines/smart_keyframes_and_classify.py
ADDED
@@ -0,0 +1,1443 @@
# smart_keyframes_and_classify.py
import argparse
import json
import os
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import re
import concurrent.futures as cf

import cv2
import numpy as np
from dotenv import load_dotenv

try:
    import clip
    import torch
    from PIL import Image
except Exception:
    clip = None
    torch = None
    Image = None

# Local models (layout + OCR)
# pip install ultralytics paddleocr paddlepaddle opencv-python numpy python-dotenv
from ultralytics import YOLO

# Avoid oneDNN fused-conv issues seen in some Paddle/PaddleOCR builds on CPU.
# Use hard overrides (not setdefault) so shell/.env values cannot re-enable it.
os.environ["FLAGS_use_mkldnn"] = "0"
os.environ["FLAGS_enable_mkldnn"] = "0"
os.environ["FLAGS_use_onednn"] = "0"

# Compatibility patch for NumPy>=2 with imgaug (transitive dep of PaddleOCR).
# imgaug expects np.sctypes, removed in NumPy 2.0.
if not hasattr(np, "sctypes"):
    def _np_type(name: str, default):
        return getattr(np, name, default)

    np.sctypes = {
        "int": [_np_type("int8", int), _np_type("int16", int), _np_type("int32", int), _np_type("int64", int)],
        "uint": [_np_type("uint8", int), _np_type("uint16", int), _np_type("uint32", int), _np_type("uint64", int)],
        "float": [_np_type("float16", float), _np_type("float32", float), _np_type("float64", float)],
        "complex": [_np_type("complex64", complex), _np_type("complex128", complex)],
        "others": [_np_type("bool_", bool), _np_type("object_", object), _np_type("str_", str), _np_type("bytes_", bytes)],
    }

from paddleocr import PaddleOCR
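
# --- Added note (illustrative, not in the original file) ---
# The shim above only fires on NumPy >= 2.0, where np.sctypes was removed; on
# older NumPy the real attribute exists and is left untouched. As far as this
# pipeline is concerned, imgaug only iterates these lists, so membership like
#   np.float64 in np.sctypes["float"]
# is all that has to hold, and the getattr fallbacks keep the shim safe if a
# dtype name is ever missing from the NumPy build.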


# ============================================================
# EDIT THESE IN CODE (no tuning args needed in the command)
# ============================================================


def _env_bool(name: str, default: bool) -> bool:
    raw = os.getenv(name)
    if raw is None:
        return bool(default)
    return str(raw).strip().lower() in {"1", "true", "yes", "y", "on"}


def _auto_has_cuda() -> bool:
    try:
        return bool(torch is not None and torch.cuda.is_available())
    except Exception:
        return False
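
# --- Usage examples (added note) ---
# _env_bool parses common truthy spellings case-insensitively:
#   OCR_GPU=YES  -> _env_bool("OCR_GPU", False) == True
#   OCR_GPU=off  -> _env_bool("OCR_GPU", True)  == False
#   unset        -> the Python default is returned unchanged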

# Candidate sampling (local, no API)
SAMPLE_FPS = 1.0
RESIZE_W = 360
CANDIDATE_PERCENTILE = 70.0
MAX_CANDIDATES = 180

# Final cap
MAX_FRAMES = 150

# Fast/parse resize for local inference (CLIP)
FAST_FRAME_MAX_W = 720

# Parallelism removed (no LLM calls)
BASE_SLEEP_SEC = 0.0

# Local screen parsing (required)
ENABLE_LOCAL_SCREEN_PARSE = True

# Layout detector weights (DocLayNet-style YOLO weights recommended)
# Example: models/yolov8n-doclaynet.pt
LAYOUT_YOLO_WEIGHTS = os.getenv("LAYOUT_YOLO_WEIGHTS", "models/yolov8x-doclaynet.pt")
LAYOUT_CONF = float(os.getenv("LAYOUT_CONF", "0.25"))
LAYOUT_IOU = float(os.getenv("LAYOUT_IOU", "0.45"))

# YOLO runtime settings
# Defaults are deployment-safe (CPU on non-GPU hosts), but can be overridden via env.
YOLO_DEVICE = os.getenv("YOLO_DEVICE", "0" if _auto_has_cuda() else "cpu")
YOLO_IMGSZ = int(os.getenv("YOLO_IMGSZ", "640"))  # try 512 for more speed if acceptable

# OCR
OCR_LANG = os.getenv("OCR_LANG", "en")
OCR_MIN_CONF = float(os.getenv("OCR_MIN_CONF", "0.45"))

# OCR runtime settings (GPU + crop-only OCR)
USE_GPU = _env_bool("OCR_GPU", _auto_has_cuda())
OCR_CROP_MAX_REGIONS = int(os.getenv("OCR_CROP_MAX_REGIONS", "10"))

# Downscale OCR crops by frame type (slides/demo faster; code keeps max)
OCR_CROP_SCALE_BY_TYPE = {
    "slides": float(os.getenv("OCR_CROP_SCALE_SLIDES", "0.80")),
    "demo": float(os.getenv("OCR_CROP_SCALE_DEMO", "0.75")),
    "code": float(os.getenv("OCR_CROP_SCALE_CODE", "1.00")),
    "none": float(os.getenv("OCR_CROP_SCALE_NONE", "0.75")),
}

# Resize input frame BEFORE YOLO+OCR in step 3 (slides/demo smaller; code max)
PARSE_MAX_W_BY_TYPE = {
    "slides": int(os.getenv("PARSE_MAX_W_SLIDES", "1280")),
    "demo": int(os.getenv("PARSE_MAX_W_DEMO", "1280")),
    "none": int(os.getenv("PARSE_MAX_W_NONE", "1280")),
    "code": int(os.getenv("PARSE_MAX_W_CODE", "99999")),  # effectively "no resize"
}

# CLIP frame type classifier
# -----------------------------
# CLIP setup (more robust, fewer “code” false-positives)
# Strategy:
# 1) Use multiple POS prompts per class (ensembling)
# 2) Add NEG prompts per class (especially for "code") and score = mean(pos) - mean(neg)
# This makes "slides with code screenshots" stay as slides, and prevents "demo with code words" -> code.
# -----------------------------

CLIP_MODEL_NAME = os.getenv("CLIP_MODEL_NAME", "ViT-B/32")

# class labels (keep as-is)
CLIP_CLASS_LABELS = ["slides", "code", "demo", "none"]

# scoring mode used by your classifier code (implement if you haven't):
# score(class) = mean(sim(image, pos_prompts)) - mean(sim(image, neg_prompts))
CLIP_SCORE_MODE = os.getenv("CLIP_SCORE_MODE", "pos_minus_neg")

# If your pipeline supports a minimum margin between top-1 and top-2 to accept the prediction:
# (helps when frames are ambiguous)
CLIP_MIN_MARGIN = float(os.getenv("CLIP_MIN_MARGIN", "0.03"))

# Prompt bank: POS and NEG per class
CLIP_PROMPT_BANK = {
    "slides": {
        "pos": [
            "a screenshot of a presentation slide (PowerPoint or Google Slides)",
            "a slide with a large title at the top and bullet points below",
            "a slide canvas with wide margins and centered content",
            "a lecture slide with sections, headings, and bullet lists",
            "a slide that may include a small embedded screenshot (code or UI) but is still a slide",
            "a shared slide deck page in a video meeting (16:9 slide layout)",
        ],
        "neg": [
            "a full screen web application dashboard with navigation sidebar",
            "a desktop application interface with many clickable controls",
            "a full screen code editor filling the screen",
            "a terminal window filling the screen",
            "a webcam grid of meeting participants",
        ],
    },

    "code": {
        "pos": [
            "a full screen code editor filling most of the screen with many lines of code",
            "an IDE with syntax highlighting and line numbers, code dominates the screen",
            "a programming editor with file tree sidebar and editor pane, not inside a slide",
            "a terminal and code editor side by side with readable code dominating",
        ],
        "neg": [
            "a presentation slide that contains a screenshot of code",
            "a slide with a code snippet as part of a slide deck",
            "a slide with a code image and slide title and bullets",
            "a demo UI screen that contains a small code panel",
        ],
    },

    "demo": {
        "pos": [
            "a web application dashboard with a left navigation sidebar and multiple panels",
            "a product user interface with buttons, menus, input fields, and toolbars",
            "a browser-based app with tabs, filters, tables, charts, and navigation",
            "a desktop software UI with controls, forms, and interactive elements",
            "a product demo screen where the interface fills the screen (not a slide canvas)",
        ],
        "neg": [
            "a PowerPoint or Google Slides presentation slide",
            "a slide with title at top and bullet points",
            "a slide deck page with large margins and a single canvas",
            "a slide with an embedded screenshot of a UI",
            "a slide with a cursor hovering over a tab",
            "a slide with a code snippet or code screenshot",
        ],
    },

    "none": {
        "pos": [
            "a video call gallery view with participants and no shared screen",
            "a mostly blank screen or black screen",
            "a blurred transition frame with no readable content",
            "a loading screen with minimal content",
        ],
        "neg": [
            "a presentation slide",
            "a web application dashboard",
            "a full screen code editor",
        ],
    },
}

CLIP_CLASS_PROMPTS = [CLIP_PROMPT_BANK[c]["pos"] for c in CLIP_CLASS_LABELS]
CLIP_CLASS_NEG_PROMPTS = [CLIP_PROMPT_BANK[c]["neg"] for c in CLIP_CLASS_LABELS]
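
# --- Illustrative sketch (added; defined here but never called by the pipeline) ---
# Toy version of the pos-minus-neg score from the comment above, on unit-norm
# 2-D vectors. All names below are hypothetical.
def _demo_pos_minus_neg_score() -> float:
    img = np.array([0.6, 0.8], dtype=np.float32)           # image feature (unit norm)
    pos = np.array([[0.6, 0.8], [0.8, 0.6]], np.float32)   # two POS prompt features
    neg = np.array([[1.0, 0.0]], np.float32)               # one NEG prompt feature
    # score = mean cosine sim to POS minus mean cosine sim to NEG
    return float((img @ pos.T).mean() - (img @ neg.T).mean())  # 0.98 - 0.60 = 0.38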

# Caps for JSON size
MAX_OCR_LINES = 300

# ---- NEW: hard global time gap between kept keyframes ----
MIN_KEYFRAME_GAP_SEC = 3.0

# Sensitivity rules (VISUAL ONLY)
SENS = {
    "slides": {"min_gap_sec": 1.2, "diff_mult": 1.60},
    "code": {"min_gap_sec": 0.8, "diff_mult": 0.70},
    "demo": {"min_gap_sec": 0.45, "diff_mult": 0.60},
    "none": {"min_gap_sec": 0.55, "diff_mult": 0.95},
}

# Concurrent parsing workers (YOLO + OCR) for KEPT keyframes
PARSE_WORKERS = int(os.getenv("PARSE_WORKERS", "2"))
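
# --- Worked example (added note) ---
# With a base diff threshold of 10.0 from the candidate pass, the effective
# per-type thresholds under SENS would be:
#   slides: 10.0 * 1.60 = 16.0   (slides must change a lot to be kept)
#   code:   10.0 * 0.70 =  7.0
#   demo:   10.0 * 0.60 =  6.0   (demos are kept on smaller changes)
#   none:   10.0 * 0.95 =  9.5
# Note: MIN_KEYFRAME_GAP_SEC (3.0 s) overrides the per-type min_gap_sec values
# inside should_keep_visual_only() below, so those entries are currently unused.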


# ----------------------------
# Data structures
# ----------------------------

@dataclass
class CandidateFrame:
    t_sec: float
    frame_idx: int
    diff_score: float  # diff vs previous sampled frame (local)


# ----------------------------
# Utils
# ----------------------------

def fmt_hhmmss(sec: float) -> str:
    sec = max(0.0, float(sec))
    h = int(sec // 3600)
    m = int((sec % 3600) // 60)
    s = int(sec % 60)
    return f"{h:02d}:{m:02d}:{s:02d}"
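
# --- Examples (added note) ---
#   fmt_hhmmss(0)      -> "00:00:00"
#   fmt_hhmmss(75.9)   -> "00:01:15"   (seconds truncate; no rounding up)
#   fmt_hhmmss(3661.0) -> "01:01:01"
#   fmt_hhmmss(-5)     -> "00:00:00"   (negatives clamp to zero)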


def safe_read_json(path: Path) -> Any:
    return json.loads(path.read_text(encoding="utf-8"))


def safe_write_json(path: Path, obj: Any) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8")


def _probe_video(video_path: Path) -> Tuple[float, float, int]:
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video: {video_path}")
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    duration = float(frames / fps) if frames else 0.0
    cap.release()
    return float(fps), float(duration), int(frames)


def _mad_diff(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.mean(np.abs(a.astype(np.int16) - b.astype(np.int16))))
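
# --- Illustrative sketch (added; not called by the pipeline) ---
# _mad_diff is the mean absolute difference over grayscale pixels (0..255);
# the int16 cast avoids uint8 wraparound on subtraction.
def _demo_mad_diff() -> float:
    a = np.array([[10, 20], [30, 40]], dtype=np.uint8)
    b = np.array([[12, 18], [30, 44]], dtype=np.uint8)
    return _mad_diff(a, b)  # (2 + 2 + 0 + 4) / 4 = 2.0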


def _downscale_gray(frame_bgr: np.ndarray, resize_w: int) -> np.ndarray:
    h, w = frame_bgr.shape[:2]
    new_w = int(resize_w)
    new_h = int(h * (new_w / max(1, w)))
    small = cv2.resize(frame_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)


def _resize_frame_max_w(frame_bgr: np.ndarray, max_w: int) -> np.ndarray:
    h, w = frame_bgr.shape[:2]
    if w <= max_w:
        return frame_bgr
    new_w = int(max_w)
    new_h = int(h * (new_w / w))
    return cv2.resize(frame_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)


def _single_line(s: str, max_len: int = 220) -> str:
    if s is None:
        return ""
    s = str(s).replace("\r", " ").replace("\n", " ")
    s = re.sub(r"\s+", " ", s).strip()
    if len(s) > max_len:
        s = s[: max(0, max_len - 1)].rstrip() + "…"
    return s
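
# --- Examples (added note) ---
#   _single_line("a\nb\r\nc")           -> "a b c"
#   _single_line("  lots   of  space ") -> "lots of space"
#   _single_line("x" * 300)             -> 219 "x" characters plus "…"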


# ----------------------------
# Video frame reader (single capture)
# ----------------------------

class VideoReader:
    def __init__(self, video_path: Path):
        self.cap = cv2.VideoCapture(str(video_path))
        if not self.cap.isOpened():
            raise RuntimeError(f"Could not open video: {video_path}")

    def read_at_frame(self, frame_idx: int) -> Optional[np.ndarray]:
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
        ret, frame = self.cap.read()
        if not ret:
            return None
        return frame

    def close(self) -> None:
        try:
            self.cap.release()
        except Exception:
            pass


# ----------------------------
# Local screen parse helpers (YOLO layout + PaddleOCR)
# ----------------------------

def _xyxy_to_int(xyxy):
    x1, y1, x2, y2 = xyxy
    return [int(round(x1)), int(round(y1)), int(round(x2)), int(round(y2))]


def _clip_box(box, w, h):
    x1, y1, x2, y2 = box
    x1 = max(0, min(x1, w - 1))
    y1 = max(0, min(y1, h - 1))
    x2 = max(0, min(x2, w - 1))
    y2 = max(0, min(y2, h - 1))
    if x2 < x1:
        x1, x2 = x2, x1
    if y2 < y1:
        y1, y2 = y2, y1
    return [x1, y1, x2, y2]


def _box_center(box):
    x1, y1, x2, y2 = box
    return ((x1 + x2) / 2.0, (y1 + y2) / 2.0)


def _zone_for_box(box, W, H):
    cx, cy = _box_center(box)
    if cy < 0.18 * H:
        return "top"
    if cy > 0.85 * H:
        return "bottom"
    if cx < 0.33 * W:
        return "left"
    if cx > 0.67 * W:
        return "right"
    return "center"


def _sort_reading_order(items):
    return sorted(items, key=lambda it: (it["box"][1], it["box"][0]))
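
# --- Worked example (added note) ---
# Zones are decided by box center, vertical bands first. On a 1280x720 frame:
#   box [0, 0, 1280, 100]    -> center y = 50,  50 < 0.18*720 (129.6)   -> "top"
#   box [0, 200, 300, 500]   -> center x = 150, 150 < 0.33*1280 (422.4) -> "left"
#   box [500, 300, 800, 400] -> center (650, 350)                       -> "center"
# _sort_reading_order then orders items top-to-bottom, left-to-right by each
# box's top-left corner.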


def run_layout_yolo(layout_model: YOLO, frame_bgr: np.ndarray) -> List[dict]:
    H, W = frame_bgr.shape[:2]
    res = layout_model.predict(
        source=frame_bgr,
        conf=LAYOUT_CONF,
        iou=LAYOUT_IOU,
        imgsz=YOLO_IMGSZ,
        device=YOLO_DEVICE,
        verbose=False,
    )[0]

    regions = []
    names = res.names
    if res.boxes is None:
        return regions

    for b in res.boxes:
        cls_id = int(b.cls.item())
        conf = float(b.conf.item())
        label = str(names.get(cls_id, f"class_{cls_id}"))
        box = _xyxy_to_int(b.xyxy[0].tolist())
        box = _clip_box(box, W, H)
        regions.append({"label": label, "conf": conf, "box": box})

    return _sort_reading_order(regions)


def run_paddle_ocr(ocr: PaddleOCR, frame_bgr: np.ndarray) -> List[dict]:
    # Full-frame OCR fallback (kept for safety), with angle cls OFF (cls=False)
    H, W = frame_bgr.shape[:2]
    out = []

    rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    result = ocr.ocr(rgb, cls=False)
    if not result:
        return out

    lines = result[0] if isinstance(result, list) and len(result) > 0 else []
    if lines is None:
        return out
    if not isinstance(lines, list):
        return out

    for line in lines:
        if line is None or not isinstance(line, (list, tuple)) or len(line) < 2:
            continue
        quad = line[0]
        pair = line[1]
        if quad is None or pair is None:
            continue
        if not isinstance(pair, (list, tuple)) or len(pair) < 2:
            continue
        text, conf = pair[0], pair[1]
        conf = float(conf)
        if conf < OCR_MIN_CONF:
            continue

        if not isinstance(quad, (list, tuple)) or len(quad) == 0:
            continue
        xs = [p[0] for p in quad]
        ys = [p[1] for p in quad]
        x1, y1, x2, y2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))
        box = _clip_box([x1, y1, x2, y2], W, H)

        txt = _single_line(text, max_len=220)
        if not txt:
            continue

        out.append({
            "text": txt,
            "conf": conf,
            "quad": [[float(p[0]), float(p[1])] for p in quad],
            "box": box,
        })

        if len(out) >= int(MAX_OCR_LINES):
            break

    return _sort_reading_order(out)
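
# --- Shape note (added) ---
# The defensive parsing above expects the classic PaddleOCR 2.x result shape,
# one entry per recognized line:
#   result[0] == [
#       [quad, (text, conf)],   # quad = [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
#       ...
#   ]
# Entries that do not match (None, short tuples) are skipped rather than
# raising, since the return shape has varied across PaddleOCR versions.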


def _is_text_heavy_label(label: str) -> bool:
    lab = (label or "").lower()
    keys = ["title", "text", "list", "table", "header", "heading"]
    return any(k in lab for k in keys)


def _crop_and_scale(frame_bgr: np.ndarray, box: List[int], scale: float) -> Optional[np.ndarray]:
    x1, y1, x2, y2 = box
    crop = frame_bgr[y1:y2, x1:x2]
    if crop is None or crop.size == 0:
        return None
    if scale is None or float(scale) >= 0.999:
        return crop
    return cv2.resize(crop, (0, 0), fx=float(scale), fy=float(scale), interpolation=cv2.INTER_AREA)


def run_paddle_ocr_on_text_regions(
    ocr: PaddleOCR,
    frame_bgr: np.ndarray,
    regions: List[dict],
    frame_type: str,
    max_regions: int = 10,
) -> List[dict]:
    """
    OCR ONLY on YOLO text-heavy regions (title/text/list/table/header).
    Angle classifier is OFF via cls=False.
    Crops are optionally downscaled by frame_type (slides/demo faster, code max).
    """
    H, W = frame_bgr.shape[:2]
    out: List[dict] = []

    scale = float(OCR_CROP_SCALE_BY_TYPE.get(str(frame_type), 0.80))

    text_regions = [r for r in regions if _is_text_heavy_label(r.get("label", ""))]
    text_regions = text_regions[: int(max_regions)]

    # If YOLO didn't detect any text region, fall back to full-frame OCR
    if not text_regions:
        return run_paddle_ocr(ocr, frame_bgr)

    for r in text_regions:
        box = r["box"]
        x1, y1, x2, y2 = box

        crop = _crop_and_scale(frame_bgr, box, scale=scale)
        if crop is None or crop.size == 0:
            continue

        rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        res = ocr.ocr(rgb, cls=False)  # cls OFF (angle cls OFF)
        lines = res[0] if res else []
        if lines is None or not isinstance(lines, list):
            continue
        if not lines:
            continue

        inv_scale = (1.0 / scale) if scale and scale > 0 else 1.0

        for line in lines:
            if line is None or not isinstance(line, (list, tuple)) or len(line) < 2:
                continue
            quad = line[0]
            pair = line[1]
            if quad is None or pair is None:
                continue
            if not isinstance(pair, (list, tuple)) or len(pair) < 2:
                continue
            text, conf = pair[0], pair[1]
            conf = float(conf)
            if conf < OCR_MIN_CONF:
                continue

            if not isinstance(quad, (list, tuple)) or len(quad) == 0:
                continue
            quad_global = []
            for p in quad:
                gx = float(p[0]) * inv_scale + float(x1)
                gy = float(p[1]) * inv_scale + float(y1)
                quad_global.append([gx, gy])

            xs = [p[0] for p in quad_global]
            ys = [p[1] for p in quad_global]
            gx1, gy1, gx2, gy2 = int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))
            gbox = _clip_box([gx1, gy1, gx2, gy2], W, H)

            txt = _single_line(text, max_len=220)
            if not txt:
                continue

            out.append({
                "text": txt,
                "conf": conf,
                "quad": quad_global,
                "box": gbox,
                "from_region_label": r.get("label", ""),
                "from_region_box": box,
                "crop_scale": float(scale),
            })

            if len(out) >= int(MAX_OCR_LINES):
                break

        if len(out) >= int(MAX_OCR_LINES):
            break

    return _sort_reading_order(out)
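
# --- Worked example (added note) ---
# Crop-local OCR coordinates are mapped back to full-frame pixels with
# inv_scale plus the crop origin. For a "slides" frame (scale 0.80, so
# inv_scale = 1.25) and a region box starting at (x1, y1) = (100, 50):
#   crop point (40, 16) -> global (40 * 1.25 + 100, 16 * 1.25 + 50) = (150, 70)
# so every box stored in the JSON is in full-frame pixel space regardless of
# the per-type crop downscaling.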


def attach_zones(regions: List[dict], W: int, H: int) -> Dict[str, List[dict]]:
    zones = {"top": [], "left": [], "center": [], "right": [], "bottom": []}
    for r in regions:
        z = _zone_for_box(r["box"], W, H)
        zones[z].append(r)
    for z in zones:
        zones[z] = _sort_reading_order(zones[z])
    return zones


def guess_title(regions: List[dict], ocr_lines: List[dict]) -> str:
    title_boxes = []
    for r in regions:
        lab = r.get("label", "").lower()
        if ("title" in lab) or (lab == "title") or ("header" in lab and "page" not in lab):
            title_boxes.append(r["box"])

    def inside(line_box, region_box) -> bool:
        x1, y1, x2, y2 = line_box
        rx1, ry1, rx2, ry2 = region_box
        return (x1 >= rx1 - 3 and y1 >= ry1 - 3 and x2 <= rx2 + 3 and y2 <= ry2 + 3)

    if title_boxes:
        lines = []
        for ob in ocr_lines:
            for tb in title_boxes:
                if inside(ob["box"], tb):
                    lines.append(ob["text"])
                    break
        lines = [x for x in lines if x]
        if lines:
            return " ".join(lines[:3]).strip()

    if ocr_lines:
        return ocr_lines[0]["text"]
    return ""


def attach_ocr_to_regions(regions: List[dict], ocr_lines: List[dict], pad: int = 3) -> List[dict]:
    def inside(line_box, region_box) -> bool:
        x1, y1, x2, y2 = line_box
        rx1, ry1, rx2, ry2 = region_box
        return (x1 >= rx1 - pad and y1 >= ry1 - pad and x2 <= rx2 + pad and y2 <= ry2 + pad)

    out = []
    for r in regions:
        rb = r.get("box")
        if not rb:
            out.append(r)
            continue

        texts = []
        lines_in = []
        for ln in ocr_lines:
            lb = ln.get("box")
            if lb and inside(lb, rb):
                t = ln.get("text", "")
                if t:
                    texts.append(t)
                lines_in.append(ln)

        rr = dict(r)
        rr["text_lines"] = texts
        rr["text"] = " ".join(texts).strip()
        rr["ocr_line_count"] = len(lines_in)
        out.append(rr)

    return out


# ----------------------------
# CLIP frame type classifier (no LLM)
# ----------------------------

def init_clip_classifier() -> Tuple[Any, Any, Dict[str, Any], str]:
    """
    Builds a robust CLIP classifier with:
      - POS prompt ensembling per class
      - NEG prompt ensembling per class
      - score = mean(sim to POS) - mean(sim to NEG)
    Returns:
      clip_model, preprocess, pack, device
    where pack contains text features and metadata.
    """
    if clip is None or torch is None or Image is None:
        raise RuntimeError(
            "CLIP dependencies missing. Install torch and CLIP "
            "(e.g. pip install torch and pip install git+https://github.com/openai/CLIP.git)."
        )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    try:
        model, preprocess = clip.load(CLIP_MODEL_NAME, device=device)
        model.eval()
    except Exception as e:
        raise RuntimeError(f"CLIP init failed for model '{CLIP_MODEL_NAME}': {type(e).__name__}: {e}") from e

    if len(CLIP_CLASS_PROMPTS) != len(CLIP_CLASS_LABELS):
        raise ValueError("CLIP_CLASS_PROMPTS must align with CLIP_CLASS_LABELS (same length).")

    if "CLIP_CLASS_NEG_PROMPTS" not in globals():
        raise ValueError("CLIP_CLASS_NEG_PROMPTS is missing. Define it (aligned with CLIP_CLASS_LABELS).")

    if len(CLIP_CLASS_NEG_PROMPTS) != len(CLIP_CLASS_LABELS):
        raise ValueError("CLIP_CLASS_NEG_PROMPTS must align with CLIP_CLASS_LABELS (same length).")

    flat_pos: List[str] = []
    pos_slices: List[Tuple[int, int]] = []
    idx = 0
    for prompts in CLIP_CLASS_PROMPTS:
        if not isinstance(prompts, list) or len(prompts) == 0:
            raise ValueError("Each entry in CLIP_CLASS_PROMPTS must be a non-empty list[str].")
        s = idx
        for p in prompts:
            if not isinstance(p, str):
                raise ValueError("All POS prompts must be strings.")
            flat_pos.append(p)
            idx += 1
        pos_slices.append((s, idx))

    flat_neg: List[str] = []
    neg_slices: List[Tuple[int, int]] = []
    idx = 0
    for prompts in CLIP_CLASS_NEG_PROMPTS:
        if not isinstance(prompts, list) or len(prompts) == 0:
            raise ValueError("Each entry in CLIP_CLASS_NEG_PROMPTS must be a non-empty list[str].")
        s = idx
        for p in prompts:
            if not isinstance(p, str):
                raise ValueError("All NEG prompts must be strings.")
            flat_neg.append(p)
            idx += 1
        neg_slices.append((s, idx))

    with torch.no_grad():
        pos_tokens = clip.tokenize(flat_pos).to(device)
        pos_feats_all = model.encode_text(pos_tokens)
        pos_feats_all = pos_feats_all / pos_feats_all.norm(dim=-1, keepdim=True)

        neg_tokens = clip.tokenize(flat_neg).to(device)
        neg_feats_all = model.encode_text(neg_tokens)
        neg_feats_all = neg_feats_all / neg_feats_all.norm(dim=-1, keepdim=True)

    pos_class_feats: List[torch.Tensor] = []
    neg_class_feats: List[torch.Tensor] = []

    for (s, e) in pos_slices:
        pos_class_feats.append(pos_feats_all[s:e])
    for (s, e) in neg_slices:
        neg_class_feats.append(neg_feats_all[s:e])

    pack = {
        "labels": CLIP_CLASS_LABELS,
        "pos_class_feats": pos_class_feats,
        "neg_class_feats": neg_class_feats,
        "score_mode": str(CLIP_SCORE_MODE),
        "min_margin": float(CLIP_MIN_MARGIN),
    }
    return model, preprocess, pack, device


def classify_frame_clip(
    *,
    frame_bgr: np.ndarray,
    clip_model: Any,
    clip_preprocess: Any,
    clip_text_features: Any,
    clip_device: str,
) -> Tuple[str, Dict[str, float]]:
    pack = clip_text_features
    labels: List[str] = pack["labels"]
    pos_class_feats: List[Any] = pack["pos_class_feats"]
    neg_class_feats: List[Any] = pack["neg_class_feats"]

    none_margin: float = float(pack.get("none_margin", 0.02))
    weak_thr: float = float(pack.get("weak_thr", 0.00))
    slide_close: float = float(pack.get("slide_close", 0.03))

    rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(rgb)
    image = clip_preprocess(img).unsqueeze(0).to(clip_device)

    with torch.no_grad():
        img_feat = clip_model.encode_image(image)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)

        scores: List[float] = []
        for i in range(len(labels)):
            pos_feats = pos_class_feats[i].to(clip_device)
            neg_feats = neg_class_feats[i].to(clip_device)

            pos_sims = (img_feat @ pos_feats.T).squeeze(0)
            neg_sims = (img_feat @ neg_feats.T).squeeze(0)

            score = float(pos_sims.mean().item() - neg_sims.mean().item())
            scores.append(score)

    scores_np = np.array(scores, dtype=np.float32)
    score_map: Dict[str, float] = {labels[i]: float(scores_np[i]) for i in range(len(labels))}

    if "none" not in labels:
        best_idx = int(np.argmax(scores_np))
        pred = labels[best_idx]
        if ("slides" in labels) and (pred != "slides"):
            winner_score = float(score_map[pred])
            slides_score = float(score_map["slides"])
            if (winner_score - slides_score) < float(slide_close):
                pred = "slides"
        score_map["_slide_close"] = float(slide_close)
        return pred, score_map

    none_idx = int(labels.index("none"))
    none_score = float(scores_np[none_idx])

    non_none_idxs = [i for i, lab in enumerate(labels) if lab != "none"]
    best_non_none_idx = int(max(non_none_idxs, key=lambda i: float(scores_np[i])))
    best_non_none_label = labels[best_non_none_idx]
    best_non_none_score = float(scores_np[best_non_none_idx])

    if (none_score >= best_non_none_score + none_margin) or (best_non_none_score < weak_thr):
        pred = "none"
    else:
        pred = best_non_none_label

    if pred != "none" and ("slides" in labels) and (pred != "slides"):
        slides_score = float(score_map["slides"])
        winner_score = float(score_map[pred])
        if (winner_score - slides_score) < float(slide_close):
            pred = "slides"

    score_map["_best_non_none_score"] = float(best_non_none_score)
    score_map["_none_score"] = float(none_score)
    score_map["_none_margin"] = float(none_margin)
    score_map["_weak_thr"] = float(weak_thr)
    score_map["_best_non_none_idx"] = float(best_non_none_idx)
    score_map["_none_idx"] = float(none_idx)
    score_map["_slide_close"] = float(slide_close)
    if "slides" in labels:
        score_map["_slides_score"] = float(score_map["slides"])

    return pred, score_map
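
# --- Behavior note (added) ---
# init_clip_classifier() stores "score_mode" and "min_margin" in the pack, but
# classify_frame_clip() reads "none_margin", "weak_thr" and "slide_close",
# which are never set there. As written, the .get() fallbacks therefore always
# apply: none_margin=0.02, weak_thr=0.00, slide_close=0.03. Add those keys to
# the pack in init_clip_classifier() if they should be tunable.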


# ----------------------------
# Candidate detection (cheap, local)
# ----------------------------

def find_candidates_diff(
    video_path: Path,
    sample_fps: float,
    resize_w: int,
    candidate_percentile: float,
    max_candidates: int,
) -> Tuple[List[CandidateFrame], float]:
    fps, duration, total_frames = _probe_video(video_path)
    if duration <= 0 or total_frames <= 0:
        raise RuntimeError("Could not determine video duration/frames.")

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        raise RuntimeError(f"Could not open video: {video_path}")

    sample_fps = float(sample_fps)
    if sample_fps <= 0:
        raise ValueError("sample_fps must be > 0")

    step_frames = max(1, int(round(fps / sample_fps)))

    print(f" [step1] video_fps={fps:.3f} duration_sec={duration:.2f} total_frames={total_frames}")
    print(f" [step1] SAMPLE_FPS={sample_fps} -> step_frames={step_frames} (~{1.0/sample_fps:.2f}s per sample)")
    print(f" [step1] RESIZE_W={resize_w} CANDIDATE_PERCENTILE={candidate_percentile} MAX_CANDIDATES={max_candidates}")

    candidates: List[CandidateFrame] = []
    diffs: List[float] = []

    prev_gray = None
    sampled = 0

    max_k = int((total_frames - 1) // step_frames) if total_frames > 0 else 0

    for k in range(max_k + 1):
        frame_idx = int(k * step_frames)
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
        ret, frame = cap.read()
        if not ret or frame is None:
            break

        sampled += 1
        t_sec = frame_idx / fps

        gray = _downscale_gray(frame, resize_w=resize_w)
        d = 999.0 if prev_gray is None else _mad_diff(gray, prev_gray)

        candidates.append(CandidateFrame(t_sec=float(t_sec), frame_idx=int(frame_idx), diff_score=float(d)))
        diffs.append(float(d))
        prev_gray = gray

        if sampled % 300 == 0:
            print(f" [step1] sampled={sampled} last_t={fmt_hhmmss(t_sec)} last_diff={d:.2f}")

    cap.release()

    if not candidates:
        print(" [step1] no candidates produced (empty video?)")
        return [], 0.0

    diffs_np = np.array(diffs, dtype=np.float32)
    diffs_for_thr = diffs_np[1:] if len(diffs_np) > 1 else diffs_np
    base_thr = float(np.percentile(diffs_for_thr, float(candidate_percentile)))
    base_thr = max(4.0, base_thr)

    order = np.argsort(diffs_np)[::-1]
    picked = set()
    out: List[CandidateFrame] = []

    out.append(candidates[0])
    picked.add(0)

    for idx in order:
        if len(out) >= int(max_candidates):
            break
        ii = int(idx)
        if ii in picked:
            continue
        out.append(candidates[ii])
        picked.add(ii)

    out.sort(key=lambda x: x.t_sec)

    print(f" [step1] sampled_frames={sampled} raw_candidates={len(candidates)} selected_candidates={len(out)} base_thr={base_thr:.2f}")
    return out, base_thr
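
# --- Worked example (added note) ---
# With SAMPLE_FPS=1.0 on a 30 fps video, step_frames = round(30 / 1.0) = 30,
# i.e. one sampled frame per second. The threshold is the 70th percentile of
# the per-sample diffs (the first sample is excluded, since its 999.0 sentinel
# would skew the percentile), and the max(4.0, ...) floor keeps near-static
# recordings from producing a uselessly low threshold.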


# ----------------------------
# Keyframe keep rule (visual only)
# ----------------------------

def should_keep_visual_only(
    *,
    frame_type: str,
    t_sec: float,
    diff_to_last_keep: float,
    base_thr: float,
    last_kept_t: float,
) -> Tuple[bool, Dict[str, float]]:
    cfg = SENS.get(frame_type, {"min_gap_sec": 1.0, "diff_mult": 1.0})
    diff_mult = float(cfg.get("diff_mult", 1.0))

    min_gap = float(MIN_KEYFRAME_GAP_SEC)

    ok_gap = True if last_kept_t <= -1e8 else ((t_sec - last_kept_t) >= min_gap)
    thr_eff = float(base_thr * diff_mult)
    ok_visual = diff_to_last_keep >= thr_eff

    debug = {
        "diff_to_last_keep": float(diff_to_last_keep),
        "thr_effective": float(thr_eff),
        "ok_gap": 1.0 if ok_gap else 0.0,
        "ok_visual": 1.0 if ok_visual else 0.0,
        "min_gap_sec_used": float(min_gap),
    }
    return (ok_gap and ok_visual), debug
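
# --- Worked example (added note) ---
# A "demo" candidate at t=125.0s with the last keyframe kept at t=123.0s:
#   ok_gap = (125.0 - 123.0) >= 3.0 -> False, so it is rejected on gap alone.
# The same candidate at t=127.0s with diff_to_last_keep=7.0 and base_thr=10.0:
#   ok_gap    = 4.0 >= 3.0               -> True
#   ok_visual = 7.0 >= 10.0 * 0.60 (6.0) -> True  => kept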


# ----------------------------
# Concurrent parsing worker (YOLO + OCR) for kept keyframes
# ----------------------------

_WORKER_LAYOUT_MODEL = None
_WORKER_OCR_MODEL = None

def _worker_init(layout_weights: str, ocr_lang: str, enable_yolo: bool = True):
    global _WORKER_LAYOUT_MODEL, _WORKER_OCR_MODEL
    _WORKER_LAYOUT_MODEL = YOLO(layout_weights) if enable_yolo else None

    # IMPORTANT:
    # - use_angle_cls=False: turn off angle classifier
    # - use_gpu=USE_GPU: attempts GPU (requires paddlepaddle-gpu)
    _WORKER_OCR_MODEL = PaddleOCR(
        use_angle_cls=False,
        lang=ocr_lang,
        use_gpu=USE_GPU,
        show_log=False,
        enable_mkldnn=False,
        ir_optim=False,
    )

def _parse_one_keyframe(job: dict) -> dict:
    global _WORKER_LAYOUT_MODEL, _WORKER_OCR_MODEL
    kidx = int(job["keyframe_idx"])
    img_path = job["image_path"]
    frame_type = str(job.get("frame_type", "none"))
    parse_mode = str(job.get("parse_mode", "yolo_ocr"))

    frame = cv2.imread(str(img_path))
    if frame is None:
        return {"keyframe_idx": kidx, "error": f"Could not read image: {img_path}"}

    # Resize for slides/demo/none to speed up YOLO+OCR; keep code max
    max_w = int(PARSE_MAX_W_BY_TYPE.get(frame_type, 1280))
    frame_for_parse = _resize_frame_max_w(frame, max_w=max_w) if max_w < 99999 else frame

    H, W = frame_for_parse.shape[:2]

    regions: List[dict] = []
    t_yolo_ms = 0.0
    if parse_mode == "yolo_ocr":
        if _WORKER_LAYOUT_MODEL is None:
            return {"keyframe_idx": kidx, "error": "YOLO model is not initialized for yolo_ocr parse mode."}
        t0 = time.perf_counter()
        regions = run_layout_yolo(_WORKER_LAYOUT_MODEL, frame_for_parse)
        t_yolo_ms = (time.perf_counter() - t0) * 1000.0

    t0 = time.perf_counter()
    if parse_mode == "yolo_ocr":
        ocr_lines = run_paddle_ocr_on_text_regions(
            _WORKER_OCR_MODEL,
            frame_for_parse,
            regions,
            frame_type=frame_type,
            max_regions=OCR_CROP_MAX_REGIONS,
        )
    else:
        # OCR-only mode: no layout detection, run full-frame OCR.
        ocr_lines = run_paddle_ocr(_WORKER_OCR_MODEL, frame_for_parse)
    t_ocr_ms = (time.perf_counter() - t0) * 1000.0

    t0 = time.perf_counter()
    regions_with_text = attach_ocr_to_regions(regions, ocr_lines) if regions else []
    zones = attach_zones(regions_with_text, W=W, H=H) if regions_with_text else {"top": [], "left": [], "center": [], "right": [], "bottom": []}
    title_guess_val = guess_title(regions_with_text, ocr_lines)
    t_attach_ms = (time.perf_counter() - t0) * 1000.0

    text_lines = [x["text"] for x in ocr_lines if x.get("text")][:MAX_OCR_LINES]

    screen_parse = {
        "frame_w": int(W),
        "frame_h": int(H),
        "layout_regions": regions_with_text,
        "ocr_lines": ocr_lines,
        "zones": zones,
        "title_guess": title_guess_val,
        "layout_model": str(LAYOUT_YOLO_WEIGHTS),
        "ocr_lang": str(OCR_LANG),
        "layout_conf": float(LAYOUT_CONF),
        "layout_iou": float(LAYOUT_IOU),
        "ocr_min_conf": float(OCR_MIN_CONF),
        "parse_input_frame_type": str(frame_type),
        "yolo_device": str(YOLO_DEVICE),
        "yolo_imgsz": int(YOLO_IMGSZ),
        "ocr_use_gpu": bool(USE_GPU),
        "ocr_angle_cls": False,
        "ocr_crop_max_regions": int(OCR_CROP_MAX_REGIONS),
        "ocr_crop_scale_used": float(OCR_CROP_SCALE_BY_TYPE.get(frame_type, 0.80)),
        "parse_max_w_used": int(max_w),
        "parse_mode": str(parse_mode),
    }

    return {
        "keyframe_idx": kidx,
        "on_screen_text": text_lines,
        "screen_parse": screen_parse,
        "parse_timings_ms": {
            "full_yolo_ms": float(t_yolo_ms),
            "full_ocr_ms": float(t_ocr_ms),
            "attach_text_ms": float(t_attach_ms),
        },
    }


# ----------------------------
# Main
# ----------------------------

def main():
    load_dotenv()

    ap = argparse.ArgumentParser()
    ap.add_argument("--video", required=True, help="Path to meeting.mp4")
    ap.add_argument("--out", required=True, help="Output folder")
    ap.add_argument("--force", action="store_true")
    ap.add_argument(
        "--no-yolo-for-non-demo",
        action="store_true",
        help="Use OCR-only parsing for non-demo frames (slides/code/none).",
    )
    args = ap.parse_args()

    if not ENABLE_LOCAL_SCREEN_PARSE:
        raise RuntimeError("ENABLE_LOCAL_SCREEN_PARSE must be True. YOLO and PaddleOCR are required.")

    if not Path(LAYOUT_YOLO_WEIGHTS).exists():
        raise FileNotFoundError(f"Layout YOLO weights not found at: {LAYOUT_YOLO_WEIGHTS}")

    try:
        _ = YOLO(LAYOUT_YOLO_WEIGHTS)
    except Exception as e:
        raise RuntimeError(f"YOLO init failed: {type(e).__name__}: {e}") from e

    # NOTE: this tries GPU; if your Paddle is CPU-only, this may error.
    # In that case install paddlepaddle-gpu, or set USE_GPU=False.
    try:
        _ = PaddleOCR(
            use_angle_cls=False,
            lang=OCR_LANG,
            use_gpu=USE_GPU,
            show_log=False,
            enable_mkldnn=False,
            ir_optim=False,
        )
    except Exception as e:
        raise RuntimeError(f"PaddleOCR init failed: {type(e).__name__}: {e}") from e

    try:
        clip_model, clip_preprocess, clip_text_features, clip_device = init_clip_classifier()
    except Exception as e:
        raise RuntimeError(f"CLIP classifier init failed: {type(e).__name__}: {e}") from e

    video_path = Path(args.video).resolve()
    out_dir = Path(args.out).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    frames_dir = out_dir / "frames_selected"
    frames_dir.mkdir(parents=True, exist_ok=True)

    enriched_json = out_dir / "keyframes_parsed.json"
    timing_json = out_dir / "timing_summary.json"
    classified_dir = out_dir / "classified"
    classified_dir.mkdir(parents=True, exist_ok=True)

    out_paths = {
        "slides": classified_dir / "slides_keyframes.json",
        "code": classified_dir / "code_keyframes.json",
        "demo": classified_dir / "demo_keyframes.json",
        "none": classified_dir / "none_keyframes.json",
    }

    t_total0 = time.perf_counter()
    timing_totals = {
        "candidate_detection_ms": 0.0,
        "candidate_loop_ms": 0.0,
        "read_frame_ms": 0.0,
        "gray_diff_ms": 0.0,
        "clip_ms": 0.0,
        "keep_logic_ms": 0.0,
        "save_frame_ms": 0.0,
        "parse_concurrent_ms": 0.0,
        "json_write_ms": 0.0,
    }

    all_selected: List[dict] = []
    processed_times: set = set()

    last_kept_t = -1e9
    last_kept_gray: Optional[np.ndarray] = None

    if (not args.force) and enriched_json.exists():
        try:
            all_selected = safe_read_json(enriched_json)
            if isinstance(all_selected, list) and all_selected:
                processed_times = {round(float(x.get("t_sec", -1.0)), 2) for x in all_selected if "t_sec" in x}
                last = all_selected[-1]
                last_kept_t = float(last.get("t_sec", last_kept_t))

                last_img = Path(last.get("image_path", ""))
                if last_img.exists():
                    img = cv2.imread(str(last_img))
                    if img is not None:
                        last_kept_gray = _downscale_gray(img, RESIZE_W)

                print(f"Resuming: already selected {len(all_selected)} keyframes (last at {fmt_hhmmss(last_kept_t)}).")
        except Exception:
            all_selected = []
            processed_times = set()
            last_kept_t = -1e9
            last_kept_gray = None

    if args.force:
        all_selected = []
        processed_times = set()
        last_kept_t = -1e9
        last_kept_gray = None

    print("1) Finding candidate change points locally (no API)...")
    print(" [step1] starting... (this can take time on long videos)")
    t0 = time.perf_counter()
    candidates, base_thr = find_candidates_diff(
        video_path=video_path,
        sample_fps=SAMPLE_FPS,
        resize_w=RESIZE_W,
        candidate_percentile=CANDIDATE_PERCENTILE,
        max_candidates=MAX_CANDIDATES,
    )
    t1_ms = (time.perf_counter() - t0) * 1000.0
    timing_totals["candidate_detection_ms"] += t1_ms
    print(f" [step1] done in {t1_ms/1000.0:.2f}s")

    print(f"Candidates: {len(candidates)}, base diff threshold ~ {base_thr:.2f}")
    print("Sensitivity config (edit in code):", SENS)
    print("Layout model:", LAYOUT_YOLO_WEIGHTS)
    print("YOLO device:", YOLO_DEVICE, "| imgsz:", YOLO_IMGSZ)
    print("OCR lang:", OCR_LANG, "| OCR_MIN_CONF:", OCR_MIN_CONF, "| OCR GPU:", USE_GPU, "| angle_cls:", False)
    print("CLIP model:", CLIP_MODEL_NAME, "| device:", clip_device)
    print("Parse workers:", PARSE_WORKERS)
    print(f"Global min gap override (seconds since last keyframe): {MIN_KEYFRAME_GAP_SEC:.2f}s")

    kept_count = len(all_selected)
    reader = VideoReader(video_path)

    try:
        print("2) Selecting keyframes (VISUAL ONLY: time gap + diff; no OCR in loop)...")
        t_loop0 = time.perf_counter()

        for i, cand in enumerate(candidates, start=1):
            if kept_count >= int(MAX_FRAMES):
                break

            t_key = round(float(cand.t_sec), 2)
            if t_key in processed_times:
                continue
            if cand.t_sec <= (last_kept_t + 1e-6) and last_kept_t > -1e8:
                continue

            gap = float(cand.t_sec - last_kept_t) if last_kept_t > -1e8 else 9999.0

            if last_kept_t > -1e8 and gap < float(MIN_KEYFRAME_GAP_SEC):
                continue

            t0 = time.perf_counter()
            frame = reader.read_at_frame(cand.frame_idx)
            timing_totals["read_frame_ms"] += (time.perf_counter() - t0) * 1000.0
            if frame is None:
                continue

            t0 = time.perf_counter()
            gray_now = _downscale_gray(frame, RESIZE_W)
            diff_to_last_keep = 999.0 if last_kept_gray is None else _mad_diff(gray_now, last_kept_gray)
            timing_totals["gray_diff_ms"] += (time.perf_counter() - t0) * 1000.0

            print(
                f"[{i}/{len(candidates)}] t={fmt_hhmmss(cand.t_sec)} "
                f"gap_since_last_keep={gap:.2f}s cand_diff={cand.diff_score:.2f} keep_diff={diff_to_last_keep:.2f} ..."
            )

            frame_fast = _resize_frame_max_w(frame, FAST_FRAME_MAX_W)

            t0 = time.perf_counter()
            frame_type, clip_probs = classify_frame_clip(
                frame_bgr=frame_fast,
                clip_model=clip_model,
                clip_preprocess=clip_preprocess,
                clip_text_features=clip_text_features,
                clip_device=clip_device,
            )
            t_clip_ms = (time.perf_counter() - t0) * 1000.0
            timing_totals["clip_ms"] += t_clip_ms

            t0 = time.perf_counter()
            keep, dbg = should_keep_visual_only(
                frame_type=frame_type,
                t_sec=float(cand.t_sec),
                diff_to_last_keep=float(diff_to_last_keep),
                base_thr=float(base_thr),
                last_kept_t=float(last_kept_t),
            )
            t_keep_ms = (time.perf_counter() - t0) * 1000.0
            timing_totals["keep_logic_ms"] += t_keep_ms

            print(
                f" timings: clip={t_clip_ms:.0f}ms keep_logic={t_keep_ms:.0f}ms "
                f"| type={frame_type} keep={keep} | diff={diff_to_last_keep:.2f} thr_eff={dbg['thr_effective']:.2f} "
                f"| min_gap_used={dbg.get('min_gap_sec_used', MIN_KEYFRAME_GAP_SEC):.2f}s"
            )

            if not keep:
                if BASE_SLEEP_SEC > 0:
                    time.sleep(BASE_SLEEP_SEC)
                continue

            t0 = time.perf_counter()
            out_img = frames_dir / f"frame_{kept_count:04d}_{cand.t_sec:.2f}s_{frame_type}.jpg"
            cv2.imwrite(str(out_img), frame)
            t_save_ms = (time.perf_counter() - t0) * 1000.0
            timing_totals["save_frame_ms"] += t_save_ms

            item = {
                "keyframe_idx": int(kept_count),
                "t_sec": float(cand.t_sec),
                "timestamp": fmt_hhmmss(cand.t_sec),
                "image_path": str(out_img),

                "frame_type": frame_type,
                "on_screen_text": [],
                "screen_parse": None,

                "candidate_diff_score": float(cand.diff_score),
                "diff_to_last_keep": float(diff_to_last_keep),
                "base_diff_threshold": float(base_thr),
                "thr_effective": float(dbg.get("thr_effective", 0.0)),
                "gap_since_last_keep_sec": float(gap),

                "clip_probs": {k: float(v) for k, v in clip_probs.items()},
                "clip_prompt_map": dict(zip(CLIP_CLASS_LABELS, CLIP_CLASS_PROMPTS)),
                "clip_model_name": str(CLIP_MODEL_NAME),

                "timings_ms": {
                    "clip_ms": float(t_clip_ms),
                    "keep_logic_ms": float(t_keep_ms),
                    "save_frame_ms": float(t_save_ms),
                },
            }

            all_selected.append(item)
            processed_times.add(t_key)
            kept_count += 1

            last_kept_t = float(cand.t_sec)
            last_kept_gray = gray_now

            t0 = time.perf_counter()
            safe_write_json(enriched_json, all_selected)
            timing_totals["json_write_ms"] += (time.perf_counter() - t0) * 1000.0

            if BASE_SLEEP_SEC > 0:
                time.sleep(BASE_SLEEP_SEC)

        timing_totals["candidate_loop_ms"] += (time.perf_counter() - t_loop0) * 1000.0

    finally:
        reader.close()

    # Phase 3: YOLO + OCR concurrently on keyframes that need parsing
    to_parse = []
    for it in all_selected:
        if (not args.force) and isinstance(it.get("screen_parse"), dict) and it.get("on_screen_text"):
            continue
        if it.get("image_path"):
            frame_type = str(it.get("frame_type", "none"))
            parse_mode = "yolo_ocr"
            if args.no_yolo_for_non_demo and frame_type != "demo":
                parse_mode = "ocr_only"
            to_parse.append({
                "keyframe_idx": int(it["keyframe_idx"]),
                "t_sec": float(it["t_sec"]),
                "frame_type": frame_type,
                "image_path": str(it["image_path"]),
                "parse_mode": parse_mode,
            })

    print(f"3) Parsing kept keyframes with YOLO+OCR concurrently... to_parse={len(to_parse)}")

    if to_parse:
        yolo_jobs = sum(1 for j in to_parse if j.get("parse_mode") == "yolo_ocr")
        ocr_only_jobs = len(to_parse) - yolo_jobs
        enable_yolo = yolo_jobs > 0

        print(" [step3] starting ProcessPoolExecutor...")
        print(f" [step3] PARSE_WORKERS={PARSE_WORKERS} (each worker loads YOLO + PaddleOCR once)")
        print(f" [step3] YOLO_DEVICE={YOLO_DEVICE} YOLO_IMGSZ={YOLO_IMGSZ} | OCR_GPU={USE_GPU} angle_cls=False")
        print(f" [step3] OCR crops: max_regions={OCR_CROP_MAX_REGIONS} scale_by_type={OCR_CROP_SCALE_BY_TYPE}")
        print(f" [step3] Parse resize max_w_by_type={PARSE_MAX_W_BY_TYPE}")
        print(f" [step3] parse_mode split: yolo_ocr={yolo_jobs}, ocr_only={ocr_only_jobs}")

        t0 = time.perf_counter()

        with cf.ProcessPoolExecutor(
            max_workers=max(1, PARSE_WORKERS),
            initializer=_worker_init,
            initargs=(str(LAYOUT_YOLO_WEIGHTS), str(OCR_LANG), bool(enable_yolo)),
        ) as ex:
            fut_to_job = {ex.submit(_parse_one_keyframe, job): job for job in to_parse}

            done_count = 0
            err_count = 0
            t_last_report = time.perf_counter()

            for fut in cf.as_completed(fut_to_job):
                job = fut_to_job[fut]
                job_kidx = int(job.get("keyframe_idx", -1))
                done_count += 1

                try:
                    res = fut.result()
                except Exception as e:
                    err_count += 1
                    if 0 <= job_kidx < len(all_selected):
                        all_selected[job_kidx]["screen_parse_error"] = f"worker_exception: {type(e).__name__}: {e}"
                    now = time.perf_counter()
                    if (now - t_last_report) >= 1.0 or done_count == len(fut_to_job):
                        print(f" [step3] progress {done_count}/{len(fut_to_job)} parsed (errors={err_count})")
                        t_last_report = now
                    continue

                kidx = int(res.get("keyframe_idx", job_kidx))
                if kidx < 0 or kidx >= len(all_selected):
                    now = time.perf_counter()
                    if (now - t_last_report) >= 1.0 or done_count == len(fut_to_job):
                        print(f" [step3] progress {done_count}/{len(fut_to_job)} parsed (errors={err_count})")
                        t_last_report = now
                    continue

                # Track explicit worker-level error payloads.
                if "error" in res:
                    err_count += 1
                    all_selected[kidx]["screen_parse_error"] = res["error"]
                else:
                    all_selected[kidx]["on_screen_text"] = res.get("on_screen_text", [])[:MAX_OCR_LINES]
|
| 1374 |
+
all_selected[kidx]["screen_parse"] = res.get("screen_parse")
|
| 1375 |
+
tm = all_selected[kidx].get("timings_ms", {}) or {}
|
| 1376 |
+
tm.update(res.get("parse_timings_ms", {}) or {})
|
| 1377 |
+
all_selected[kidx]["timings_ms"] = tm
|
| 1378 |
+
|
| 1379 |
+
now = time.perf_counter()
|
| 1380 |
+
if (now - t_last_report) >= 1.0 or done_count == len(fut_to_job):
|
| 1381 |
+
print(f" [step3] progress {done_count}/{len(fut_to_job)} parsed (errors={err_count})")
|
| 1382 |
+
t_last_report = now
|
| 1383 |
+
|
| 1384 |
+
t3_ms = (time.perf_counter() - t0) * 1000.0
|
| 1385 |
+
timing_totals["parse_concurrent_ms"] += t3_ms
|
| 1386 |
+
print(f" [step3] done in {t3_ms/1000.0:.2f}s (errors={err_count})")
|
| 1387 |
+
|
| 1388 |
+
# Rebuild buckets from final frame_type
|
| 1389 |
+
buckets: Dict[str, List[dict]] = {k: [] for k in out_paths.keys()}
|
| 1390 |
+
for it in all_selected:
|
| 1391 |
+
ft = it.get("frame_type", "none")
|
| 1392 |
+
if ft not in buckets:
|
| 1393 |
+
ft = "none"
|
| 1394 |
+
it["frame_type"] = "none"
|
| 1395 |
+
buckets[ft].append(it)
|
| 1396 |
+
|
| 1397 |
+
# Final writes
|
| 1398 |
+
t0 = time.perf_counter()
|
| 1399 |
+
safe_write_json(enriched_json, all_selected)
|
| 1400 |
+
for ft, p in out_paths.items():
|
| 1401 |
+
safe_write_json(p, buckets[ft])
|
| 1402 |
+
timing_totals["json_write_ms"] += (time.perf_counter() - t0) * 1000.0
|
| 1403 |
+
|
| 1404 |
+
total_ms = (time.perf_counter() - t_total0) * 1000.0
|
| 1405 |
+
|
| 1406 |
+
timing_summary = {
|
| 1407 |
+
"timing_totals_ms": {k: float(v) for k, v in timing_totals.items()},
|
| 1408 |
+
"total_ms": float(total_ms),
|
| 1409 |
+
"candidates": int(len(candidates)),
|
| 1410 |
+
"selected_frames": int(len(all_selected)),
|
| 1411 |
+
"parsed_frames": int(sum(1 for x in all_selected if isinstance(x.get("screen_parse"), dict))),
|
| 1412 |
+
"parse_workers": int(PARSE_WORKERS),
|
| 1413 |
+
"min_keyframe_gap_sec": float(MIN_KEYFRAME_GAP_SEC),
|
| 1414 |
+
"yolo_device": str(YOLO_DEVICE),
|
| 1415 |
+
"yolo_imgsz": int(YOLO_IMGSZ),
|
| 1416 |
+
"ocr_use_gpu": bool(USE_GPU),
|
| 1417 |
+
"ocr_angle_cls": False,
|
| 1418 |
+
"ocr_crop_max_regions": int(OCR_CROP_MAX_REGIONS),
|
| 1419 |
+
"ocr_crop_scale_by_type": dict(OCR_CROP_SCALE_BY_TYPE),
|
| 1420 |
+
"parse_max_w_by_type": dict(PARSE_MAX_W_BY_TYPE),
|
| 1421 |
+
}
|
| 1422 |
+
safe_write_json(timing_json, timing_summary)
|
| 1423 |
+
|
| 1424 |
+
print("\nDone.")
|
| 1425 |
+
print("Selected frames:", len(all_selected))
|
| 1426 |
+
print("Frames folder:", frames_dir)
|
| 1427 |
+
print("Parsed JSON:", enriched_json)
|
| 1428 |
+
print("Timing JSON:", timing_json)
|
| 1429 |
+
for ft, p in out_paths.items():
|
| 1430 |
+
print(ft, "->", p)
|
| 1431 |
+
|
| 1432 |
+
print("\nTiming summary (ms):")
|
| 1433 |
+
for k, v in timing_totals.items():
|
| 1434 |
+
print(f" {k}: {v:.0f}")
|
| 1435 |
+
print(f" total_ms: {total_ms:.0f}")
|
| 1436 |
+
|
| 1437 |
+
|
| 1438 |
+
if __name__ == "__main__":
|
| 1439 |
+
try:
|
| 1440 |
+
main()
|
| 1441 |
+
except Exception as e:
|
| 1442 |
+
print(f"[ERROR] {type(e).__name__}: {e}")
|
| 1443 |
+
raise
|
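The phase-3 fan-out above leans on `ProcessPoolExecutor`'s initializer hook: each worker process loads its heavy models once (YOLO + PaddleOCR) and then serves many frames, so model load cost is paid per worker rather than per keyframe. A minimal, self-contained sketch of that pattern; the worker bodies here are illustrative stand-ins, not the real `_worker_init` / `_parse_one_keyframe` from the script above:

import concurrent.futures as cf

_MODELS = {}  # per-process cache, populated once by the initializer

def _init(weights_path: str) -> None:
    # Runs once in each worker process; load expensive models here.
    _MODELS["weights"] = weights_path  # stand-in for loading YOLO/PaddleOCR

def _parse_one(job: dict) -> dict:
    # Every job reuses whatever _init cached in this process.
    return {"keyframe_idx": job["keyframe_idx"], "weights": _MODELS["weights"]}

if __name__ == "__main__":
    jobs = [{"keyframe_idx": i} for i in range(8)]
    with cf.ProcessPoolExecutor(max_workers=2, initializer=_init, initargs=("model.pt",)) as ex:
        futures = {ex.submit(_parse_one, j): j for j in jobs}
        for fut in cf.as_completed(futures):
            print(fut.result())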
requirements.txt ADDED
@@ -0,0 +1,17 @@
gradio>=5.0.0
fastapi==0.116.1
uvicorn==0.34.3
python-multipart==0.0.20
setuptools==70.0.0
wheel==0.45.1
python-dotenv==1.2.1
deepgram-sdk==4.8.0
httpx==0.28.1
google-genai==1.60.0
pydantic==2.12.5
opencv-python-headless==4.11.0.86
numpy==1.26.4
ultralytics==8.4.12
paddleocr==2.7.3
paddlepaddle==2.6.2
torch==2.5.1
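A quick way to sanity-check that the pinned stack above installed cleanly (a minimal sketch; note that opencv-python-headless is imported as cv2):

# Smoke test: import each pinned dependency and report its version if it exposes one.
import importlib

for mod in ("gradio", "fastapi", "httpx", "numpy", "cv2", "ultralytics", "paddleocr", "torch"):
    m = importlib.import_module(mod)
    print(f"{mod}: {getattr(m, '__version__', 'unknown')}")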
run_manager.py ADDED
@@ -0,0 +1,581 @@
from __future__ import annotations

import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
import time
import uuid
from html import unescape
from pathlib import Path
from typing import Any, Dict, Optional
from urllib.parse import parse_qs, urljoin, urlparse

import httpx


BASE_DIR = Path(__file__).resolve().parent
PIPELINES_DIR = BASE_DIR / "pipelines"
DEFAULT_WORKDIR = Path(os.getenv("PIPELINE_WORKDIR", tempfile.gettempdir())) / "deployed-meet-runs"
DEFAULT_WORKDIR.mkdir(parents=True, exist_ok=True)
RUNS_DIR = DEFAULT_WORKDIR / "runs"
RUNS_DIR.mkdir(parents=True, exist_ok=True)


def _tail(text: str, max_lines: int = 220) -> str:
    lines = (text or "").splitlines()
    if len(lines) <= max_lines:
        return "\n".join(lines)
    return "\n".join(lines[-max_lines:])


def _run_dir(run_id: str) -> Path:
    return RUNS_DIR / run_id


def _meta_path(run_id: str) -> Path:
    return _run_dir(run_id) / "run_meta.json"


def _logs_path(run_id: str) -> Path:
    return _run_dir(run_id) / "pipeline.log"


def _write_json(path: Path, data: Dict[str, Any]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    tmp.replace(path)


def _read_json(path: Path) -> Dict[str, Any]:
    return json.loads(path.read_text(encoding="utf-8"))


def _extract_gdrive_file_id(url: str) -> Optional[str]:
    parsed = urlparse(url)
    host = (parsed.netloc or "").lower()
    if "drive.google.com" not in host:
        return None

    m = re.search(r"/file/d/([a-zA-Z0-9_-]+)", parsed.path or "")
    if m:
        return m.group(1)

    qs = parse_qs(parsed.query or "")
    if "id" in qs and qs["id"]:
        return qs["id"][0]

    return None


def _download_google_drive(url: str, out_path: Path) -> None:
    file_id = _extract_gdrive_file_id(url)
    if not file_id:
        raise ValueError("Could not parse Google Drive file id from video_url.")

    direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    def _is_html_response(resp: httpx.Response) -> bool:
        ctype = (resp.headers.get("content-type") or "").lower()
        if "html" in ctype or "text/plain" in ctype:
            return True
        head = (resp.content[:256] or b"").lower()
        return b"<html" in head or b"<!doctype html" in head

    def _write_if_file(resp: httpx.Response) -> bool:
        if _is_html_response(resp):
            return False
        if not resp.content or len(resp.content) < 1024:
            return False
        out_path.write_bytes(resp.content)
        return True

    with httpx.Client(timeout=120.0, follow_redirects=True) as client:
        candidates = [
            direct_url,
            f"https://drive.usercontent.google.com/download?id={file_id}&export=download&confirm=t",
        ]
        for c in candidates:
            rr = client.get(c)
            rr.raise_for_status()
            if _write_if_file(rr):
                return

        page = client.get(f"https://drive.google.com/file/d/{file_id}/view")
        page.raise_for_status()
        html = page.text or ""

        form_action_match = re.search(r'id="download-form"[^>]*action="([^"]+)"', html)
        if form_action_match:
            action = unescape(form_action_match.group(1))
            action_url = urljoin("https://drive.google.com", action)
            params = {k: v for k, v in re.findall(r'<input[^>]+name="([^"]+)"[^>]+value="([^"]*)"', html)}
            form_resp = client.get(action_url, params=params)
            form_resp.raise_for_status()
            if _write_if_file(form_resp):
                return

        link_match = re.search(r'href="(/uc\?export=download[^"]+)"', html)
        if link_match:
            href = unescape(link_match.group(1)).replace("&amp;", "&")
            link_url = urljoin("https://drive.google.com", href)
            link_resp = client.get(link_url)
            link_resp.raise_for_status()
            if _write_if_file(link_resp):
                return

        cookie_confirm = None
        for k, v in page.cookies.items():
            if str(k).startswith("download_warning"):
                cookie_confirm = v
                break
        if cookie_confirm:
            confirm_url = f"https://drive.google.com/uc?export=download&confirm={cookie_confirm}&id={file_id}"
            confirm_resp = client.get(confirm_url)
            confirm_resp.raise_for_status()
            if _write_if_file(confirm_resp):
                return

        msg = "Google Drive link did not provide a downloadable file."
        low = html.lower()
        if "you need access" in low or "request access" in low:
            msg += " File is not publicly accessible."
        elif "quota exceeded" in low or "too many users have viewed or downloaded" in low:
            msg += " File appears to be quota-limited by Google Drive."
        else:
            msg += " Use a publicly accessible direct file link or local video file upload."
        raise ValueError(msg)


def _validate_video_file(path: Path) -> None:
    if not path.exists() or not path.is_file():
        raise ValueError(f"Input video file not found: {path}")

    size = path.stat().st_size
    if size < 1024:
        raise ValueError(f"Input file is too small to be valid media: {path} ({size} bytes)")

    try:
        head = path.read_bytes()[:4096].lower()
        if b"<html" in head or b"<!doctype html" in head or b"{\"error\"" in head:
            raise ValueError(
                "Downloaded input is not a media file (looks like HTML/JSON response). "
                "Use a direct video URL or upload a file."
            )
    except ValueError:
        raise
    except Exception:
        pass

    try:
        import cv2

        cap = cv2.VideoCapture(str(path))
        ok = cap.isOpened()
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        cap.release()
        if (not ok) or frame_count <= 0:
            raise ValueError(
                "Input file is not a decodable video for this runtime. "
                "Provide a valid MP4 (H.264/AAC recommended)."
            )
    except ValueError:
        raise
    except Exception:
        pass


def _resolve_python_executable(python_bin: Optional[str]) -> str:
    if python_bin:
        p = Path(python_bin).expanduser()
        if not p.exists():
            raise ValueError(f"python_bin does not exist: {p}")
        return str(p.resolve())

    candidates = [
        BASE_DIR.parent / ".venv" / "Scripts" / "python.exe",
        BASE_DIR / ".venv" / "Scripts" / "python.exe",
        BASE_DIR.parent / ".venv" / "bin" / "python",
        BASE_DIR / ".venv" / "bin" / "python",
    ]
    for c in candidates:
        if c.exists():
            return str(c.resolve())

    return sys.executable or os.getenv("PYTHON_BIN") or "python"


def _resolve_out_dir(out_dir: Optional[str], run_id: str) -> Path:
    if out_dir:
        p = Path(out_dir)
        if not p.is_absolute():
            p = DEFAULT_WORKDIR / p
    else:
        p = DEFAULT_WORKDIR / f"run_{run_id}"
    p.mkdir(parents=True, exist_ok=True)
    return p.resolve()


def _build_common_args(
    *,
    video_path: Path,
    out_dir: Path,
    deepgram_model: str,
    deepgram_language: Optional[str],
    deepgram_request_timeout_sec: float,
    deepgram_connect_timeout_sec: float,
    deepgram_retries: int,
    deepgram_retry_backoff_sec: float,
    force_deepgram: bool,
    force_keyframes: bool,
    pre_roll_sec: float,
    gemini_model: str,
    similarity_threshold: float,
    temperature: float,
) -> list[str]:
    args = [
        "--video",
        str(video_path),
        "--out",
        str(out_dir),
        "--deepgram-model",
        deepgram_model,
        "--deepgram-request-timeout-sec",
        str(deepgram_request_timeout_sec),
        "--deepgram-connect-timeout-sec",
        str(deepgram_connect_timeout_sec),
        "--deepgram-retries",
        str(deepgram_retries),
        "--deepgram-retry-backoff-sec",
        str(deepgram_retry_backoff_sec),
        "--pre-roll-sec",
        str(pre_roll_sec),
        "--gemini-model",
        gemini_model,
        "--similarity-threshold",
        str(similarity_threshold),
        "--temperature",
        str(temperature),
    ]
    if deepgram_language:
        args.extend(["--deepgram-language", deepgram_language])
    if force_deepgram:
        args.append("--force-deepgram")
    if force_keyframes:
        args.append("--force-keyframes")
    return args


def _build_output_files(out_dir: Path, variant: str) -> Dict[str, str]:
    return {
        "utterances": str(out_dir / "utterances.json"),
        "keyframes_parsed": str(out_dir / "keyframes_parsed.json"),
        "keyframes_with_utterances": str(out_dir / "keyframes_with_utterances.json"),
        "final_output": str(
            out_dir / ("final_output.json" if variant == "full" else "final_output_demo_code.json")
        ),
        "final_output_condensed": str(
            out_dir / ("final_output_condensed.json" if variant == "full" else "final_output_demo_code_condensed.json")
        ),
    }


def _artifact_state(output_files: Dict[str, str]) -> Dict[str, Dict[str, Any]]:
    state: Dict[str, Dict[str, Any]] = {}
    for key, p in output_files.items():
        path = Path(p)
        if path.exists():
            try:
                st = path.stat()
                state[key] = {
                    "size_bytes": int(st.st_size),
                    "mtime": float(st.st_mtime),
                }
            except Exception:
                state[key] = {"size_bytes": -1, "mtime": -1.0}
    return state


def _format_artifact_compact(state: Dict[str, Dict[str, Any]]) -> str:
    if not state:
        return "none"
    parts = []
    for k in sorted(state.keys()):
        sz = float(state[k].get("size_bytes", 0))
        parts.append(f"{k}:{sz/1024.0:.1f}KB")
    return ", ".join(parts)


def _watch_run(run_id: str, proc: subprocess.Popen, started_at: float, log_fh, heartbeat_sec: float) -> None:
    heartbeat_sec = max(2.0, float(heartbeat_sec))
    last_hb = 0.0
    last_artifact_change = started_at
    last_state: Dict[str, Dict[str, Any]] = {}

    while True:
        now = time.time()
        rc = proc.poll()

        if (now - last_hb) >= heartbeat_sec:
            try:
                meta_file = _meta_path(run_id)
                meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
                out_files = meta.get("output_files", {}) or {}
                cur_state = _artifact_state(out_files)
                changed = cur_state != last_state
                if changed:
                    last_artifact_change = now
                unchanged_for = now - last_artifact_change
                elapsed = now - started_at

                log_fh.write(
                    "[runner] heartbeat "
                    f"elapsed={elapsed:.1f}s pid={proc.pid} "
                    f"artifacts={len(cur_state)}/{len(out_files)} "
                    f"changed={'yes' if changed else 'no'} "
                    f"unchanged_for={unchanged_for:.1f}s "
                    f"[{_format_artifact_compact(cur_state)}]\n"
                )
                log_fh.flush()

                meta["last_heartbeat_epoch"] = now
                meta["last_heartbeat_elapsed_sec"] = round(elapsed, 3)
                meta["artifacts_ready_count"] = len(cur_state)
                meta["artifacts_total_count"] = len(out_files)
                meta["artifacts_unchanged_for_sec"] = round(unchanged_for, 3)
                _write_json(meta_file, meta)
                last_state = cur_state
            except Exception as e:
                try:
                    log_fh.write(f"[runner] heartbeat_error: {type(e).__name__}: {e}\n")
                    log_fh.flush()
                except Exception:
                    pass
            last_hb = now

        if rc is not None:
            return_code = int(rc)
            break

        time.sleep(1.0)

    finished_at = time.time()
    try:
        meta_file = _meta_path(run_id)
        meta = _read_json(meta_file) if meta_file.exists() else {"run_id": run_id}
        meta["status"] = "succeeded" if return_code == 0 else "failed"
        meta["exit_code"] = int(return_code)
        meta["finished_at_epoch"] = finished_at
        meta["duration_sec"] = round(finished_at - started_at, 3)
        _write_json(meta_file, meta)
    except Exception as e:
        try:
            log_fh.write(f"\n[runner] failed to update metadata: {type(e).__name__}: {e}\n")
            log_fh.flush()
        except Exception:
            pass

    try:
        log_fh.write(f"\n[runner] process finished with exit_code={return_code}\n")
        log_fh.flush()
    except Exception:
        pass
    finally:
        try:
            log_fh.close()
        except Exception:
            pass


def start_run(
    *,
    variant: str,
    video_file_path: Optional[str],
    video_url: Optional[str],
    out_dir: Optional[str],
    python_bin: Optional[str],
    deepgram_model: str,
    deepgram_language: Optional[str],
    deepgram_request_timeout_sec: float,
    deepgram_connect_timeout_sec: float,
    deepgram_retries: int,
    deepgram_retry_backoff_sec: float,
    force_deepgram: bool,
    force_keyframes: bool,
    pre_roll_sec: float,
    gemini_model: str,
    similarity_threshold: float,
    temperature: float,
    log_heartbeat_sec: float = 10.0,
) -> Dict[str, Any]:
    script_name = {
        "full": "run_pipeline_all.py",
        "demo-code": "run_pipeline_demo_code.py",
    }.get(variant)
    if not script_name:
        raise ValueError("variant must be one of: full, demo-code")

    pipeline_script = PIPELINES_DIR / script_name
    if not pipeline_script.exists():
        raise FileNotFoundError(f"Missing pipeline script: {pipeline_script}")

    run_id = uuid.uuid4().hex[:12]
    run_dir = _run_dir(run_id)
    run_dir.mkdir(parents=True, exist_ok=True)

    if video_file_path:
        src = Path(video_file_path).expanduser().resolve()
        if not src.exists():
            raise ValueError(f"Uploaded/local video file not found: {src}")
        dst = run_dir / f"input_{run_id}{src.suffix or '.mp4'}"
        shutil.copy2(src, dst)
        video_path = dst
    elif video_url:
        suffix = Path(video_url).suffix or ".mp4"
        video_path = run_dir / f"input_{run_id}{suffix}"
        if _extract_gdrive_file_id(video_url):
            _download_google_drive(video_url, video_path)
        else:
            with httpx.stream("GET", video_url, timeout=120.0, follow_redirects=True) as r:
                r.raise_for_status()
                with open(video_path, "wb") as f:
                    for chunk in r.iter_bytes():
                        f.write(chunk)
    else:
        raise ValueError("Provide one of: video_file_path or video_url")

    _validate_video_file(video_path)
    out_path = _resolve_out_dir(out_dir, run_id)
    python_exe = _resolve_python_executable(python_bin)

    cmd = [
        python_exe,
        "-u",
        str(pipeline_script),
        "--python",
        python_exe,
        *_build_common_args(
            video_path=video_path,
            out_dir=out_path,
            deepgram_model=deepgram_model,
            deepgram_language=deepgram_language,
            deepgram_request_timeout_sec=deepgram_request_timeout_sec,
            deepgram_connect_timeout_sec=deepgram_connect_timeout_sec,
            deepgram_retries=deepgram_retries,
            deepgram_retry_backoff_sec=deepgram_retry_backoff_sec,
            force_deepgram=force_deepgram,
            force_keyframes=force_keyframes,
            pre_roll_sec=pre_roll_sec,
            gemini_model=gemini_model,
            similarity_threshold=similarity_threshold,
            temperature=temperature,
        ),
    ]

    started = time.time()
    logs_path = _logs_path(run_id)
    log_fh = open(logs_path, "a", encoding="utf-8", buffering=1)
    log_fh.write(
        f"[runner] run_id={run_id} variant={variant} started_at_epoch={started}\n"
        f"[runner] command={' '.join(cmd)}\n"
        f"[runner] cwd={PIPELINES_DIR}\n\n"
        f"[runner] heartbeat_interval_sec={log_heartbeat_sec}\n"
        f"[runner] python_unbuffered=1\n\n"
    )
    log_fh.flush()

    child_env = os.environ.copy()
    child_env["PYTHONUNBUFFERED"] = "1"
    child_env.setdefault("PYTHONIOENCODING", "utf-8")

    proc = subprocess.Popen(
        cmd,
        cwd=str(PIPELINES_DIR),
        stdout=log_fh,
        stderr=subprocess.STDOUT,
        text=True,
        env=child_env,
    )

    meta = {
        "variant": variant,
        "run_id": run_id,
        "python_executable": python_exe,
        "command": cmd,
        "status": "running",
        "exit_code": None,
        "pid": proc.pid,
        "started_at_epoch": started,
        "finished_at_epoch": None,
        "duration_sec": None,
        "out_dir": str(out_path),
        "logs_path": str(logs_path),
        "heartbeat_interval_sec": float(log_heartbeat_sec),
        "output_files": _build_output_files(out_path, variant),
    }
    _write_json(_meta_path(run_id), meta)

    watcher = threading.Thread(
        target=_watch_run,
        args=(run_id, proc, started, log_fh, float(log_heartbeat_sec)),
        daemon=True,
    )
    watcher.start()

    return {
        "run_id": run_id,
        "variant": variant,
        "status": "running",
        "python_executable": python_exe,
        "status_path": f"runs/{run_id}",
        "logs_path": f"runs/{run_id}/logs",
        "final_output_path": f"runs/{run_id}/final-output",
        "final_output_condensed_path": f"runs/{run_id}/final-output/condensed",
        "out_dir": str(out_path),
    }


def get_status(run_id: str) -> Dict[str, Any]:
    p = _meta_path(run_id)
    if not p.exists():
        raise FileNotFoundError(f"Unknown run_id: {run_id}")
    return _read_json(p)


def get_logs(run_id: str, tail_lines: int = 300) -> str:
    meta = get_status(run_id)
    p = Path(meta.get("logs_path", ""))
    if not p.exists():
        return ""
    txt = p.read_text(encoding="utf-8", errors="replace")
    limit = max(1, min(int(tail_lines), 5000))
    return _tail(txt, max_lines=limit)


def get_final_output(run_id: str, condensed: bool = False) -> Dict[str, Any]:
    meta = get_status(run_id)
    status = meta.get("status")
    key = "final_output_condensed" if condensed else "final_output"
    out_file = Path(meta["output_files"][key])

    if status == "running":
        return {
            "run_id": run_id,
            "status": status,
            "message": "Pipeline is still running. Check logs.",
        }
    if status == "failed":
        return {
            "run_id": run_id,
            "status": status,
            "message": "Pipeline failed. Check logs.",
        }
    if not out_file.exists():
        raise FileNotFoundError(f"Output not found: {out_file}")
    return _read_json(out_file)
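For reference, a minimal driver showing how this module is meant to be called. This is a sketch only: the video file name, model names, and numeric values are illustrative, while the keyword arguments mirror the `start_run` signature above. Because `start_run` returns immediately and a daemon watcher thread keeps run_meta.json current, callers poll `get_status` rather than blocking on the subprocess.

import time

from run_manager import get_final_output, get_logs, get_status, start_run

# Launch the full pipeline on a local file (all values illustrative).
run = start_run(
    variant="full",
    video_file_path="sample_meeting.mp4",
    video_url=None,
    out_dir=None,
    python_bin=None,
    deepgram_model="nova-2",
    deepgram_language=None,
    deepgram_request_timeout_sec=600.0,
    deepgram_connect_timeout_sec=30.0,
    deepgram_retries=2,
    deepgram_retry_backoff_sec=2.0,
    force_deepgram=False,
    force_keyframes=False,
    pre_roll_sec=1.5,
    gemini_model="gemini-2.0-flash",
    similarity_threshold=0.6,
    temperature=0.2,
)

# The pipeline runs in a background subprocess; poll its metadata until it exits.
while get_status(run["run_id"]).get("status") == "running":
    time.sleep(10)

print(get_logs(run["run_id"], tail_lines=50))
result = get_final_output(run["run_id"], condensed=True)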
vercel.json ADDED
@@ -0,0 +1,21 @@
{
  "version": 2,
  "builds": [
    {
      "src": "api/index.py",
      "use": "@vercel/python"
    }
  ],
  "routes": [
    {
      "src": "/(.*)",
      "dest": "api/index.py"
    }
  ],
  "functions": {
    "api/index.py": {
      "maxDuration": 900
    }
  }
}