Keqing Li commited on
Commit ·
c9f5b32
0
Parent(s):
Final verified deployment for HF Space
Browse files- .gitattributes +11 -0
- Dockerfile +76 -0
- README.md +73 -0
- frontend/index.html +12 -0
- frontend/package.json +28 -0
- frontend/postcss.config.js +6 -0
- frontend/src/App.tsx +766 -0
- frontend/src/index.css +25 -0
- frontend/src/main.tsx +10 -0
- frontend/tailwind.config.js +20 -0
- frontend/tsconfig.json +21 -0
- frontend/tsconfig.node.json +10 -0
- frontend/vite.config.ts +7 -0
- main.go +62 -0
- requirements.txt +42 -0
- src/agents.py +153 -0
- src/app.py +987 -0
- src/croissant_transformer.py +126 -0
- src/factuality_logic.py +154 -0
- src/finetune.py +165 -0
- src/inference_logic.py +377 -0
- src/labeling_logic.py +158 -0
- src/model.py +93 -0
- src/my_vision_process.py +812 -0
- src/run_inference.py +135 -0
- src/toon_parser.py +282 -0
- src/transcription.py +59 -0
- start.sh +16 -0
.gitattributes
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Handle line endings automatically
|
| 2 |
+
* text=auto
|
| 3 |
+
|
| 4 |
+
# Declare files that will always have LF line endings on checkout.
|
| 5 |
+
*.sh text eol=lf
|
| 6 |
+
*.py text eol=lf
|
| 7 |
+
*.go text eol=lf
|
| 8 |
+
*.tsx text eol=lf
|
| 9 |
+
*.css text eol=lf
|
| 10 |
+
*.html text eol=lf
|
| 11 |
+
*.json text eol=lf
|
Dockerfile
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==========================================
|
| 2 |
+
# Stage 1: Build Frontend (React/TS/Vite)
|
| 3 |
+
# ==========================================
|
| 4 |
+
FROM node:20-slim AS frontend-builder
|
| 5 |
+
WORKDIR /app/frontend
|
| 6 |
+
|
| 7 |
+
# Copy frontend definitions
|
| 8 |
+
COPY frontend/package.json frontend/package-lock.json* ./
|
| 9 |
+
RUN npm install
|
| 10 |
+
|
| 11 |
+
# Copy source and build
|
| 12 |
+
COPY frontend/ ./
|
| 13 |
+
RUN npm run build
|
| 14 |
+
|
| 15 |
+
# ==========================================
|
| 16 |
+
# Stage 2: Build Backend (Golang)
|
| 17 |
+
# ==========================================
|
| 18 |
+
FROM golang:1.23 AS backend-builder
|
| 19 |
+
WORKDIR /app/backend
|
| 20 |
+
|
| 21 |
+
# Copy Go source
|
| 22 |
+
COPY main.go .
|
| 23 |
+
|
| 24 |
+
# Build static binary
|
| 25 |
+
RUN go mod init vchat-server && \
|
| 26 |
+
go mod tidy && \
|
| 27 |
+
CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o vchat-server main.go
|
| 28 |
+
|
| 29 |
+
# ==========================================
|
| 30 |
+
# Stage 3: Final Runtime (PyTorch Base)
|
| 31 |
+
# ==========================================
|
| 32 |
+
FROM pytorch/pytorch:2.9.1-cuda13.0-cudnn9-devel
|
| 33 |
+
|
| 34 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 35 |
+
DEBIAN_FRONTEND=noninteractive \
|
| 36 |
+
LITE_MODE=false \
|
| 37 |
+
PATH="/usr/lib/google-cloud-sdk/bin:$PATH" \
|
| 38 |
+
PIP_NO_CACHE_DIR=1
|
| 39 |
+
|
| 40 |
+
WORKDIR /app
|
| 41 |
+
|
| 42 |
+
# 1. Install System Dependencies
|
| 43 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 44 |
+
ffmpeg \
|
| 45 |
+
git \
|
| 46 |
+
curl \
|
| 47 |
+
gnupg \
|
| 48 |
+
ca-certificates \
|
| 49 |
+
&& echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \
|
| 50 |
+
&& curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \
|
| 51 |
+
&& apt-get update && apt-get install -y google-cloud-cli \
|
| 52 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 53 |
+
|
| 54 |
+
# 2. Install Python Dependencies
|
| 55 |
+
RUN pip install uv
|
| 56 |
+
COPY requirements.txt ./
|
| 57 |
+
RUN uv pip install --system -r requirements.txt
|
| 58 |
+
|
| 59 |
+
# 3. Copy Python Application Code
|
| 60 |
+
COPY . .
|
| 61 |
+
|
| 62 |
+
# 4. Install Built Artifacts
|
| 63 |
+
COPY --from=backend-builder /app/backend/vchat-server /usr/local/bin/vchat-server
|
| 64 |
+
RUN mkdir -p /usr/share/vchat/static
|
| 65 |
+
COPY --from=frontend-builder /app/frontend/dist /usr/share/vchat/static
|
| 66 |
+
|
| 67 |
+
# 5. Setup Entrypoint (Fix Windows Line Endings Here)
|
| 68 |
+
COPY start.sh /usr/local/bin/start.sh
|
| 69 |
+
RUN sed -i 's/\r$//' /usr/local/bin/start.sh && \
|
| 70 |
+
chmod +x /usr/local/bin/start.sh
|
| 71 |
+
|
| 72 |
+
# Expose the Go Server port
|
| 73 |
+
EXPOSE 8000
|
| 74 |
+
|
| 75 |
+
# Run the Orchestrator
|
| 76 |
+
CMD ["/usr/local/bin/start.sh"]
|
README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VFacts
|
| 3 |
+
emoji: 😀
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# VFacts - Video Veracity & Analysis Platform
|
| 11 |
+
|
| 12 |
+
## Research Overview
|
| 13 |
+
|
| 14 |
+
The liarMP4 project investigates the efficacy of Generative AI (GenAI) systems in detecting "contextual malformation" in video content, as opposed to traditional Predictive AI (PredAI) which focuses on metadata and engagement velocity.
|
| 15 |
+
|
| 16 |
+
While traditional content moderation relies on scalar probabilities derived from tabular data (account age, keyword triggers), this research proposes a **Fractal Chain-of-Thought** methodology. This approach utilizes Multimodal Large Language Models to analyze the semantic dissonance between visual evidence, audio waveforms, and textual claims.
|
| 17 |
+
|
| 18 |
+
The system generates **Veracity Vectors**, multi-dimensional scores representing Visual Integrity, Audio Integrity, and Cross-Modal Alignment—outputting data in a strict Token-Oriented Object Notation (TOON) schema.
|
| 19 |
+
|
| 20 |
+
## Key Features
|
| 21 |
+
|
| 22 |
+
* **Predictive Benchmarking:** Comparison against AutoGluon/Gradient Boosting models trained on engagement metadata.
|
| 23 |
+
* **Fractal Chain-of-Thought (FCoT):** A recursive inference strategy that hypothesizes intent at a macro-scale and verifies pixel/audio artifacts at a meso-scale.
|
| 24 |
+
* **TOON Schema:** A standardized output format ensuring strict type adherence for database integration.
|
| 25 |
+
* **Human-in-the-Loop (HITL) Protocol:** A browser-based grounding workflow to calibrate AI "reasoning" against human authorial intent.
|
| 26 |
+
|
| 27 |
+
## Project Resources
|
| 28 |
+
|
| 29 |
+
* **Live Demonstration (Hugging Face):** [https://huggingface.co/spaces/GlazedDon0t/liarMP4](https://huggingface.co/spaces/GlazedDon0t/liarMP4)
|
| 30 |
+
* **Source Code (GitHub):** [https://github.com/DevKlim/LiarMP4](https://github.com/DevKlim/LiarMP4)
|
| 31 |
+
|
| 32 |
+
## Repository Structure
|
| 33 |
+
|
| 34 |
+
* **src/**: Core inference logic for the Generative AI pipeline and FCoT implementation.
|
| 35 |
+
* **preprocessing_tools/**: Scripts for training Predictive AI models on tabular datasets.
|
| 36 |
+
* **extension/**: Browser extension source code for the Human-in-the-Loop labeling workflow.
|
| 37 |
+
* **data/**: Benchmark datasets containing engagement metadata and manual veracity labels.
|
| 38 |
+
|
| 39 |
+
## Installation and Usage
|
| 40 |
+
|
| 41 |
+
This project is containerized to ensure reproducibility across different environments. The entire pipeline, including the inference logic and database connections, can be deployed using Docker.
|
| 42 |
+
|
| 43 |
+
### Prerequisites
|
| 44 |
+
|
| 45 |
+
* Docker Engine
|
| 46 |
+
* Docker Compose
|
| 47 |
+
|
| 48 |
+
### Deployment Instructions
|
| 49 |
+
|
| 50 |
+
1. Clone the repository:
|
| 51 |
+
```bash
|
| 52 |
+
git clone https://github.com/DevKlim/LiarMP4.git
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
2. Navigate to the project directory:
|
| 56 |
+
```bash
|
| 57 |
+
cd LiarMP4/liarMP4
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
3. Build and run the containerized environment:
|
| 61 |
+
```bash
|
| 62 |
+
docker-compose up --build
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
The system will initialize the backend services and expose the necessary endpoints for the analysis pipeline.
|
| 66 |
+
|
| 67 |
+
## License
|
| 68 |
+
|
| 69 |
+
This research project is open-source. Please refer to the LICENSE file in the repository for specific terms regarding usage and distribution.
|
| 70 |
+
|
| 71 |
+
## Authors
|
| 72 |
+
|
| 73 |
+
Kliment Ho, Shiwei Yang, Keqing Li
|
frontend/index.html
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>vChat AI | Modern Inference</title>
|
| 7 |
+
</head>
|
| 8 |
+
<body>
|
| 9 |
+
<div id="root"></div>
|
| 10 |
+
<script type="module" src="/src/main.tsx"></script>
|
| 11 |
+
</body>
|
| 12 |
+
</html>
|
frontend/package.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "vchat-frontend",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "1.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "vite build",
|
| 9 |
+
"preview": "vite preview"
|
| 10 |
+
},
|
| 11 |
+
"dependencies": {
|
| 12 |
+
"react": "^18.2.0",
|
| 13 |
+
"react-dom": "^18.2.0",
|
| 14 |
+
"lucide-react": "^0.344.0",
|
| 15 |
+
"clsx": "^2.1.0",
|
| 16 |
+
"tailwind-merge": "^2.2.1"
|
| 17 |
+
},
|
| 18 |
+
"devDependencies": {
|
| 19 |
+
"@types/react": "^18.2.64",
|
| 20 |
+
"@types/react-dom": "^18.2.21",
|
| 21 |
+
"@vitejs/plugin-react": "^4.2.1",
|
| 22 |
+
"autoprefixer": "^10.4.18",
|
| 23 |
+
"postcss": "^8.4.35",
|
| 24 |
+
"tailwindcss": "^3.4.1",
|
| 25 |
+
"typescript": "^5.2.2",
|
| 26 |
+
"vite": "^5.1.4"
|
| 27 |
+
}
|
| 28 |
+
}
|
frontend/postcss.config.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
export default {
|
| 2 |
+
plugins: {
|
| 3 |
+
tailwindcss: {},
|
| 4 |
+
autoprefixer: {},
|
| 5 |
+
},
|
| 6 |
+
}
|
frontend/src/App.tsx
ADDED
|
@@ -0,0 +1,766 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import React, { useState, useRef, useEffect } from 'react';
|
| 2 |
+
import {
|
| 3 |
+
AlertCircle, Play, Upload, Layers, Terminal, Cpu, Activity,
|
| 4 |
+
FileText, Zap, MessageSquare, Sliders, LayoutDashboard, FileJson,
|
| 5 |
+
ChevronDown, ChevronRight, Bot, Database, Trash2, Eye, StopCircle, List,
|
| 6 |
+
CheckCircle, XCircle, BrainCircuit, Edit3, ClipboardList, CheckSquare,
|
| 7 |
+
BarChart2, TrendingUp, TrendingDown, Scale, ExternalLink
|
| 8 |
+
} from 'lucide-react';
|
| 9 |
+
|
| 10 |
+
function App() {
|
| 11 |
+
const [activeTab, setActiveTab] = useState('queue');
|
| 12 |
+
const [logs, setLogs] = useState<string>('System Ready.\n');
|
| 13 |
+
const [isProcessing, setIsProcessing] = useState(false);
|
| 14 |
+
|
| 15 |
+
// Data States
|
| 16 |
+
const [dataList, setDataList] = useState<any[]>([]);
|
| 17 |
+
const [queueList, setQueueList] = useState<any[]>([]);
|
| 18 |
+
const [workflowList, setWorkflowList] = useState<any[]>([]);
|
| 19 |
+
const [comparisonList, setComparisonList] = useState<any[]>([]);
|
| 20 |
+
|
| 21 |
+
const [expandedRow, setExpandedRow] = useState<string | null>(null);
|
| 22 |
+
const [refreshTrigger, setRefreshTrigger] = useState(0);
|
| 23 |
+
|
| 24 |
+
// Manual Labeling Modal
|
| 25 |
+
const [labelingItem, setLabelingItem] = useState<any>(null);
|
| 26 |
+
const [manualForm, setManualForm] = useState({
|
| 27 |
+
visual: 5, audio: 5, source: 5, logic: 5, emotion: 5,
|
| 28 |
+
va: 5, vc: 5, ac: 5,
|
| 29 |
+
final: 50,
|
| 30 |
+
reasoning: '',
|
| 31 |
+
tags: ''
|
| 32 |
+
});
|
| 33 |
+
|
| 34 |
+
const [videoUrl, setVideoUrl] = useState('');
|
| 35 |
+
const [model, setModel] = useState('vertex');
|
| 36 |
+
const [reasoningMethod, setReasoningMethod] = useState('cot');
|
| 37 |
+
const fileInputRef = useRef<HTMLInputElement>(null);
|
| 38 |
+
const logEndRef = useRef<HTMLDivElement>(null);
|
| 39 |
+
|
| 40 |
+
useEffect(() => {
|
| 41 |
+
logEndRef.current?.scrollIntoView({ behavior: "smooth" });
|
| 42 |
+
}, [logs]);
|
| 43 |
+
|
| 44 |
+
// Refresh Data logic
|
| 45 |
+
useEffect(() => {
|
| 46 |
+
if (activeTab === 'moderation') {
|
| 47 |
+
fetch('/manage/list').then(res => res.json()).then(setDataList).catch(err => console.error("Data Load Error:", err));
|
| 48 |
+
}
|
| 49 |
+
if (activeTab === 'queue') {
|
| 50 |
+
fetch('/queue/list').then(res => res.json()).then(setQueueList).catch(err => console.error("Queue Load Error:", err));
|
| 51 |
+
}
|
| 52 |
+
if (activeTab === 'workflow') {
|
| 53 |
+
fetch('/workflow/status').then(res => res.json()).then(setWorkflowList).catch(err => console.error("Workflow Load Error:", err));
|
| 54 |
+
}
|
| 55 |
+
if (activeTab === 'analytics') {
|
| 56 |
+
fetch('/manage/comparison_data').then(res => res.json()).then(setComparisonList).catch(err => console.error("Analytics Load Error:", err));
|
| 57 |
+
}
|
| 58 |
+
}, [activeTab, refreshTrigger]);
|
| 59 |
+
|
| 60 |
+
const appendLog = (text: string) => setLogs(prev => prev + text);
|
| 61 |
+
|
| 62 |
+
// --- Handlers ---
|
| 63 |
+
|
| 64 |
+
const handleFileUpload = async (e: React.ChangeEvent<HTMLInputElement>) => {
|
| 65 |
+
if (!e.target.files?.length) return;
|
| 66 |
+
const file = e.target.files[0];
|
| 67 |
+
const fd = new FormData();
|
| 68 |
+
fd.append("file", file);
|
| 69 |
+
|
| 70 |
+
appendLog(`[SYSTEM] Uploading ${file.name} to queue...\n`);
|
| 71 |
+
try {
|
| 72 |
+
const res = await fetch('/queue/upload_csv', { method: 'POST', body: fd });
|
| 73 |
+
const data = await res.json();
|
| 74 |
+
if(data.error) throw new Error(data.error);
|
| 75 |
+
appendLog(`[SYSTEM] Upload complete. Added ${data.added} links.\n`);
|
| 76 |
+
setRefreshTrigger(prev => prev + 1);
|
| 77 |
+
} catch (err: any) {
|
| 78 |
+
appendLog(`[ERROR] Upload failed: ${err.message}\n`);
|
| 79 |
+
}
|
| 80 |
+
};
|
| 81 |
+
|
| 82 |
+
const handleStartQueue = async (e: React.FormEvent) => {
|
| 83 |
+
e.preventDefault();
|
| 84 |
+
setIsProcessing(true);
|
| 85 |
+
setLogs("[SYSTEM] Starting Queue Processing...\n");
|
| 86 |
+
|
| 87 |
+
const form = document.getElementById('control-form') as HTMLFormElement;
|
| 88 |
+
const formData = new FormData(form);
|
| 89 |
+
|
| 90 |
+
try {
|
| 91 |
+
const response = await fetch('/queue/run', { method: 'POST', body: formData });
|
| 92 |
+
if (!response.body) throw new Error("No response");
|
| 93 |
+
const reader = response.body.getReader();
|
| 94 |
+
const decoder = new TextDecoder();
|
| 95 |
+
while (true) {
|
| 96 |
+
const { done, value } = await reader.read();
|
| 97 |
+
if (done) break;
|
| 98 |
+
const chunk = decoder.decode(value, { stream: true });
|
| 99 |
+
chunk.split('\n\n').forEach(line => {
|
| 100 |
+
if (line.startsWith('data:')) appendLog(line.replace('data:', '').trim() + '\n');
|
| 101 |
+
if (line.startsWith('event: close')) setIsProcessing(false);
|
| 102 |
+
});
|
| 103 |
+
}
|
| 104 |
+
} catch (err: any) {
|
| 105 |
+
appendLog(`\n[ERROR]: ${err.message}\n`);
|
| 106 |
+
} finally {
|
| 107 |
+
setIsProcessing(false);
|
| 108 |
+
setRefreshTrigger(prev => prev + 1);
|
| 109 |
+
}
|
| 110 |
+
};
|
| 111 |
+
|
| 112 |
+
const handleStopQueue = async () => {
|
| 113 |
+
await fetch('/queue/stop', { method: 'POST' });
|
| 114 |
+
appendLog("[USER] Stop signal sent. Finishing current item...\n");
|
| 115 |
+
};
|
| 116 |
+
|
| 117 |
+
const handleDelete = async (id: string, link: string) => {
|
| 118 |
+
if (!confirm("Delete this entry? This allows re-labeling of the link in the queue.")) return;
|
| 119 |
+
try {
|
| 120 |
+
const res = await fetch(`/manage/delete?id=${id}&link=${encodeURIComponent(link)}`, { method: 'DELETE' });
|
| 121 |
+
const json = await res.json();
|
| 122 |
+
if (json.status === 'deleted') {
|
| 123 |
+
setRefreshTrigger(prev => prev + 1);
|
| 124 |
+
} else {
|
| 125 |
+
alert("Error deleting: " + JSON.stringify(json));
|
| 126 |
+
}
|
| 127 |
+
} catch (e) { alert("Fail: " + e); }
|
| 128 |
+
};
|
| 129 |
+
|
| 130 |
+
const handleQueueDelete = async (link: string) => {
|
| 131 |
+
if(!confirm("Remove this link from queue?")) return;
|
| 132 |
+
try {
|
| 133 |
+
const res = await fetch(`/queue/delete?link=${encodeURIComponent(link)}`, { method: 'DELETE' });
|
| 134 |
+
const json = await res.json();
|
| 135 |
+
if(json.status === 'success') {
|
| 136 |
+
setRefreshTrigger(prev => prev + 1);
|
| 137 |
+
} else {
|
| 138 |
+
alert("Error: " + json.message);
|
| 139 |
+
}
|
| 140 |
+
} catch(err) {
|
| 141 |
+
console.error(err);
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
// --- Manual Labeling Handlers ---
|
| 146 |
+
|
| 147 |
+
const parseScore = (val: any) => {
|
| 148 |
+
if (!val) return 5;
|
| 149 |
+
const str = String(val).replace(/[^\d]/g, '');
|
| 150 |
+
const num = parseInt(str);
|
| 151 |
+
return isNaN(num) ? 5 : num;
|
| 152 |
+
};
|
| 153 |
+
|
| 154 |
+
const openLabelingModal = (item: any) => {
|
| 155 |
+
// Logic to handle source: workflow (item.ai_data) vs moderation (item direct)
|
| 156 |
+
let ai = item.ai_data || {};
|
| 157 |
+
|
| 158 |
+
// Fallback: If coming from Moderation tab, item itself is the AI data
|
| 159 |
+
if (!item.ai_data && item.source_type === 'auto') {
|
| 160 |
+
ai = {
|
| 161 |
+
visual: item.visual_integrity_score,
|
| 162 |
+
final: item.final_veracity_score,
|
| 163 |
+
reasoning: item.final_reasoning,
|
| 164 |
+
tags: item.tags
|
| 165 |
+
};
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
setLabelingItem(item);
|
| 169 |
+
setManualForm({
|
| 170 |
+
visual: parseScore(ai.visual),
|
| 171 |
+
audio: 5, source: 5, logic: 5, emotion: 5,
|
| 172 |
+
va: 5, vc: 5, ac: 5,
|
| 173 |
+
final: parseScore(ai.final || 50),
|
| 174 |
+
reasoning: ai.reasoning || '',
|
| 175 |
+
tags: ai.tags || ''
|
| 176 |
+
});
|
| 177 |
+
};
|
| 178 |
+
|
| 179 |
+
const submitManualLabel = async () => {
|
| 180 |
+
if(!labelingItem) return;
|
| 181 |
+
|
| 182 |
+
const payload = {
|
| 183 |
+
link: labelingItem.link,
|
| 184 |
+
caption: "Manual Label via WebUI",
|
| 185 |
+
labels: {
|
| 186 |
+
visual_integrity_score: manualForm.visual,
|
| 187 |
+
audio_integrity_score: manualForm.audio,
|
| 188 |
+
source_credibility_score: manualForm.source,
|
| 189 |
+
logical_consistency_score: manualForm.logic,
|
| 190 |
+
emotional_manipulation_score: manualForm.emotion,
|
| 191 |
+
video_audio_score: manualForm.va,
|
| 192 |
+
video_caption_score: manualForm.vc,
|
| 193 |
+
audio_caption_score: manualForm.ac,
|
| 194 |
+
final_veracity_score: manualForm.final,
|
| 195 |
+
reasoning: manualForm.reasoning
|
| 196 |
+
},
|
| 197 |
+
tags: manualForm.tags,
|
| 198 |
+
stats: { platform: "webui" }
|
| 199 |
+
};
|
| 200 |
+
|
| 201 |
+
try {
|
| 202 |
+
const res = await fetch('/extension/save_manual', {
|
| 203 |
+
method: 'POST',
|
| 204 |
+
headers: {'Content-Type': 'application/json'},
|
| 205 |
+
body: JSON.stringify(payload)
|
| 206 |
+
});
|
| 207 |
+
const json = await res.json();
|
| 208 |
+
if(json.status === 'saved') {
|
| 209 |
+
alert("Manual Label Saved!");
|
| 210 |
+
setLabelingItem(null);
|
| 211 |
+
setRefreshTrigger(prev => prev + 1);
|
| 212 |
+
} else {
|
| 213 |
+
alert("Error: " + JSON.stringify(json));
|
| 214 |
+
}
|
| 215 |
+
} catch(e: any) {
|
| 216 |
+
alert("Error: " + e.message);
|
| 217 |
+
}
|
| 218 |
+
};
|
| 219 |
+
|
| 220 |
+
// Helper to segment workflow
|
| 221 |
+
const pendingVerification = workflowList.filter(row => row.ai_status === 'Labeled' && row.manual_status !== 'Completed');
|
| 222 |
+
const pendingAI = workflowList.filter(row => row.ai_status !== 'Labeled' && row.manual_status !== 'Completed');
|
| 223 |
+
const completed = workflowList.filter(row => row.manual_status === 'Completed');
|
| 224 |
+
|
| 225 |
+
// Helper for Analytics Summary
|
| 226 |
+
const calculateStats = () => {
|
| 227 |
+
if(comparisonList.length === 0) return { avgDelta: 0, bias: 0 };
|
| 228 |
+
let totalDelta = 0;
|
| 229 |
+
let totalAbsDelta = 0;
|
| 230 |
+
comparisonList.forEach(c => {
|
| 231 |
+
totalDelta += c.deltas.final; // Raw delta (+ means AI > Human)
|
| 232 |
+
totalAbsDelta += Math.abs(c.deltas.final);
|
| 233 |
+
});
|
| 234 |
+
return {
|
| 235 |
+
avgMAE: (totalAbsDelta / comparisonList.length).toFixed(1),
|
| 236 |
+
bias: (totalDelta / comparisonList.length).toFixed(1)
|
| 237 |
+
};
|
| 238 |
+
};
|
| 239 |
+
const stats = calculateStats();
|
| 240 |
+
|
| 241 |
+
return (
|
| 242 |
+
<div className="flex h-screen w-full bg-[#09090b] text-slate-200 font-sans overflow-hidden">
|
| 243 |
+
|
| 244 |
+
{/* LEFT PANEL */}
|
| 245 |
+
<div className="w-[380px] flex flex-col border-r border-slate-800/60 bg-[#0c0c0e]">
|
| 246 |
+
<div className="h-16 flex items-center px-6 border-b border-slate-800/60">
|
| 247 |
+
<div className="flex items-center gap-3">
|
| 248 |
+
<div className="w-8 h-8 rounded-lg bg-indigo-600 flex items-center justify-center">
|
| 249 |
+
<Bot className="w-5 h-5 text-white" />
|
| 250 |
+
</div>
|
| 251 |
+
<h1 className="text-sm font-bold text-white">vChat <span className="text-slate-500">Manager</span></h1>
|
| 252 |
+
</div>
|
| 253 |
+
</div>
|
| 254 |
+
|
| 255 |
+
<div className="flex-1 overflow-y-auto p-6 space-y-6 custom-scrollbar">
|
| 256 |
+
<form id="control-form" className="space-y-6">
|
| 257 |
+
<div className="grid grid-cols-2 gap-1 p-1 bg-slate-900 rounded-lg border border-slate-800">
|
| 258 |
+
{[{id:'queue',l:'Ingest Queue',i:List}, {id:'workflow',l:'Labeling Workflow',i:ClipboardList}, {id:'moderation',l:'Dataset',i:Database}, {id:'analytics',l:'Showcase',i:BarChart2}].map(t => (
|
| 259 |
+
<button key={t.id} type="button" onClick={() => setActiveTab(t.id)}
|
| 260 |
+
className={`flex items-center justify-center gap-2 py-2 text-xs font-medium rounded ${activeTab===t.id ? 'bg-slate-800 text-white' : 'text-slate-500'}`}>
|
| 261 |
+
<t.i className="w-3 h-3" /> {t.l}
|
| 262 |
+
</button>
|
| 263 |
+
))}
|
| 264 |
+
</div>
|
| 265 |
+
|
| 266 |
+
<div className="space-y-3">
|
| 267 |
+
<label className="text-xs font-bold text-slate-500 uppercase">Model Engine</label>
|
| 268 |
+
<select name="model_selection" value={model} onChange={(e) => setModel(e.target.value)}
|
| 269 |
+
className="w-full bg-slate-900 border border-slate-800 rounded px-3 py-2 text-xs outline-none focus:border-indigo-500">
|
| 270 |
+
<option value="vertex">Google Vertex AI</option>
|
| 271 |
+
<option value="gemini">Google Gemini API</option>
|
| 272 |
+
</select>
|
| 273 |
+
|
| 274 |
+
{model === 'gemini' && (
|
| 275 |
+
<div className="space-y-2">
|
| 276 |
+
<input type="password" name="gemini_api_key" placeholder="Gemini API Key" className="w-full bg-black/20 border border-slate-800 rounded px-3 py-2 text-xs" />
|
| 277 |
+
<input type="text" name="gemini_model_name" defaultValue="models/gemini-2.0-flash-exp" className="w-full bg-black/20 border border-slate-800 rounded px-3 py-2 text-xs" />
|
| 278 |
+
</div>
|
| 279 |
+
)}
|
| 280 |
+
{model === 'vertex' && (
|
| 281 |
+
<div className="space-y-2">
|
| 282 |
+
<input type="text" name="vertex_project_id" placeholder="GCP Project ID" className="w-full bg-black/20 border border-slate-800 rounded px-3 py-2 text-xs" />
|
| 283 |
+
<div className="grid grid-cols-2 gap-2">
|
| 284 |
+
<input type="text" name="vertex_location" defaultValue="us-central1" className="w-full bg-black/20 border border-slate-800 rounded px-3 py-2 text-xs" />
|
| 285 |
+
<input type="text" name="vertex_model_name" defaultValue="gemini-1.5-pro-preview-0409" className="w-full bg-black/20 border border-slate-800 rounded px-3 py-2 text-xs" />
|
| 286 |
+
</div>
|
| 287 |
+
</div>
|
| 288 |
+
)}
|
| 289 |
+
</div>
|
| 290 |
+
|
| 291 |
+
<div className="space-y-3">
|
| 292 |
+
<label className="text-xs font-bold text-slate-500 uppercase flex items-center gap-2">
|
| 293 |
+
<BrainCircuit className="w-3 h-3" /> Reasoning Architecture
|
| 294 |
+
</label>
|
| 295 |
+
<div className="grid grid-cols-2 gap-2">
|
| 296 |
+
<button type="button"
|
| 297 |
+
onClick={() => setReasoningMethod('cot')}
|
| 298 |
+
className={`py-2 px-3 rounded text-xs border ${reasoningMethod === 'cot' ? 'bg-indigo-900/40 border-indigo-500 text-indigo-300' : 'bg-slate-900 border-slate-800 text-slate-500 hover:border-slate-700'}`}>
|
| 299 |
+
Standard CoT
|
| 300 |
+
</button>
|
| 301 |
+
<button type="button"
|
| 302 |
+
onClick={() => setReasoningMethod('fcot')}
|
| 303 |
+
className={`py-2 px-3 rounded text-xs border ${reasoningMethod === 'fcot' ? 'bg-indigo-900/40 border-indigo-500 text-indigo-300' : 'bg-slate-900 border-slate-800 text-slate-500 hover:border-slate-700'}`}>
|
| 304 |
+
Fractal CoT
|
| 305 |
+
</button>
|
| 306 |
+
</div>
|
| 307 |
+
<input type="hidden" name="reasoning_method" value={reasoningMethod} />
|
| 308 |
+
<p className="text-[10px] text-slate-500">
|
| 309 |
+
{reasoningMethod === 'cot' ? "Single-pass linear chain of thought." : "Recursive Multi-Scale (Macro → Meso → Consensus)."}
|
| 310 |
+
</p>
|
| 311 |
+
</div>
|
| 312 |
+
|
| 313 |
+
{activeTab === 'queue' && (
|
| 314 |
+
<div className="space-y-4">
|
| 315 |
+
<div onClick={() => fileInputRef.current?.click()} className="border-2 border-dashed border-slate-700 hover:border-indigo-500 hover:bg-indigo-500/5 rounded-xl p-4 text-center cursor-pointer transition-colors">
|
| 316 |
+
<Upload className="w-5 h-5 mx-auto text-slate-500 mb-1" />
|
| 317 |
+
<p className="text-xs text-slate-400">Upload CSV to Queue</p>
|
| 318 |
+
<input type="file" ref={fileInputRef} onChange={handleFileUpload} accept=".csv" hidden />
|
| 319 |
+
</div>
|
| 320 |
+
<div className="p-3 bg-indigo-900/10 border border-indigo-500/20 rounded-lg flex justify-between">
|
| 321 |
+
<p className="text-xs text-indigo-300">Pending: {queueList.filter(x=>x.status==='Pending').length}</p>
|
| 322 |
+
<p className="text-xs text-indigo-300">Total: {queueList.length}</p>
|
| 323 |
+
</div>
|
| 324 |
+
</div>
|
| 325 |
+
)}
|
| 326 |
+
|
| 327 |
+
<div className="flex items-center gap-2 mt-4 bg-slate-900 p-2 rounded border border-slate-800">
|
| 328 |
+
<input type="checkbox" name="include_comments" id="include_comments" value="true" className="rounded bg-slate-800 border-slate-700 text-indigo-500 focus:ring-offset-0 focus:ring-0" />
|
| 329 |
+
<label htmlFor="include_comments" className="text-xs text-slate-400 select-none cursor-pointer">Include Reasoning (Detailed Schema)</label>
|
| 330 |
+
</div>
|
| 331 |
+
</form>
|
| 332 |
+
</div>
|
| 333 |
+
|
| 334 |
+
<div className="p-6 border-t border-slate-800/60 bg-[#0c0c0e]">
|
| 335 |
+
{activeTab === 'queue' ? (
|
| 336 |
+
<div className="flex gap-2">
|
| 337 |
+
<button onClick={handleStartQueue} disabled={isProcessing} className={`flex-1 py-3 rounded-lg font-bold text-xs flex items-center justify-center gap-2 ${isProcessing ? 'bg-slate-800 text-slate-400' : 'bg-emerald-600 hover:bg-emerald-500 text-white'}`}>
|
| 338 |
+
<Play className="w-4 h-4" /> Start Batch
|
| 339 |
+
</button>
|
| 340 |
+
{isProcessing && (
|
| 341 |
+
<button onClick={handleStopQueue} className="px-4 bg-red-900/50 text-red-400 border border-red-900 rounded-lg hover:bg-red-900/80">
|
| 342 |
+
<StopCircle className="w-4 h-4" />
|
| 343 |
+
</button>
|
| 344 |
+
)}
|
| 345 |
+
</div>
|
| 346 |
+
) : activeTab === 'manual' ? (
|
| 347 |
+
<button type="submit" form="control-form" className="w-full py-3 bg-indigo-600 hover:bg-indigo-500 rounded-lg text-xs font-bold text-white flex justify-center gap-2"><Play className="w-4 h-4"/> Run Labeler</button>
|
| 348 |
+
) : (
|
| 349 |
+
<button onClick={() => setRefreshTrigger(x=>x+1)} className="w-full py-3 bg-slate-800 hover:bg-slate-700 rounded-lg text-xs font-bold text-white">Refresh List</button>
|
| 350 |
+
)}
|
| 351 |
+
</div>
|
| 352 |
+
</div>
|
| 353 |
+
|
| 354 |
+
{/* RIGHT PANEL */}
|
| 355 |
+
<div className="flex-1 flex flex-col bg-[#09090b] overflow-hidden relative">
|
| 356 |
+
<div className="h-16 border-b border-slate-800/60 bg-[#09090b]/80 backdrop-blur flex justify-between items-center px-8 z-10">
|
| 357 |
+
<span className="text-xs font-mono font-medium text-slate-400 tracking-wide">{activeTab.toUpperCase()} VIEW</span>
|
| 358 |
+
<button onClick={() => setLogs('')} className="text-[10px] text-slate-600 hover:text-slate-400">Clear Logs</button>
|
| 359 |
+
</div>
|
| 360 |
+
|
| 361 |
+
<div className="flex-1 p-6 overflow-hidden flex flex-col z-10">
|
| 362 |
+
{activeTab === 'queue' && (
|
| 363 |
+
<div className="flex-1 flex flex-col gap-4">
|
| 364 |
+
<div className="h-1/2 bg-slate-900/30 border border-slate-800 rounded-xl overflow-auto custom-scrollbar">
|
| 365 |
+
<table className="w-full text-left text-xs text-slate-400">
|
| 366 |
+
<thead className="bg-slate-900 text-slate-300 sticky top-0"><tr><th className="p-3">Link</th><th className="p-3">Ingested</th><th className="p-3">Status</th><th className="p-3 text-right">Action</th></tr></thead>
|
| 367 |
+
<tbody className="divide-y divide-slate-800/50">
|
| 368 |
+
{queueList.map((q,i) => (
|
| 369 |
+
<tr key={i} className="hover:bg-white/5">
|
| 370 |
+
<td className="p-3 truncate max-w-[300px] text-sky-500">
|
| 371 |
+
<a href={q.link} target="_blank" rel="noopener noreferrer" className="hover:underline flex items-center gap-1">
|
| 372 |
+
{q.link} <ExternalLink className="w-3 h-3"/>
|
| 373 |
+
</a>
|
| 374 |
+
</td>
|
| 375 |
+
<td className="p-3 text-slate-500">{q.timestamp}</td>
|
| 376 |
+
<td className="p-3"><span className={`px-2 py-0.5 rounded ${q.status==='Processed' ? 'bg-emerald-500/10 text-emerald-500' : 'bg-amber-500/10 text-amber-500'}`}>{q.status}</span></td>
|
| 377 |
+
<td className="p-3 text-right">
|
| 378 |
+
<button onClick={()=>handleQueueDelete(q.link)} className="text-slate-500 hover:text-red-500 p-1">
|
| 379 |
+
<Trash2 className="w-4 h-4"/>
|
| 380 |
+
</button>
|
| 381 |
+
</td>
|
| 382 |
+
</tr>
|
| 383 |
+
))}
|
| 384 |
+
{queueList.length===0 && <tr><td colSpan={4} className="p-4 text-center">Queue empty. Upload CSV or use Extension.</td></tr>}
|
| 385 |
+
</tbody>
|
| 386 |
+
</table>
|
| 387 |
+
</div>
|
| 388 |
+
<div className="h-1/2 bg-black/40 border border-slate-800 rounded-xl p-4 font-mono text-[11px] text-slate-300 overflow-auto">
|
| 389 |
+
<pre>{logs}</pre>
|
| 390 |
+
<div ref={logEndRef} />
|
| 391 |
+
</div>
|
| 392 |
+
</div>
|
| 393 |
+
)}
|
| 394 |
+
|
| 395 |
+
{activeTab === 'analytics' && (
|
| 396 |
+
<div className="flex-1 overflow-auto custom-scrollbar flex flex-col gap-6">
|
| 397 |
+
{/* Header Stats */}
|
| 398 |
+
<div className="grid grid-cols-4 gap-4">
|
| 399 |
+
<div className="bg-slate-900 border border-slate-800 rounded-lg p-4">
|
| 400 |
+
<h4 className="text-xs text-slate-500 uppercase font-bold">Total Verified</h4>
|
| 401 |
+
<div className="text-2xl font-bold text-white mt-1">{comparisonList.length}</div>
|
| 402 |
+
</div>
|
| 403 |
+
<div className="bg-slate-900 border border-slate-800 rounded-lg p-4">
|
| 404 |
+
<h4 className="text-xs text-slate-500 uppercase font-bold">MAE (Mean Err)</h4>
|
| 405 |
+
<div className="text-2xl font-bold text-sky-400 mt-1">{stats.avgMAE}</div>
|
| 406 |
+
<div className="text-[10px] text-slate-500">Average absolute deviation</div>
|
| 407 |
+
</div>
|
| 408 |
+
<div className="bg-slate-900 border border-slate-800 rounded-lg p-4">
|
| 409 |
+
<h4 className="text-xs text-slate-500 uppercase font-bold">AI Bias</h4>
|
| 410 |
+
<div className="text-2xl font-bold text-amber-400 mt-1">{Number(stats.bias) > 0 ? "+" : ""}{stats.bias}</div>
|
| 411 |
+
<div className="text-[10px] text-slate-500">Positive = AI scores higher than human</div>
|
| 412 |
+
</div>
|
| 413 |
+
</div>
|
| 414 |
+
|
| 415 |
+
{/* Comparison Chart List */}
|
| 416 |
+
<div className="bg-slate-900/30 border border-slate-800 rounded-xl overflow-hidden">
|
| 417 |
+
<div className="p-4 bg-slate-950 border-b border-slate-800 flex justify-between items-center">
|
| 418 |
+
<h3 className="text-sm font-bold text-white flex items-center gap-2">
|
| 419 |
+
<Scale className="w-4 h-4 text-indigo-500"/> Verification Showcase
|
| 420 |
+
</h3>
|
| 421 |
+
</div>
|
| 422 |
+
<table className="w-full text-left text-xs text-slate-400">
|
| 423 |
+
<thead className="bg-slate-900 text-slate-300">
|
| 424 |
+
<tr>
|
| 425 |
+
<th className="p-3 w-1/4">Video / Link</th>
|
| 426 |
+
<th className="p-3 w-1/2">Score Comparison (AI vs Manual)</th>
|
| 427 |
+
<th className="p-3 text-right">Delta</th>
|
| 428 |
+
</tr>
|
| 429 |
+
</thead>
|
| 430 |
+
<tbody className="divide-y divide-slate-800/50">
|
| 431 |
+
{comparisonList.map((item, i) => (
|
| 432 |
+
<tr key={i} className="hover:bg-white/5">
|
| 433 |
+
<td className="p-3">
|
| 434 |
+
<div className="text-sky-500 truncate max-w-[200px]" title={item.link}>
|
| 435 |
+
<a href={item.link} target="_blank" rel="noopener noreferrer" className="hover:underline flex items-center gap-1">
|
| 436 |
+
{item.link} <ExternalLink className="w-3 h-3"/>
|
| 437 |
+
</a>
|
| 438 |
+
</div>
|
| 439 |
+
<div className="text-[10px] text-slate-600 font-mono">{item.id}</div>
|
| 440 |
+
</td>
|
| 441 |
+
<td className="p-3">
|
| 442 |
+
{/* Visual Integrity Bar */}
|
| 443 |
+
<div className="flex items-center gap-2 mb-1">
|
| 444 |
+
<span className="w-16 text-[10px] text-slate-500">Visual</span>
|
| 445 |
+
<div className="flex-1 h-2 bg-slate-800 rounded-full overflow-hidden flex relative">
|
| 446 |
+
<div className="absolute top-0 bottom-0 bg-indigo-500/50" style={{left:0, width:`${item.scores.visual.ai * 10}%`}}></div>
|
| 447 |
+
<div className="absolute top-0 bottom-0 border-l-2 border-emerald-500 h-full" style={{left:`${item.scores.visual.manual * 10}%`}}></div>
|
| 448 |
+
</div>
|
| 449 |
+
<span className="text-[10px] font-mono"><span className="text-indigo-400">{item.scores.visual.ai}</span> / <span className="text-emerald-400">{item.scores.visual.manual}</span></span>
|
| 450 |
+
</div>
|
| 451 |
+
{/* Final Veracity Bar */}
|
| 452 |
+
<div className="flex items-center gap-2">
|
| 453 |
+
<span className="w-16 text-[10px] text-slate-500 font-bold">Final</span>
|
| 454 |
+
<div className="flex-1 h-3 bg-slate-800 rounded-full overflow-hidden flex relative">
|
| 455 |
+
{/* AI Score */}
|
| 456 |
+
<div className="bg-indigo-600 h-full" style={{width:`${item.scores.final.ai}%`}}></div>
|
| 457 |
+
</div>
|
| 458 |
+
{/* Manual Marker */}
|
| 459 |
+
<div className="w-1 h-3 bg-emerald-500 -ml-1 z-10"></div>
|
| 460 |
+
<span className="text-[10px] font-mono"><span className="text-indigo-400">{item.scores.final.ai}</span> / <span className="text-emerald-400">{item.scores.final.manual}</span></span>
|
| 461 |
+
</div>
|
| 462 |
+
</td>
|
| 463 |
+
<td className="p-3 text-right">
|
| 464 |
+
<div className={`font-bold ${Math.abs(item.deltas.final) > 20 ? 'text-red-500' : 'text-slate-400'}`}>
|
| 465 |
+
{item.deltas.final > 0 ? "+" : ""}{item.deltas.final}
|
| 466 |
+
</div>
|
| 467 |
+
</td>
|
| 468 |
+
</tr>
|
| 469 |
+
))}
|
| 470 |
+
</tbody>
|
| 471 |
+
</table>
|
| 472 |
+
</div>
|
| 473 |
+
</div>
|
| 474 |
+
)}
|
| 475 |
+
|
| 476 |
+
{activeTab === 'workflow' && (
|
| 477 |
+
<div className="flex-1 overflow-auto custom-scrollbar flex flex-col gap-6">
|
| 478 |
+
|
| 479 |
+
{/* Summary Cards */}
|
| 480 |
+
<div className="grid grid-cols-3 gap-4">
|
| 481 |
+
<div className="bg-slate-900 border border-slate-800 rounded-lg p-4">
|
| 482 |
+
<h4 className="text-xs text-slate-500 uppercase font-bold">Pending Manual Review</h4>
|
| 483 |
+
<div className="text-2xl font-bold text-white mt-1">{pendingVerification.length}</div>
|
| 484 |
+
<div className="text-[10px] text-amber-500 mt-1">Ready for verification</div>
|
| 485 |
+
</div>
|
| 486 |
+
<div className="bg-slate-900 border border-slate-800 rounded-lg p-4">
|
| 487 |
+
<h4 className="text-xs text-slate-500 uppercase font-bold">Ingestion Queue</h4>
|
| 488 |
+
<div className="text-2xl font-bold text-white mt-1">{pendingAI.length}</div>
|
| 489 |
+
<div className="text-[10px] text-sky-500 mt-1">Waiting for AI labeling</div>
|
| 490 |
+
</div>
|
| 491 |
+
<div className="bg-slate-900 border border-slate-800 rounded-lg p-4">
|
| 492 |
+
<h4 className="text-xs text-slate-500 uppercase font-bold">Total Verified</h4>
|
| 493 |
+
<div className="text-2xl font-bold text-white mt-1">{completed.length}</div>
|
| 494 |
+
<div className="text-[10px] text-emerald-500 mt-1">Manually confirmed</div>
|
| 495 |
+
</div>
|
| 496 |
+
</div>
|
| 497 |
+
|
| 498 |
+
{/* Main Section: Ready for Manual Verification */}
|
| 499 |
+
<div className="bg-slate-900/30 border border-slate-800 rounded-xl overflow-hidden flex flex-col min-h-[300px]">
|
| 500 |
+
<div className="p-4 bg-slate-950 border-b border-slate-800 flex justify-between items-center">
|
| 501 |
+
<h3 className="text-sm font-bold text-white flex items-center gap-2">
|
| 502 |
+
<AlertCircle className="w-4 h-4 text-amber-500"/> Needs Verification (Priority)
|
| 503 |
+
</h3>
|
| 504 |
+
<span className="text-xs text-slate-500">AI labeled links missing manual review.</span>
|
| 505 |
+
</div>
|
| 506 |
+
<div className="flex-1 overflow-auto">
|
| 507 |
+
<table className="w-full text-left text-xs text-slate-400">
|
| 508 |
+
<thead className="bg-slate-900 text-slate-300 sticky top-0">
|
| 509 |
+
<tr>
|
| 510 |
+
<th className="p-4">Link</th>
|
| 511 |
+
<th className="p-4">AI Score</th>
|
| 512 |
+
<th className="p-4 text-right">Action</th>
|
| 513 |
+
</tr>
|
| 514 |
+
</thead>
|
| 515 |
+
<tbody className="divide-y divide-slate-800/50">
|
| 516 |
+
{pendingVerification.map((row, i) => (
|
| 517 |
+
<tr key={i} className="hover:bg-white/5 cursor-pointer" onClick={() => openLabelingModal(row)}>
|
| 518 |
+
<td className="p-4 truncate max-w-[400px] text-sky-400">
|
| 519 |
+
<a href={row.link} target="_blank" rel="noopener noreferrer" onClick={(e) => e.stopPropagation()} className="hover:underline flex items-center gap-1">
|
| 520 |
+
{row.link} <ExternalLink className="w-3 h-3"/>
|
| 521 |
+
</a>
|
| 522 |
+
</td>
|
| 523 |
+
<td className="p-4">
|
| 524 |
+
<span className="flex items-center gap-1 text-emerald-400">
|
| 525 |
+
<BrainCircuit className="w-3 h-3"/> {row.ai_data?.final}
|
| 526 |
+
</span>
|
| 527 |
+
</td>
|
| 528 |
+
<td className="p-4 text-right">
|
| 529 |
+
<button className="px-4 py-1.5 bg-indigo-600 text-white rounded font-bold hover:bg-indigo-500 shadow-lg shadow-indigo-500/20">Verify</button>
|
| 530 |
+
</td>
|
| 531 |
+
</tr>
|
| 532 |
+
))}
|
| 533 |
+
{pendingVerification.length === 0 && (
|
| 534 |
+
<tr><td colSpan={3} className="p-8 text-center text-emerald-500">No pending verifications. Good job!</td></tr>
|
| 535 |
+
)}
|
| 536 |
+
</tbody>
|
| 537 |
+
</table>
|
| 538 |
+
</div>
|
| 539 |
+
</div>
|
| 540 |
+
|
| 541 |
+
{/* Secondary: Ingestion Queue */}
|
| 542 |
+
<div className="bg-slate-900/30 border border-slate-800 rounded-xl overflow-hidden">
|
| 543 |
+
<div className="p-3 bg-slate-950/50 border-b border-slate-800">
|
| 544 |
+
<h3 className="text-xs font-bold text-slate-400 uppercase">Ingestion Queue (Waiting for AI)</h3>
|
| 545 |
+
</div>
|
| 546 |
+
<table className="w-full text-left text-xs text-slate-500">
|
| 547 |
+
<tbody className="divide-y divide-slate-800/50">
|
| 548 |
+
{pendingAI.slice(0, 10).map((row, i) => (
|
| 549 |
+
<tr key={i}>
|
| 550 |
+
<td className="p-3 truncate max-w-[400px] opacity-60">
|
| 551 |
+
<a href={row.link} target="_blank" rel="noopener noreferrer" className="hover:underline hover:text-sky-400 flex items-center gap-1">
|
| 552 |
+
{row.link} <ExternalLink className="w-3 h-3"/>
|
| 553 |
+
</a>
|
| 554 |
+
</td>
|
| 555 |
+
<td className="p-3 text-right">
|
| 556 |
+
<span className="px-2 py-0.5 bg-slate-800 rounded text-slate-400">Pending AI</span>
|
| 557 |
+
</td>
|
| 558 |
+
</tr>
|
| 559 |
+
))}
|
| 560 |
+
{pendingAI.length > 10 && <tr><td colSpan={2} className="p-3 text-center opacity-50">...and {pendingAI.length - 10} more</td></tr>}
|
| 561 |
+
</tbody>
|
| 562 |
+
</table>
|
| 563 |
+
</div>
|
| 564 |
+
|
| 565 |
+
{/* Verified History */}
|
| 566 |
+
<div className="bg-slate-900/30 border border-slate-800 rounded-xl overflow-hidden mb-8">
|
| 567 |
+
<div className="p-3 bg-slate-950/50 border-b border-slate-800">
|
| 568 |
+
<h3 className="text-xs font-bold text-slate-400 uppercase">Verification History</h3>
|
| 569 |
+
</div>
|
| 570 |
+
<table className="w-full text-left text-xs text-slate-500">
|
| 571 |
+
<tbody className="divide-y divide-slate-800/50">
|
| 572 |
+
{completed.slice(0, 10).map((row, i) => (
|
| 573 |
+
<tr key={i}>
|
| 574 |
+
<td className="p-3 truncate max-w-[400px] opacity-60 text-emerald-500/50">
|
| 575 |
+
<a href={row.link} target="_blank" rel="noopener noreferrer" className="hover:underline hover:text-emerald-400 flex items-center gap-1">
|
| 576 |
+
{row.link} <ExternalLink className="w-3 h-3"/>
|
| 577 |
+
</a>
|
| 578 |
+
</td>
|
| 579 |
+
<td className="p-3 opacity-60">Tags: {row.manual_tags || "-"}</td>
|
| 580 |
+
<td className="p-3 text-right">
|
| 581 |
+
<CheckSquare className="w-4 h-4 text-emerald-600 inline"/>
|
| 582 |
+
</td>
|
| 583 |
+
</tr>
|
| 584 |
+
))}
|
| 585 |
+
</tbody>
|
| 586 |
+
</table>
|
| 587 |
+
</div>
|
| 588 |
+
</div>
|
| 589 |
+
)}
|
| 590 |
+
|
| 591 |
+
{activeTab === 'moderation' && (
|
| 592 |
+
<div className="flex-1 bg-slate-900/30 border border-slate-800 rounded-xl overflow-auto custom-scrollbar">
|
| 593 |
+
<table className="w-full text-left text-xs text-slate-400">
|
| 594 |
+
<thead className="bg-slate-900 text-slate-300 sticky top-0">
|
| 595 |
+
<tr><th className="p-4">ID / Source</th><th className="p-4">Link / Caption</th><th className="p-4">Scores (V/A/F)</th><th className="p-4">Status</th><th className="p-4 text-right">Action</th></tr>
|
| 596 |
+
</thead>
|
| 597 |
+
<tbody className="divide-y divide-slate-800/50">
|
| 598 |
+
{dataList.map((row, i) => (
|
| 599 |
+
<React.Fragment key={i}>
|
| 600 |
+
<tr onClick={() => setExpandedRow(expandedRow === row.id ? null : row.id)} className={`hover:bg-white/5 cursor-pointer ${expandedRow===row.id?'bg-white/5':''}`}>
|
| 601 |
+
<td className="p-4">
|
| 602 |
+
<div className="font-mono text-indigo-400 font-bold">{row.id || 'N/A'}</div>
|
| 603 |
+
<span className="text-[9px] uppercase px-1.5 py-0.5 rounded bg-slate-800 text-slate-500">{row.source_type}</span>
|
| 604 |
+
</td>
|
| 605 |
+
<td className="p-4">
|
| 606 |
+
<div className="truncate max-w-[250px] text-white mb-1" title={row.caption}>{row.caption || 'No Caption'}</div>
|
| 607 |
+
<div className="truncate max-w-[250px] text-[10px] text-slate-600">
|
| 608 |
+
<a href={row.link} target="_blank" rel="noopener noreferrer" onClick={(e) => e.stopPropagation()} className="hover:text-indigo-400 hover:underline flex items-center gap-1">
|
| 609 |
+
{row.link} <ExternalLink className="w-3 h-3"/>
|
| 610 |
+
</a>
|
| 611 |
+
</div>
|
| 612 |
+
{row.tags && <div className="mt-1 text-[9px] text-emerald-400 font-mono">{row.tags}</div>}
|
| 613 |
+
</td>
|
| 614 |
+
<td className="p-4 font-mono">
|
| 615 |
+
<span title="Visual" className="text-emerald-400">{row.visual_integrity_score}</span> /
|
| 616 |
+
<span title="Audio" className="text-sky-400">{row.audio_integrity_score}</span> /
|
| 617 |
+
<span title="Final" className="text-white font-bold">{row.final_veracity_score}</span>
|
| 618 |
+
</td>
|
| 619 |
+
<td className="p-4">
|
| 620 |
+
{row.source_type === 'auto' && (
|
| 621 |
+
row.manual_verification_status === 'Verified' ?
|
| 622 |
+
<span className="text-emerald-500 flex items-center gap-1"><CheckCircle className="w-3 h-3"/> Verified</span> :
|
| 623 |
+
<div className="flex items-center gap-2">
|
| 624 |
+
<span className="text-amber-500">Need Manual</span>
|
| 625 |
+
<button onClick={(e) => { e.stopPropagation(); openLabelingModal(row); }}
|
| 626 |
+
className="px-2 py-1 bg-indigo-600 hover:bg-indigo-500 text-white rounded text-[10px]">
|
| 627 |
+
Verify
|
| 628 |
+
</button>
|
| 629 |
+
</div>
|
| 630 |
+
)}
|
| 631 |
+
</td>
|
| 632 |
+
<td className="p-4 text-right"><button onClick={(e)=>{e.stopPropagation(); handleDelete(row.id, row.link)}} className="hover:text-red-400 p-2"><Trash2 className="w-4 h-4"/></button></td>
|
| 633 |
+
</tr>
|
| 634 |
+
{expandedRow === row.id && (
|
| 635 |
+
<tr><td colSpan={5} className="bg-slate-950 p-6 border-b border-slate-800">
|
| 636 |
+
<div className="grid grid-cols-2 gap-6">
|
| 637 |
+
<div className="space-y-4">
|
| 638 |
+
<div>
|
| 639 |
+
<h4 className="text-indigo-400 text-[10px] font-bold uppercase mb-2">Prompt Used</h4>
|
| 640 |
+
<div className="bg-black/30 border border-slate-800 rounded p-2 h-32 overflow-auto text-[9px] font-mono text-slate-500">
|
| 641 |
+
{row.json_data?.meta_info?.prompt_used || "Prompt not saved in legacy data."}
|
| 642 |
+
</div>
|
| 643 |
+
</div>
|
| 644 |
+
<div>
|
| 645 |
+
<h4 className="text-indigo-400 text-[10px] font-bold uppercase mb-2">Reasoning</h4>
|
| 646 |
+
<p className="text-sm text-slate-300 italic">{row.final_reasoning}</p>
|
| 647 |
+
</div>
|
| 648 |
+
</div>
|
| 649 |
+
<div>
|
| 650 |
+
<h4 className="text-indigo-400 text-[10px] font-bold uppercase mb-2">Raw JSON Data</h4>
|
| 651 |
+
<div className="h-[300px] overflow-auto border border-slate-800 rounded p-3 bg-black/50 custom-scrollbar">
|
| 652 |
+
<pre className="text-[10px] font-mono text-emerald-500">{JSON.stringify(row.json_data || row, null, 2)}</pre>
|
| 653 |
+
</div>
|
| 654 |
+
</div>
|
| 655 |
+
</div>
|
| 656 |
+
</td></tr>
|
| 657 |
+
)}
|
| 658 |
+
</React.Fragment>
|
| 659 |
+
))}
|
| 660 |
+
</tbody>
|
| 661 |
+
</table>
|
| 662 |
+
</div>
|
| 663 |
+
)}
|
| 664 |
+
|
| 665 |
+
{activeTab === 'manual' && (
|
| 666 |
+
<div className="flex-1 bg-black/40 border border-slate-800 rounded-xl p-4 font-mono text-[11px] text-slate-300 overflow-auto">
|
| 667 |
+
<pre>{logs}</pre>
|
| 668 |
+
<div ref={logEndRef} />
|
| 669 |
+
</div>
|
| 670 |
+
)}
|
| 671 |
+
</div>
|
| 672 |
+
|
| 673 |
+
{/* MANUAL LABELING MODAL */}
|
| 674 |
+
{labelingItem && (
|
| 675 |
+
<div className="absolute inset-0 z-50 bg-black/80 backdrop-blur-sm flex items-center justify-center p-8">
|
| 676 |
+
<div className="bg-[#0f172a] border border-slate-700 rounded-xl w-full max-w-4xl h-full max-h-full overflow-y-auto shadow-2xl flex flex-col">
|
| 677 |
+
<div className="p-4 border-b border-slate-800 flex justify-between items-center bg-[#1e293b]">
|
| 678 |
+
<h2 className="text-lg font-bold text-white">Manual Verification</h2>
|
| 679 |
+
<button onClick={() => setLabelingItem(null)} className="text-slate-400 hover:text-white"><XCircle className="w-6 h-6"/></button>
|
| 680 |
+
</div>
|
| 681 |
+
<div className="p-6 flex-1 overflow-y-auto">
|
| 682 |
+
<div className="mb-6 bg-slate-900 p-4 rounded border border-slate-800">
|
| 683 |
+
<div className="flex items-center gap-2 mb-2">
|
| 684 |
+
<a href={labelingItem.link} target="_blank" rel="noopener noreferrer" className="text-xs text-indigo-400 font-mono hover:underline flex items-center gap-2">
|
| 685 |
+
{labelingItem.link} <ExternalLink className="w-3 h-3"/>
|
| 686 |
+
</a>
|
| 687 |
+
</div>
|
| 688 |
+
{labelingItem.ai_data?.reasoning && (
|
| 689 |
+
<div className="text-xs text-slate-500 italic border-l-2 border-indigo-500 pl-3">
|
| 690 |
+
"AI Reasoning: {labelingItem.ai_data.reasoning}"
|
| 691 |
+
</div>
|
| 692 |
+
)}
|
| 693 |
+
{!labelingItem.ai_data && labelingItem.final_reasoning && (
|
| 694 |
+
<div className="text-xs text-slate-500 italic border-l-2 border-indigo-500 pl-3">
|
| 695 |
+
"AI Reasoning: {labelingItem.final_reasoning}"
|
| 696 |
+
</div>
|
| 697 |
+
)}
|
| 698 |
+
</div>
|
| 699 |
+
|
| 700 |
+
<div className="grid grid-cols-2 gap-8">
|
| 701 |
+
<div className="space-y-4">
|
| 702 |
+
<h3 className="text-xs font-bold text-slate-400 uppercase">Veracity Vectors (1-10)</h3>
|
| 703 |
+
{['visual', 'audio', 'source', 'logic'].map(k => (
|
| 704 |
+
<div key={k} className="flex items-center gap-4">
|
| 705 |
+
<label className="w-24 text-xs capitalize text-slate-300">{k}</label>
|
| 706 |
+
<input type="range" min="1" max="10" value={(manualForm as any)[k]}
|
| 707 |
+
onChange={e => setManualForm({...manualForm, [k]: parseInt(e.target.value)})}
|
| 708 |
+
className="flex-1 accent-indigo-500" />
|
| 709 |
+
<span className="w-6 text-center text-sm font-bold text-indigo-400">{(manualForm as any)[k]}</span>
|
| 710 |
+
</div>
|
| 711 |
+
))}
|
| 712 |
+
|
| 713 |
+
<h3 className="text-xs font-bold text-slate-400 uppercase mt-6">Modalities (1-10)</h3>
|
| 714 |
+
{['va', 'vc', 'ac'].map(k => (
|
| 715 |
+
<div key={k} className="flex items-center gap-4">
|
| 716 |
+
<label className="w-24 text-xs uppercase text-slate-300">{k}</label>
|
| 717 |
+
<input type="range" min="1" max="10" value={(manualForm as any)[k]}
|
| 718 |
+
onChange={e => setManualForm({...manualForm, [k]: parseInt(e.target.value)})}
|
| 719 |
+
className="flex-1 accent-emerald-500" />
|
| 720 |
+
<span className="w-6 text-center text-sm font-bold text-emerald-400">{(manualForm as any)[k]}</span>
|
| 721 |
+
</div>
|
| 722 |
+
))}
|
| 723 |
+
</div>
|
| 724 |
+
|
| 725 |
+
<div className="space-y-4">
|
| 726 |
+
<div>
|
| 727 |
+
<label className="text-xs font-bold text-slate-400 uppercase block mb-2">Final Veracity Score (1-100)</label>
|
| 728 |
+
<div className="flex items-center gap-4">
|
| 729 |
+
<input type="range" min="1" max="100" value={manualForm.final}
|
| 730 |
+
onChange={e => setManualForm({...manualForm, final: parseInt(e.target.value)})}
|
| 731 |
+
className="flex-1 accent-amber-500 h-2" />
|
| 732 |
+
<span className="text-xl font-bold text-amber-500">{manualForm.final}</span>
|
| 733 |
+
</div>
|
| 734 |
+
</div>
|
| 735 |
+
|
| 736 |
+
<div>
|
| 737 |
+
<label className="text-xs font-bold text-slate-400 uppercase block mb-2">Reasoning</label>
|
| 738 |
+
<textarea className="w-full bg-slate-900 border border-slate-700 rounded p-3 text-xs text-white h-24"
|
| 739 |
+
value={manualForm.reasoning} onChange={e => setManualForm({...manualForm, reasoning: e.target.value})}
|
| 740 |
+
placeholder="Why did you assign these scores?"
|
| 741 |
+
/>
|
| 742 |
+
</div>
|
| 743 |
+
|
| 744 |
+
<div>
|
| 745 |
+
<label className="text-xs font-bold text-slate-400 uppercase block mb-2">Tags (comma separated)</label>
|
| 746 |
+
<input type="text" className="w-full bg-slate-900 border border-slate-700 rounded p-3 text-xs text-white"
|
| 747 |
+
value={manualForm.tags} onChange={e => setManualForm({...manualForm, tags: e.target.value})}
|
| 748 |
+
placeholder="political, viral, deepfake..."
|
| 749 |
+
/>
|
| 750 |
+
</div>
|
| 751 |
+
</div>
|
| 752 |
+
</div>
|
| 753 |
+
</div>
|
| 754 |
+
<div className="p-4 border-t border-slate-800 bg-[#1e293b] flex justify-end gap-3">
|
| 755 |
+
<button onClick={() => setLabelingItem(null)} className="px-4 py-2 text-slate-400 hover:text-white">Cancel</button>
|
| 756 |
+
<button onClick={submitManualLabel} className="px-6 py-2 bg-indigo-600 hover:bg-indigo-500 text-white font-bold rounded">Save Manual Label</button>
|
| 757 |
+
</div>
|
| 758 |
+
</div>
|
| 759 |
+
</div>
|
| 760 |
+
)}
|
| 761 |
+
</div>
|
| 762 |
+
</div>
|
| 763 |
+
)
|
| 764 |
+
}
|
| 765 |
+
|
| 766 |
+
export default App
|
frontend/src/index.css
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@tailwind base;
@tailwind components;
@tailwind utilities;

/* App shell: dark slate theme with the Inter typeface. The layout manages
   its own scrollable panes, so the body itself never scrolls. */
body {
  background-color: #0f172a;
  color: #e2e8f0;
  font-family: 'Inter', sans-serif;
  overflow: hidden;
}

/* Slim dark scrollbars (WebKit engines only; these selectors are global,
   so they style every scrollable element, not just .custom-scrollbar). */
::-webkit-scrollbar {
  width: 8px;
}
::-webkit-scrollbar-track {
  background: #1e293b;
}
::-webkit-scrollbar-thumb {
  background: #475569;
  border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
  background: #64748b;
}
|
frontend/src/main.tsx
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Application entry point: mount the root <App /> component into #root.
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App.tsx'
import './index.css'

// Non-null assertion on getElementById: assumes index.html provides
// <div id="root"> — confirm against the HTML template if it changes.
ReactDOM.createRoot(document.getElementById('root')!).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>,
)
|
frontend/tailwind.config.js
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/** @type {import('tailwindcss').Config} */
|
| 2 |
+
export default {
|
| 3 |
+
content: [
|
| 4 |
+
"./index.html",
|
| 5 |
+
"./src/**/*.{js,ts,jsx,tsx}",
|
| 6 |
+
],
|
| 7 |
+
theme: {
|
| 8 |
+
extend: {
|
| 9 |
+
colors: {
|
| 10 |
+
'vchat-bg': '#0f172a',
|
| 11 |
+
'vchat-panel': '#1e293b',
|
| 12 |
+
'vchat-accent': '#6366f1',
|
| 13 |
+
},
|
| 14 |
+
animation: {
|
| 15 |
+
'pulse-slow': 'pulse 3s cubic-bezier(0.4, 0, 0.6, 1) infinite',
|
| 16 |
+
}
|
| 17 |
+
},
|
| 18 |
+
},
|
| 19 |
+
plugins: [],
|
| 20 |
+
}
|
frontend/tsconfig.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"target": "ES2020",
|
| 4 |
+
"useDefineForClassFields": true,
|
| 5 |
+
"lib": ["ES2020", "DOM", "DOM.Iterable"],
|
| 6 |
+
"module": "ESNext",
|
| 7 |
+
"skipLibCheck": true,
|
| 8 |
+
"moduleResolution": "bundler",
|
| 9 |
+
"allowImportingTsExtensions": true,
|
| 10 |
+
"resolveJsonModule": true,
|
| 11 |
+
"isolatedModules": true,
|
| 12 |
+
"noEmit": true,
|
| 13 |
+
"jsx": "react-jsx",
|
| 14 |
+
"strict": true,
|
| 15 |
+
"noUnusedLocals": true,
|
| 16 |
+
"noUnusedParameters": true,
|
| 17 |
+
"noFallthroughCasesInSwitch": true
|
| 18 |
+
},
|
| 19 |
+
"include": ["src"],
|
| 20 |
+
"references": [{ "path": "./tsconfig.node.json" }]
|
| 21 |
+
}
|
frontend/tsconfig.node.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"composite": true,
|
| 4 |
+
"skipLibCheck": true,
|
| 5 |
+
"module": "ESNext",
|
| 6 |
+
"moduleResolution": "bundler",
|
| 7 |
+
"allowSyntheticDefaultImports": true
|
| 8 |
+
},
|
| 9 |
+
"include": ["vite.config.ts"]
|
| 10 |
+
}
|
frontend/vite.config.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig } from 'vite'
|
| 2 |
+
import react from '@vitejs/plugin-react'
|
| 3 |
+
|
| 4 |
+
// https://vitejs.dev/config/
|
| 5 |
+
export default defineConfig({
|
| 6 |
+
plugins: [react()],
|
| 7 |
+
})
|
main.go
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
	"os"
	"strings"
)

// main runs the front-of-house HTTP server: API requests are reverse-proxied
// to the local Python FastAPI backend, everything else is served from the
// static frontend build with SPA fallback to index.html.
func main() {
	// The Python FastAPI backend listens locally inside the container.
	backendURL, err := url.Parse("http://127.0.0.1:8001")
	if err != nil {
		log.Fatalf("Invalid Python target URL: %v", err)
	}

	apiProxy := httputil.NewSingleHostReverseProxy(backendURL)

	// Static assets live in a location safe from Docker volume mounts.
	staticPath := "/usr/share/vchat/static"
	staticServer := http.FileServer(http.Dir(staticPath))

	// Route prefixes that belong to the Python API (includes /queue for
	// uploads/list fetches and /workflow for the frontend's JSON polling).
	apiPrefixes := []string{
		"/process",
		"/label_video",
		"/batch_label",
		"/model-architecture",
		"/download-dataset",
		"/extension",
		"/manage",
		"/workflow",
		"/queue",
	}

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		for _, prefix := range apiPrefixes {
			if strings.HasPrefix(r.URL.Path, prefix) {
				log.Printf("Proxying %s to Python Backend...", r.URL.Path)
				apiProxy.ServeHTTP(w, r)
				return
			}
		}

		// SPA routing: if the file does not exist on disk, serve index.html
		// so client-side routes resolve; otherwise serve the file.
		if _, statErr := os.Stat(staticPath + r.URL.Path); os.IsNotExist(statErr) {
			http.ServeFile(w, r, staticPath+"/index.html")
			return
		}
		staticServer.ServeHTTP(w, r)
	})

	port := "8000"
	log.Printf("vChat Modern Server listening on port %s", port)
	log.Printf("Serving static files from %s", staticPath)
	if err := http.ListenAndServe(":"+port, nil); err != nil {
		log.Fatal(err)
	}
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchvision
|
| 3 |
+
torchaudio
|
| 4 |
+
# --- Core Server ---
|
| 5 |
+
fastapi
|
| 6 |
+
uvicorn[standard]
|
| 7 |
+
python-multipart
|
| 8 |
+
requests
|
| 9 |
+
aiofiles
|
| 10 |
+
jinja2
|
| 11 |
+
|
| 12 |
+
# --- AI & Vision Processing ---
|
| 13 |
+
transformers
|
| 14 |
+
accelerate
|
| 15 |
+
Pillow
|
| 16 |
+
packaging
|
| 17 |
+
av
|
| 18 |
+
# Use headless to avoid installing X11/GL libraries in Docker
|
| 19 |
+
opencv-python-headless
|
| 20 |
+
decord
|
| 21 |
+
imageio
|
| 22 |
+
numpy
|
| 23 |
+
einops
|
| 24 |
+
|
| 25 |
+
# --- Google Cloud & APIs ---
|
| 26 |
+
google-generativeai
|
| 27 |
+
google-cloud-aiplatform
|
| 28 |
+
google-genai
|
| 29 |
+
google-adk
|
| 30 |
+
# CORRECTED: The ML Croissant library package name
|
| 31 |
+
mlcroissant
|
| 32 |
+
|
| 33 |
+
# --- Fine-Tuning (LoRA/QLoRA) ---
|
| 34 |
+
peft
|
| 35 |
+
# Bitsandbytes 0.44+
|
| 36 |
+
bitsandbytes
|
| 37 |
+
trl
|
| 38 |
+
datasets
|
| 39 |
+
|
| 40 |
+
# --- Audio ---
|
| 41 |
+
openai-whisper
|
| 42 |
+
yt-dlp
|
src/agents.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from google.adk.agents import Agent, SequentialAgent, ParallelAgent, LoopAgent
|
| 2 |
+
from google.adk.models import Gemini
|
| 3 |
+
from google.genai import types, Client
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Optional
|
| 6 |
+
from functools import cached_property
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
# --- Custom Gemini Class for Explicit API Key ---
class CustomGemini(Gemini):
    """ADK Gemini model that authenticates with an explicitly supplied API key.

    The stock ``Gemini`` presumably builds its client from ambient credentials
    (environment / default auth) — confirm against the ADK version in use.
    This subclass constructs the ``google.genai`` client directly from the
    ``api_key`` field so callers can pass a per-instance key.
    """

    # Explicit key for the Gemini API; None falls back to the client's default
    # authentication behavior.
    api_key: Optional[str] = None

    @cached_property
    def api_client(self) -> Client:
        """Overrides the default api_client to use the provided API key."""
        # cached_property: the Client is constructed once per model instance
        # and reused for all subsequent calls.
        return Client(
            api_key=self.api_key,
            http_options=types.HttpOptions(
                headers=self._tracking_headers(),
                retry_options=self.retry_options,
            )
        )
|
| 24 |
+
|
| 25 |
+
# --- Retry Configuration ---
# Shared HTTP retry policy applied to every Gemini model created below.
retry_config = types.HttpRetryOptions(
    attempts=5,  # Maximum retry attempts
    exp_base=7,  # Delay multiplier
    initial_delay=1,
    http_status_codes=[429, 500, 503, 504],  # Retry on these HTTP errors
)
| 32 |
+
|
| 33 |
+
def get_video_analysis_system(api_key: Optional[str] = None, model_name: str = "gemini-2.1-flash-lite"):
    """
    Factory function to create a configured VideoAnalysisSystem.

    Builds six specialist agents (summary, political bias, criticism level,
    modality consistency, disinformation, sentiment/bias), groups them in a
    ParallelAgent so they run concurrently, and wraps that team in a
    SequentialAgent which is returned as the top-level system.

    Args:
        api_key: Explicit Gemini API key forwarded to CustomGemini; None lets
            the client use its default authentication.
        model_name: Gemini model id shared by every agent.
            NOTE(review): "gemini-2.1-flash-lite" looks like an unusual model
            id — confirm it is valid for the target API version.

    Returns:
        SequentialAgent: the assembled "VideoAnalysisSystem".
    """

    # Use the custom Gemini class with the provided API key
    llm_model = CustomGemini(
        model=model_name,
        api_key=api_key,
        retry_options=retry_config
    )

    # --- Individual Agents ---
    # Each instruction embeds a <thinking> scaffold plus a TOON-style output
    # format line that the downstream parser (toon_parser) consumes.

    # Context Summary Agent: Provides a neutral summary of the video content.
    context_summary_agent = Agent(
        name="ContextSummaryAgent",
        model=llm_model,
        instruction="""
        <thinking>
        1. Scan the video, caption, and transcript for key entities and events.
        2. Identify the main narrative thread or objective of the content.
        3. Synthesize the findings into a neutral, objective summary.
        </thinking>
        Output format: summary: text[1]{text}: "Summary text" """,
    )

    # Political Bias Agent: Identifies political leaning and quantifies bias.
    political_bias_agent = Agent(
        name="PoliticalBiasAgent",
        model=llm_model,
        instruction="""
        <thinking>
        1. Examine the language used for loaded terms or rhetorical devices.
        2. Analyze the context of mentioned political figures or institutions.
        3. Determine the leaning (Left/Right/Center) and evaluate the intensity of bias.
        </thinking>
        Output format: political_bias: details[1]{score,reasoning}: (Int),"Reasoning" """,
    )

    # Criticism Level Agent: Measures the degree of hostility or support in the tone.
    criticism_level_agent = Agent(
        name="CriticismLevelAgent",
        model=llm_model,
        instruction="""
        <thinking>
        1. Assess the emotional valence of the speaker and visual cues.
        2. Identify instances of direct criticism, sarcasm, or praise.
        3. Quantify the overall hostility level on a neutral-to-supportive scale.
        </thinking>
        Output format: criticism_level: details[1]{score,reasoning}: (Int),"Reasoning" """,
    )

    # Modalities Agent: Evaluates the consistency between video, audio, and text.
    modalities_agent = Agent(
        name="ModalitiesAgent",
        model=llm_model,
        instruction="""
        <thinking>
        1. Compare visual events with audio descriptions for sync issues or contradictions.
        2. Verify if the user-provided caption accurately reflects the visual content.
        3. Check the transcript against both the audio and the caption for discrepancies.
        </thinking>
        Output format:
        video_audio_pairing: details[1]{score,reasoning}: (Int),"Reasoning"
        video_caption_pairing: details[1]{score,reasoning}: (Int),"Reasoning"
        audio_caption_pairing: details[1]{score,reasoning}: (Int),"Reasoning" """,
    )

    # Disinformation Agent: Analyzes potential manipulation and threat levels.
    disinformation_agent = Agent(
        name="DisinformationAgent",
        model=llm_model,
        instruction="""
        <thinking>
        1. Search for signs of technical manipulation (deepfakes, AI artifacts).
        2. Analyze the intent behind potential misinformation (Political/Commercial).
        3. Evaluate the risk level and the specific threat vector used.
        </thinking>
        Output format: disinformation_analysis: details[1]{level,intent,threat_vector}: (Int),(Intent),(Vector) """,
    )

    # Sentiment Bias Agent: Captures the overall emotional tone and inherent bias.
    sentiment_bias_agent = Agent(
        name="SentimentBiasAgent",
        model=llm_model,
        instruction="""
        <thinking>
        1. Aggregate the emotional signals from the entire video duration.
        2. Identify recurring biased patterns or slanted perspectives.
        3. Synthesize a comprehensive overview of the sentiment and bias.
        </thinking>
        Output format: sentiment_and_bias: text[1]{text}: "Synthesis text" """,
    )

    # --- Agent Grouping ---

    # The ParallelAgent runs all its sub-agents simultaneously.
    analysis_team = ParallelAgent(
        name="AnalysisTeam",
        sub_agents=[
            context_summary_agent,
            political_bias_agent,
            criticism_level_agent,
            modalities_agent,
            disinformation_agent,
            sentiment_bias_agent
        ],
    )

    # This SequentialAgent defines the high-level workflow.
    system = SequentialAgent(
        name="VideoAnalysisSystem",
        sub_agents=[analysis_team],
    )

    return system
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
    # Smoke test: build the agent system with default settings and confirm
    # construction succeeds (no API call is made here).
    system = get_video_analysis_system()
    print("Video Analysis Agent System created successfully.")
|
src/app.py
ADDED
|
@@ -0,0 +1,987 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import asyncio
|
| 4 |
+
import subprocess
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import logging
|
| 7 |
+
import csv
|
| 8 |
+
import io
|
| 9 |
+
import datetime
|
| 10 |
+
import json
|
| 11 |
+
import hashlib
|
| 12 |
+
import re
|
| 13 |
+
from fastapi import FastAPI, Request, Form, UploadFile, File, Body, HTTPException
|
| 14 |
+
from fastapi.responses import HTMLResponse, StreamingResponse, PlainTextResponse, Response, FileResponse, JSONResponse
|
| 15 |
+
from fastapi.templating import Jinja2Templates
|
| 16 |
+
from fastapi.staticfiles import StaticFiles
|
| 17 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 18 |
+
import yt_dlp
|
| 19 |
+
import inference_logic
|
| 20 |
+
import factuality_logic
|
| 21 |
+
import transcription
|
| 22 |
+
from factuality_logic import parse_vtt
|
| 23 |
+
from toon_parser import parse_veracity_toon
|
| 24 |
+
|
| 25 |
+
# --- Fix for Large CSV Fields ---
# Captions/reasoning text can exceed csv's default field size limit; lift it
# as far as the platform allows (sys.maxsize overflows the C long on some
# 32-bit builds, hence the 2**31 - 1 fallback).
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2147483647)

# Optional Croissant metadata support: try both known package names and
# degrade gracefully when neither is installed.
try:
    import mlcroissant as mlc
    CROISSANT_AVAILABLE = True
except ImportError:
    try:
        import croissant as mlc
        CROISSANT_AVAILABLE = True
    except ImportError:
        mlc = None
        CROISSANT_AVAILABLE = False

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# LITE_MODE skips loading the heavyweight ML models at startup (see startup_event).
LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"

app = FastAPI()

# Wide-open CORS: the API sits behind the Go reverse proxy and is also hit by
# a browser extension from arbitrary origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Resolve the static asset directory: Docker deploy location first, then a
# local frontend build, else create a placeholder so mounting never fails.
STATIC_DIR = "static"
if os.path.isdir("/usr/share/vchat/static"):
    STATIC_DIR = "/usr/share/vchat/static"
elif os.path.isdir("frontend/dist"):
    STATIC_DIR = "frontend/dist"
elif not os.path.isdir(STATIC_DIR):
    os.makedirs(STATIC_DIR, exist_ok=True)
    dummy_index = Path(STATIC_DIR) / "index.html"
    if not dummy_index.exists():
        dummy_index.write_text("<html><body>vChat Backend Running. Access via Port 8005 (Go Server).</body></html>")

app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
templates = Jinja2Templates(directory=STATIC_DIR)

# Working directories for downloaded videos, labels, prompts, responses, and
# generated Croissant metadata.
os.makedirs("data/videos", exist_ok=True)
os.makedirs("data", exist_ok=True)
os.makedirs("data/labels", exist_ok=True)
os.makedirs("data/prompts", exist_ok=True)
os.makedirs("data/responses", exist_ok=True)
os.makedirs("metadata", exist_ok=True)

# Cooperative stop flag for batch processing.
# NOTE(review): assumed from the name — the queue loop that reads this is
# outside this view; confirm before relying on it.
STOP_QUEUE_SIGNAL = False
|
| 77 |
+
|
| 78 |
+
# --- Helper: Robust CSV Reader ---
def robust_read_csv(file_path: Path):
    """
    Tolerant CSV reader that yields one dict per parseable row.

    Rows that raise csv parse errors ('line contains NUL', malformed quoting,
    etc.) are logged and skipped instead of aborting the whole read; a file
    that is missing or cannot be read at all simply yields nothing.
    """
    if not file_path.exists():
        return

    with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
        try:
            rows = csv.DictReader(handle)
            # Pull rows one at a time so a bad row can be skipped individually.
            while True:
                try:
                    record = next(rows)
                except StopIteration:
                    break
                except csv.Error as exc:
                    logging.warning(f"CSV Parse Error in {file_path}: {exc}")
                    continue
                yield record
        except Exception as exc:
            logging.error(f"Failed to initialize CSV reader for {file_path}: {exc}")
            return
|
| 104 |
+
|
| 105 |
+
def ensure_manual_dataset():
    """Create data/manual_dataset.csv with its header row if it is missing."""
    dataset_path = Path("data/manual_dataset.csv")
    if dataset_path.exists():
        return

    # Standard labeling schema plus manual-entry-only fields (stats_*, tags).
    header = [
        "id", "link", "caption", "collecttime", "source",
        "visual_integrity_score", "audio_integrity_score", "source_credibility_score",
        "logical_consistency_score", "emotional_manipulation_score",
        "video_audio_score", "video_caption_score", "audio_caption_score",
        "final_veracity_score", "final_reasoning",
        "stats_likes", "stats_shares", "stats_comments", "stats_platform", "tags"
    ]
    with open(dataset_path, 'w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow(header)
|
| 120 |
+
|
| 121 |
+
@app.on_event("startup")
|
| 122 |
+
async def startup_event():
|
| 123 |
+
logging.info("Application starting up...")
|
| 124 |
+
ensure_manual_dataset()
|
| 125 |
+
if not LITE_MODE:
|
| 126 |
+
try:
|
| 127 |
+
inference_logic.load_models()
|
| 128 |
+
transcription.load_model()
|
| 129 |
+
except Exception as e:
|
| 130 |
+
logging.fatal(f"Could not load models. Error: {e}", exc_info=True)
|
| 131 |
+
else:
|
| 132 |
+
logging.info("Running in LITE mode.")
|
| 133 |
+
|
| 134 |
+
@app.get("/", response_class=HTMLResponse)
|
| 135 |
+
async def read_root(request: Request):
|
| 136 |
+
custom_model_available = False
|
| 137 |
+
if not LITE_MODE:
|
| 138 |
+
custom_model_available = inference_logic.peft_model is not None
|
| 139 |
+
if not (Path(STATIC_DIR) / "index.html").exists():
|
| 140 |
+
return HTMLResponse(content="Frontend not found. Please build frontend or access via Go server.", status_code=404)
|
| 141 |
+
return templates.TemplateResponse("index.html", {
|
| 142 |
+
"request": request,
|
| 143 |
+
"custom_model_available": custom_model_available,
|
| 144 |
+
"lite_mode": LITE_MODE
|
| 145 |
+
})
|
| 146 |
+
|
| 147 |
+
@app.get("/model-architecture", response_class=PlainTextResponse)
|
| 148 |
+
async def get_model_architecture():
|
| 149 |
+
if LITE_MODE: return "Running in LITE mode."
|
| 150 |
+
if inference_logic.base_model: return str(inference_logic.base_model)
|
| 151 |
+
return "Model not loaded."
|
| 152 |
+
|
| 153 |
+
@app.get("/download-dataset")
|
| 154 |
+
async def download_dataset():
|
| 155 |
+
file_path = Path("data/dataset.csv")
|
| 156 |
+
if file_path.exists():
|
| 157 |
+
return FileResponse(path=file_path, filename="dataset.csv", media_type='text/csv')
|
| 158 |
+
return Response("Dataset not found.", status_code=404)
|
| 159 |
+
|
| 160 |
+
progress_message = ""
|
| 161 |
+
def progress_hook(d):
|
| 162 |
+
global progress_message
|
| 163 |
+
if d['status'] == 'downloading':
|
| 164 |
+
progress_message = f"Downloading: {d.get('_percent_str', 'N/A')} at {d.get('_speed_str', 'N/A')}\r"
|
| 165 |
+
elif d['status'] == 'finished':
|
| 166 |
+
progress_message = f"\nDownload finished. Preparing video assets...\n"
|
| 167 |
+
|
| 168 |
+
async def run_subprocess_async(command: list[str]):
    """Run *command* without blocking the event loop and return its stdout text.

    Raises:
        RuntimeError: if the process exits non-zero (stderr in the message).
    """
    proc = await asyncio.create_subprocess_exec(
        *command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    out, err = await proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"Process failed:\n{err.decode()}")
    return out.decode()
|
| 174 |
+
|
| 175 |
+
def extract_tweet_id(url: str) -> str | None:
|
| 176 |
+
match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
|
| 177 |
+
if match: return match.group(1)
|
| 178 |
+
return None
|
| 179 |
+
|
| 180 |
+
def normalize_link(link: str) -> str:
    """Standardize links for comparison.

    Drops the query string, surrounding whitespace, trailing slash, the
    http(s) scheme, and a leading "www." so cosmetic variants of the same URL
    compare equal. Returns "" for falsy input.
    """
    if not link:
        return ""
    s = link.split('?')[0].strip().rstrip('/')
    # Strip only LEADING scheme/www markers. The previous str.replace() calls
    # removed these substrings anywhere in the URL, mangling links whose path
    # happened to contain "www." or a scheme-like fragment.
    s = s.removeprefix('http://').removeprefix('https://')
    s = s.removeprefix('www.')
    return s
|
| 186 |
+
|
| 187 |
+
def check_if_processed(link: str) -> bool:
    """Return True if *link* already appears in the auto or manual dataset.

    A row matches either by normalized URL or, for tweet links, by the
    extracted numeric status id.
    """
    target_id = extract_tweet_id(link)
    normalized = normalize_link(link)

    for dataset_name in ("data/dataset.csv", "data/manual_dataset.csv"):
        for record in robust_read_csv(Path(dataset_name)):
            candidate = normalize_link(record.get('link', ''))
            if candidate and candidate == normalized:
                return True
            if target_id and record.get('id', '') == target_id:
                return True
    return False
|
| 200 |
+
|
| 201 |
+
async def prepare_video_assets_async(url: str) -> dict:
    """Download (or locate) a video, transcode it, and obtain a transcript.

    Args:
        url: Either an http(s) URL (fetched via yt-dlp) or a local file path.

    Returns:
        dict with keys "video" (path to the re-encoded "_fixed.mp4"),
        "transcript" (VTT/transcript path or None) and "metadata"
        (id / link / caption / postdatetime).

    Raises:
        FileNotFoundError: if *url* is a local path that does not exist.
    """
    global progress_message
    loop = asyncio.get_event_loop()
    # Anything without an http(s) scheme is treated as a local file path.
    is_local = not (url.startswith("http://") or url.startswith("https://"))
    video_id = "unknown"
    transcript_path = None

    if is_local:
        original_path = Path(url)
        if not original_path.exists(): raise FileNotFoundError(f"File not found: {url}")
        # Local files get a stable synthetic id derived from their path.
        video_id = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:16]
        metadata = {"id": video_id, "link": url, "caption": original_path.stem}
    else:
        # Prefer the tweet status id as the video id when the URL is a tweet.
        tweet_id = extract_tweet_id(url)
        video_id = tweet_id if tweet_id else hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
        sanitized_check = Path(f"data/videos/{video_id}_fixed.mp4")

        ydl_opts = {
            'format': 'best[ext=mp4]/best',
            'outtmpl': 'data/videos/%(id)s.%(ext)s',
            'progress_hooks': [progress_hook], 'quiet': True, 'noplaylist': True, 'no_overwrites': True,
            'writesubtitles': True, 'writeautomaticsub': True, 'subtitleslangs': ['en']
        }

        if sanitized_check.exists():
            # Already downloaded and transcoded on a previous run.
            original_path = sanitized_check
            metadata = {"id": video_id, "link": url, "caption": "Cached Video"}
        else:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                # extract_info blocks; run it in the default executor so the
                # event loop stays responsive.
                info = await loop.run_in_executor(None, lambda: ydl.extract_info(url, download=True))
                original_path = Path(ydl.prepare_filename(info))
                metadata = {
                    "id": info.get("id", video_id), "link": info.get("webpage_url", url),
                    # ASCII-fold and truncate the caption to keep the CSV clean.
                    "caption": info.get("description", info.get("title", "N/A")).encode('ascii', 'ignore').decode('ascii').strip()[:500],
                    "postdatetime": info.get("upload_date", "N/A")
                }
                video_id = info.get("id", video_id)

        # Pick up any subtitle file yt-dlp wrote (English preferred).
        transcript_path = next(Path("data/videos").glob(f"{video_id}*.en.vtt"), None)
        if not transcript_path: transcript_path = next(Path("data/videos").glob(f"{video_id}*.vtt"), None)

    # Re-encode once to a known-good H.264/AAC MP4 for downstream tooling.
    sanitized_path = Path(f"data/videos/{video_id}_fixed.mp4")
    if not sanitized_path.exists() and original_path.exists():
        await run_subprocess_async(["ffmpeg", "-i", str(original_path), "-c:v", "libx264", "-preset", "fast", "-crf", "23", "-c:a", "aac", "-y", str(sanitized_path)])

    # Extract a 16 kHz mono WAV for transcription. Best-effort: the bare
    # except swallows ffmpeg failures (e.g. videos with no audio stream).
    audio_path = sanitized_path.with_suffix('.wav')
    if not audio_path.exists() and sanitized_path.exists():
        try:
            await run_subprocess_async(["ffmpeg", "-i", str(sanitized_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", str(audio_path)])
        except: pass

    # No platform subtitles: fall back to local speech-to-text (skipped in LITE mode).
    if not transcript_path and audio_path.exists() and not LITE_MODE:
        transcript_path = await loop.run_in_executor(None, transcription.generate_transcript, str(audio_path))

    return {"video": str(sanitized_path), "transcript": str(transcript_path) if transcript_path else None, "metadata": metadata}
|
| 256 |
+
|
| 257 |
+
def safe_int(value):
    """Best-effort conversion of *value* to a non-negative int.

    Every non-digit character is stripped first (so "(9)" -> 9 and
    "1,234" -> 1234); if nothing numeric remains, or the conversion
    fails for any reason, 0 is returned.
    """
    try:
        digits = re.sub(r'[^\d]', '', str(value))
        if not digits:
            return 0
        return int(digits)
    except Exception:
        return 0
|
| 263 |
+
|
| 264 |
+
async def generate_and_save_croissant_metadata(row_data: dict) -> str:
    """Write a minimal Croissant/schema.org JSON descriptor for one labeled row.

    Only the id, link and the two headline scores are exported.

    Args:
        row_data: one labeled dataset row (string/score fields).

    Returns:
        The path of the written JSON file, or "N/A (Error)" on any
        failure — metadata export is best-effort by design.
    """
    try:
        sanitized_data = {
            "id": str(row_data.get("id", "")),
            "link": str(row_data.get("link", "")),
            "visual_integrity_score": safe_int(row_data.get("visual_integrity_score")),
            "final_veracity_score": safe_int(row_data.get("final_veracity_score"))
        }
        video_id = sanitized_data["id"]
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        croissant_json = {
            "@context": "https://schema.org/",
            "@type": "Dataset",
            "name": f"vchat-label-{video_id}",
            "description": f"Veracity analysis labels for video {video_id}",
            "url": sanitized_data["link"],
            "variableMeasured": sanitized_data
        }
        path = Path("metadata") / f"{video_id}_{timestamp}.json"
        # BUGFIX: create the target directory on first use. Previously a
        # missing "metadata" dir made write_text raise, and the broad
        # except below silently returned "N/A (Error)" for every row.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(croissant_json, indent=2))
        return str(path)
    except Exception:
        # Non-critical: never let metadata export break the labeling flow.
        return "N/A (Error)"
|
| 287 |
+
|
| 288 |
+
async def get_labels_for_link(video_url: str, gemini_config: dict, vertex_config: dict, model_selection: str, include_comments: bool, reasoning_method: str = "cot"):
    """Async generator: download a video's assets, run the selected labeling
    pipeline, and stream progress messages followed by one final result.

    Yields:
        str  -- human-readable progress messages, suitable for SSE relay.
        dict -- exactly one terminal value: on success
                {"csv_row": ..., "full_json": ..., "raw_toon": ...};
                on any failure {"error": <message>}.
    """
    try:
        yield f"Downloading assets for {video_url}..."
        # Downloads/caches the video, transcript and metadata (see
        # prepare_video_assets_async earlier in this file).
        paths = await prepare_video_assets_async(video_url)
        video_path = paths["video"]
        transcript_text = parse_vtt(paths["transcript"]) if paths["transcript"] else "No transcript."
        caption = paths["metadata"].get("caption", "")

        yield f"Assets ready. Running inference ({model_selection}, {reasoning_method.upper()})..."
        final_labels = None
        raw_toon = ""
        prompt_used = ""

        # Select backend: Gemini API vs Vertex AI, with matching credentials.
        pipeline = inference_logic.run_gemini_labeling_pipeline if model_selection == 'gemini' else inference_logic.run_vertex_labeling_pipeline
        config = gemini_config if model_selection == 'gemini' else vertex_config

        # The pipeline mixes progress strings with one dict carrying the
        # parsed labels; relay the strings and capture the dict.
        async for msg in pipeline(video_path, caption, transcript_text, config, include_comments, reasoning_method):
            if isinstance(msg, dict) and "parsed_data" in msg:
                final_labels = msg["parsed_data"]
                raw_toon = msg.get("raw_toon", "")
                prompt_used = msg.get("prompt_used", "")
            elif isinstance(msg, str): yield msg

        if not final_labels: raise RuntimeError("No labels generated.")

        # Record provenance of this labeling run alongside the labels.
        final_labels["meta_info"] = {
            "prompt_used": prompt_used,
            "model_selection": model_selection,
            "reasoning_method": reasoning_method
        }

        # Flatten nested label sections into a single CSV-friendly row.
        vec = final_labels.get("veracity_vectors", {})
        mod = final_labels.get("modalities", {})
        fin = final_labels.get("final_assessment", {})
        tags = final_labels.get("tags", [])

        row = {
            "id": paths["metadata"]["id"],
            "link": paths["metadata"]["link"],
            "caption": caption,
            "postdatetime": paths["metadata"].get("postdatetime", ""),
            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "videotranscriptionpath": paths["transcript"] or "",
            "visual_integrity_score": vec.get("visual_integrity_score", "0"),
            "audio_integrity_score": vec.get("audio_integrity_score", "0"),
            "source_credibility_score": vec.get("source_credibility_score", "0"),
            "logical_consistency_score": vec.get("logical_consistency_score", "0"),
            "emotional_manipulation_score": vec.get("emotional_manipulation_score", "0"),
            "video_audio_score": mod.get("video_audio_score", "0"),
            "video_caption_score": mod.get("video_caption_score", "0"),
            "audio_caption_score": mod.get("audio_caption_score", "0"),
            "final_veracity_score": fin.get("veracity_score_total", "0"),
            "final_reasoning": fin.get("reasoning", ""),
            "tags": ", ".join(tags)
        }
        yield {"csv_row": row, "full_json": final_labels, "raw_toon": raw_toon}

    except Exception as e:
        # Terminal error value; callers check for the "error" key.
        yield {"error": str(e)}
|
| 347 |
+
|
| 348 |
+
@app.get("/queue/list")
async def get_queue_list():
    """Return every batch-queue entry with its processing status.

    Each item carries the link, its ingest timestamp, and "Processed" /
    "Pending" depending on whether the link already appears in the dataset.
    """
    queue_file = Path("data/batch_queue.csv")
    result = []

    for record in robust_read_csv(queue_file):
        if not record:
            continue
        url = record.get("link")
        if not url:
            continue
        result.append({
            "link": url,
            "timestamp": record.get("ingest_timestamp", ""),
            "status": "Processed" if check_if_processed(url) else "Pending",
        })
    return result
|
| 364 |
+
|
| 365 |
+
@app.delete("/queue/delete")
async def delete_queue_item(link: str):
    """Remove a single link from the batch queue CSV.

    The file is read fully into memory via the tolerant reader, the
    matching row dropped, and the remainder rewritten under the file's
    original header.
    """
    queue_path = Path("data/batch_queue.csv")
    if not queue_path.exists():
        return {"status": "error", "message": "Queue file not found"}

    try:
        # Capture the original header so the rewrite preserves the schema;
        # an empty file falls back to the default two-column header.
        with open(queue_path, 'r', encoding='utf-8', errors='replace') as f:
            first_row = next(csv.reader(f), None)
        header = first_row if first_row is not None else ["link", "ingest_timestamp"]

        kept = []
        removed_any = False
        for record in robust_read_csv(queue_path):
            if record.get("link") == link:
                removed_any = True
            else:
                kept.append(record)

        if not removed_any:
            return {"status": "not_found", "message": "Link not found in queue"}

        with open(queue_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=header)
            writer.writeheader()
            writer.writerows(kept)
        return {"status": "success", "link": link}

    except Exception as e:
        return {"status": "error", "message": str(e)}
|
| 403 |
+
|
| 404 |
+
@app.post("/queue/stop")
async def stop_queue_processing():
    """Signal the running batch job to stop after its current item.

    Sets the module-level STOP_QUEUE_SIGNAL flag, which queue_stream()
    (in /queue/run) checks before starting each queued link.
    """
    global STOP_QUEUE_SIGNAL
    STOP_QUEUE_SIGNAL = True
    return {"status": "stopping"}
|
| 409 |
+
|
| 410 |
+
@app.post("/queue/upload_csv")
async def upload_csv_to_queue(file: UploadFile = File(...)):
    """Ingest a CSV of video links into the batch queue.

    The link column is located via a "link"/"url" header (case-insensitive);
    if the first row itself contains a URL it is treated as data, meaning
    the file has no header. Links already present in the queue are skipped.

    Returns:
        {"status": "success", "added": <n>} on success, or a 400 JSON
        response on decode/parse failure.
    """
    try:
        content = await file.read()
        decoded = content.decode('utf-8').splitlines()
        reader = csv.reader(decoded)
        links_to_add = []
        header = next(reader, None)
        if not header: return {"status": "empty file"}

        # Locate the URL column; default to column 0.
        link_idx = 0
        header_lower = [h.lower() for h in header]
        if "link" in header_lower: link_idx = header_lower.index("link")
        elif "url" in header_lower: link_idx = header_lower.index("url")
        elif "http" in header[0]:
            # Headerless file: the first row is itself a link.
            links_to_add.append(header[0])
            link_idx = 0

        for row in reader:
            if len(row) > link_idx and row[link_idx].strip():
                links_to_add.append(row[link_idx].strip())

        queue_path = Path("data/batch_queue.csv")
        # Raw lines of the existing queue, used for substring dedup checks
        # (each line is "link,timestamp").
        existing_links = set()
        if queue_path.exists():
            with open(queue_path, 'r', encoding='utf-8', errors='replace') as f:
                existing_links = set(f.read().splitlines())

        added_count = 0
        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            # Opening in append mode creates the file, so only the size
            # check can detect a brand-new/empty queue needing a header
            # (the old `not queue_path.exists()` test was always False here).
            if queue_path.stat().st_size == 0:
                writer.writerow(["link", "ingest_timestamp"])

            for link in links_to_add:
                duplicate = False
                for line in existing_links:
                    if link in line:
                        duplicate = True
                        break
                if duplicate: continue

                writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
                # BUGFIX: remember what we just wrote so a link repeated
                # within the same uploaded CSV is only queued once.
                existing_links.add(link)
                added_count += 1

        return {"status": "success", "added": added_count}
    except Exception as e:
        return JSONResponse(status_code=400, content={"error": str(e), "status": "failed"})
|
| 458 |
+
|
| 459 |
+
@app.post("/queue/run")
async def run_queue_processing(
    model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    """Process every pending link in the batch queue, streaming progress
    as Server-Sent Events.

    For each successfully labeled video the handler persists, in order:
    the full JSON labels, the raw TOON output, the prompt, the raw model
    response, a Croissant metadata file, and finally one row appended to
    data/dataset.csv. Honors STOP_QUEUE_SIGNAL between items.
    """
    global STOP_QUEUE_SIGNAL
    # Reset the stop flag so a previous stop request doesn't abort this run.
    STOP_QUEUE_SIGNAL = False
    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}

    async def queue_stream():
        # SSE generator: every yield is a complete "data: ...\n\n" frame.
        queue_path = Path("data/batch_queue.csv")
        items = []
        for row in robust_read_csv(queue_path):
            l = row.get("link")
            if l: items.append(l)

        if not items:
            yield "data: Queue empty.\n\n"
            return

        processed_count = 0
        total = len(items)

        for i, link in enumerate(items):
            # Checked per-item so /queue/stop takes effect between videos.
            if STOP_QUEUE_SIGNAL:
                yield "data: [SYSTEM] Stopped by user.\n\n"
                break

            if check_if_processed(link):
                yield f"data: [SKIP] {link} processed.\n\n"
                continue

            yield f"data: [START] {i+1}/{total}: {link}\n\n"
            final_data = None
            # Relay progress strings; capture the single terminal result dict.
            async for res in get_labels_for_link(link, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
                if isinstance(res, str): yield f"data: {res}\n\n"
                if isinstance(res, dict) and "csv_row" in res: final_data = res

            if final_data:
                row = final_data["csv_row"]
                vid_id = row["id"]
                ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

                # 1. Save JSON
                json_path = f"data/labels/{vid_id}_{ts}_labels.json"
                with open(json_path, 'w') as f: json.dump(final_data["full_json"], f, indent=2)

                # 2. Save TOON
                with open(f"data/labels/{vid_id}_{ts}.toon", 'w') as f: f.write(final_data["raw_toon"])

                # 3. Save Prompt
                prompt_content = final_data.get("full_json", {}).get("meta_info", {}).get("prompt_used", "")
                if prompt_content:
                    with open(f"data/prompts/{vid_id}_{ts}_prompt.txt", 'w', encoding='utf-8') as f:
                        f.write(prompt_content)

                # 4. Save Raw Response (unversioned: overwritten per video id)
                raw_response = final_data.get("raw_toon", "")
                if raw_response:
                    with open(f"data/responses/{vid_id}.txt", 'w', encoding='utf-8') as f:
                        f.write(raw_response)

                row["metadatapath"] = await generate_and_save_croissant_metadata(row)
                row["json_path"] = json_path

                # 5. Save to CSV
                dpath = Path("data/dataset.csv")
                exists = dpath.exists()
                with open(dpath, 'a', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=list(row.keys()), extrasaction='ignore')
                    if not exists: writer.writeheader()
                    writer.writerow(row)

                processed_count += 1
                yield f"data: [SUCCESS] Labeled.\n\n"
            else:
                yield f"data: [FAIL] Failed to label.\n\n"

        yield f"data: Batch Complete. +{processed_count} videos labeled.\n\n"
        yield "event: close\ndata: Done\n\n"

    return StreamingResponse(queue_stream(), media_type="text/event-stream")
|
| 545 |
+
|
| 546 |
+
@app.post("/extension/ingest")
async def extension_ingest(request: Request):
    """Append a single link (sent by the browser extension) to the batch queue.

    Duplicate links — matched as substrings of the raw queue file — are
    acknowledged but not re-queued.

    Raises:
        HTTPException 400 when no link is supplied, 500 on I/O failure.
    """
    try:
        data = await request.json()
        link = data.get("link")
        if not link: raise HTTPException(status_code=400, detail="No link")
        queue_path = Path("data/batch_queue.csv")
        file_exists = queue_path.exists()

        if file_exists:
            with open(queue_path, 'r', encoding='utf-8', errors='replace') as f:
                if link in f.read():
                    return {"status": "queued", "msg": "Duplicate"}

        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not file_exists: writer.writerow(["link", "ingest_timestamp"])
            writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])

        return {"status": "queued", "link": link}
    except HTTPException:
        # BUGFIX: the generic handler below used to swallow the deliberate
        # 400 above and re-raise it as a 500; let it propagate unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 568 |
+
|
| 569 |
+
@app.post("/extension/save_comments")
async def extension_save_comments(request: Request):
    """Append comments captured by the browser extension to data/comments.csv.

    Accepts either a list of strings (legacy) or a list of
    {"author", "text"} objects; empty comment texts are skipped.

    Raises:
        HTTPException 400 when link/comments are missing, 500 on I/O failure.
    """
    try:
        data = await request.json()
        link = data.get("link")
        # Comments can be a list of strings (legacy) or objects (new)
        comments = data.get("comments", [])
        if not link or not comments: raise HTTPException(status_code=400, detail="Missing data")

        csv_path = Path("data/comments.csv")
        exists = csv_path.exists()

        # Fixed schema for comment storage; extra keys are ignored.
        fieldnames = ["link", "author", "comment_text", "timestamp"]

        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists: writer.writeheader()

            # One shared capture timestamp for the whole batch.
            ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for c in comments:
                row = {"link": link, "timestamp": ts}
                if isinstance(c, dict):
                    row["author"] = c.get("author", "Unknown")
                    row["comment_text"] = c.get("text", "").strip()
                else:
                    # Legacy string support
                    row["author"] = "Unknown"
                    row["comment_text"] = str(c).strip()

                if row["comment_text"]:
                    writer.writerow(row)

        # NOTE: count reports comments received, not rows written (blank
        # texts are dropped above).
        return {"status": "saved", "count": len(comments)}
    except HTTPException:
        # BUGFIX: don't let the generic handler below convert the
        # deliberate 400 above into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 605 |
+
|
| 606 |
+
@app.post("/extension/save_manual")
async def extension_save_manual(request: Request):
    """Persist a manual (human) label submitted by the browser extension.

    Upserts one row into data/manual_dataset.csv keyed by video id; when an
    AI label for the same video exists in data/dataset.csv, also upserts a
    per-dimension AI-vs-manual comparison row into data/comparison.csv.

    Raises:
        HTTPException 400 when no link is supplied, 500 on any other failure.
    """
    try:
        data = await request.json()
        link = data.get("link")
        labels = data.get("labels", {})
        stats = data.get("stats", {})
        tags = data.get("tags", "") # Accept tags string

        if not link: raise HTTPException(status_code=400, detail="No link")

        # Stable id: tweet id when extractable, else a link hash.
        video_id = extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:16]

        # Ensure manual dataset exists
        ensure_manual_dataset()

        # 1. Build row data for Manual Dataset
        row_data = {
            "id": video_id,
            "link": link,
            "caption": data.get("caption", ""),
            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "source": "manual_extension",

            # Vectors
            "visual_integrity_score": labels.get("visual_integrity_score", 0),
            "audio_integrity_score": labels.get("audio_integrity_score", 0),
            "source_credibility_score": labels.get("source_credibility_score", 0),
            "logical_consistency_score": labels.get("logical_consistency_score", 0),
            "emotional_manipulation_score": labels.get("emotional_manipulation_score", 0),

            # Modalities
            "video_audio_score": labels.get("video_audio_score", 0),
            "video_caption_score": labels.get("video_caption_score", 0),
            "audio_caption_score": labels.get("audio_caption_score", 0),

            "final_veracity_score": labels.get("final_veracity_score", 0),
            "final_reasoning": labels.get("reasoning", ""),

            # New Stats & Tags
            "stats_likes": stats.get("likes", 0),
            "stats_shares": stats.get("shares", 0),
            "stats_comments": stats.get("comments", 0),
            "stats_platform": stats.get("platform", "unknown"),
            "tags": tags
        }

        # Save to manual_dataset.csv using Upsert logic (Clean Dataset)
        dpath = Path("data/manual_dataset.csv")
        rows = []
        replaced = False

        # Read existing
        if dpath.exists():
            rows = list(robust_read_csv(dpath))

        new_rows = []
        for r in rows:
            if r.get('id') == video_id:
                new_rows.append(row_data) # Replace
                replaced = True
            else:
                new_rows.append(r)

        if not replaced:
            new_rows.append(row_data)

        # Write back all
        with open(dpath, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(row_data.keys()), extrasaction='ignore')
            writer.writeheader()
            writer.writerows(new_rows)

        # 2. PERFORM COMPARISON AGAINST AI DATA
        ai_path = Path("data/dataset.csv")
        ai_data = None
        if ai_path.exists():
            for row in robust_read_csv(ai_path):
                # Find by link or ID (Normalize first)
                r_link = normalize_link(row.get('link', ''))
                t_link = normalize_link(link)

                if r_link == t_link or row.get('id') == video_id:
                    ai_data = row
                    break

        if ai_data:
            # Calculate Differences (AI - Manual)
            comp_path = Path("data/comparison.csv")
            comp_exists = comp_path.exists()

            # Helper to extract int safely
            def get_int(d, k):
                try:
                    # sanitize weird strings like "(9)"
                    val = str(d.get(k, 0))
                    val = re.sub(r'[^\d]', '', val)
                    return int(val) if val else 0
                except: return 0

            # NOTE(review): manual scores below come straight from the JSON
            # payload — presumably numeric; the deltas assume that holds.
            comparison_row = {
                "id": video_id,
                "link": link,
                "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),

                # Visual
                "ai_visual": get_int(ai_data, "visual_integrity_score"),
                "manual_visual": row_data["visual_integrity_score"],
                "delta_visual": get_int(ai_data, "visual_integrity_score") - row_data["visual_integrity_score"],

                # Audio
                "ai_audio": get_int(ai_data, "audio_integrity_score"),
                "manual_audio": row_data["audio_integrity_score"],
                "delta_audio": get_int(ai_data, "audio_integrity_score") - row_data["audio_integrity_score"],

                # Source
                "ai_source": get_int(ai_data, "source_credibility_score"),
                "manual_source": row_data["source_credibility_score"],
                "delta_source": get_int(ai_data, "source_credibility_score") - row_data["source_credibility_score"],

                # Logic
                "ai_logic": get_int(ai_data, "logical_consistency_score"),
                "manual_logic": row_data["logical_consistency_score"],
                "delta_logic": get_int(ai_data, "logical_consistency_score") - row_data["logical_consistency_score"],

                # Final
                "ai_final": get_int(ai_data, "final_veracity_score"),
                "manual_final": row_data["final_veracity_score"],
                "delta_final": get_int(ai_data, "final_veracity_score") - row_data["final_veracity_score"]
            }

            # Upsert into comparison.csv as well
            comp_rows = []
            if comp_exists:
                comp_rows = list(robust_read_csv(comp_path))

            final_comp_rows = []
            comp_replaced = False
            for cr in comp_rows:
                if cr.get('id') == video_id:
                    final_comp_rows.append(comparison_row)
                    comp_replaced = True
                else:
                    final_comp_rows.append(cr)
            if not comp_replaced:
                final_comp_rows.append(comparison_row)

            with open(comp_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=list(comparison_row.keys()), extrasaction='ignore')
                writer.writeheader()
                writer.writerows(final_comp_rows)

        return {"status": "saved", "compared": True if ai_data else False}
    except HTTPException:
        # BUGFIX: previously the generic handler below caught the 400
        # "No link" error and re-raised it as a 500; propagate as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 761 |
+
|
| 762 |
+
@app.get("/workflow/status")
async def get_workflow_status():
    """
    Returns a list of all known links (from queue and AI dataset),
    indicating whether they have been Manually Labeled.
    Matches primarily on ID (Tweet ID or Hash), falling back to Link.
    """
    # Merged view keyed by canonical id; later passes enrich/override
    # entries created by earlier ones (queue -> AI labels -> manual labels).
    all_links = {}

    def get_canonical_key(link, rid=None):
        # 1. Try Tweet ID extraction
        tid = extract_tweet_id(link)
        if tid: return tid
        # 2. Use existing ID if valid
        if rid and str(rid).strip(): return str(rid).strip()
        # 3. Fallback to normalized link
        return normalize_link(link)

    # 1. Load Queue (Raw)
    qp = Path("data/batch_queue.csv")
    for row in robust_read_csv(qp):
        url = row.get("link", "").strip()
        if url:
            # Queue doesn't usually have ID, so we rely on extraction or link
            key = get_canonical_key(url)
            all_links[key] = {
                "link": url, "source": "queue",
                "ai_status": "Pending", "manual_status": "Pending",
                "ai_data": None
            }

    # 2. Load AI Labels
    dp = Path("data/dataset.csv")
    for row in robust_read_csv(dp):
        url = row.get("link", "").strip()
        rid = row.get("id", "").strip()

        # If we have data, we definitely have a key
        key = get_canonical_key(url, rid)

        if key not in all_links:
            all_links[key] = {"link": url, "source": "dataset", "manual_status": "Pending"}

        all_links[key]["ai_status"] = "Labeled"
        all_links[key]["ai_data"] = {
            "visual": row.get("visual_integrity_score"),
            "final": row.get("final_veracity_score"),
            "reasoning": row.get("final_reasoning"),
            "tags": row.get("tags", "")
        }

    # 3. Load Manual Labels
    mp = Path("data/manual_dataset.csv")
    for row in robust_read_csv(mp):
        url = row.get("link", "").strip()
        rid = row.get("id", "").strip()

        key = get_canonical_key(url, rid)

        if key in all_links:
            all_links[key]["manual_status"] = "Completed"
            all_links[key]["manual_tags"] = row.get("tags", "")
        else:
            # In case manual label exists without queue/AI entry
            all_links[key] = {
                "link": url, "source": "manual_only",
                "ai_status": "Unknown", "manual_status": "Completed",
                "manual_tags": row.get("tags", "")
            }

    return list(all_links.values())
|
| 833 |
+
|
| 834 |
+
@app.get("/manage/list")
async def list_data():
    """Return all labeled rows (AI + manual), newest first.

    AI ("auto") rows additionally carry a manual_verification_status of
    "Verified" when a matching manual label exists, else "Need Manual".
    Rows reference their full JSON labels inline via json_data when the
    file on disk is readable.
    """
    data = []

    # 1. Build Index of Manual Labels to check "Need Manual" status
    # (indexed by both normalized link and raw id so either can match)
    manual_index = set()
    mp = Path("data/manual_dataset.csv")
    for row in robust_read_csv(mp):
        if row.get('link'): manual_index.add(normalize_link(row['link']))
        if row.get('id'): manual_index.add(row['id'].strip())

    def read_csv(path, source_type):
        # Appends each row of `path` into the shared `data` list, tagged
        # with its source_type and enriched as described above.
        for row in robust_read_csv(path):
            # Backfill a stable id for legacy rows that lack one.
            if not row.get('id') or row['id'].strip() == "":
                link = row.get('link', '')
                tid = extract_tweet_id(link)
                row['id'] = tid if tid else hashlib.md5(link.encode()).hexdigest()[:16]

            # Inline the full JSON labels when the referenced file exists;
            # unreadable files are silently treated as missing.
            json_content = None
            if row.get('json_path') and os.path.exists(row['json_path']):
                try:
                    with open(row['json_path'], 'r') as jf: json_content = json.load(jf)
                except: pass

            row['source_type'] = source_type
            row['json_data'] = json_content

            # NEW: Verification Status for AI rows
            if source_type == "auto":
                lid = row.get('id')
                llink = normalize_link(row.get('link', ''))
                if lid in manual_index or llink in manual_index:
                    row['manual_verification_status'] = "Verified"
                else:
                    row['manual_verification_status'] = "Need Manual"

            data.append(row)

    read_csv(Path("data/dataset.csv"), "auto")
    read_csv(Path("data/manual_dataset.csv"), "manual")
    # Newest first; rows without collecttime sort last.
    data.sort(key=lambda x: x.get('collecttime', ''), reverse=True)
    return data
|
| 876 |
+
|
| 877 |
+
@app.get("/manage/comparison_data")
async def get_comparison_data():
    """
    Returns an aggregated dataset joining AI Labels vs Manual Labels for visualization.

    AI rows are indexed by id (falling back to the normalized link); every
    manual row with a matching AI row yields one item holding per-dimension
    scores and their deltas (AI minus Manual).
    """
    ai_data = {}
    # Index AI rows by id, falling back to the normalized link.
    for row in robust_read_csv(Path("data/dataset.csv")):
        key = row.get("id")
        if not key: key = normalize_link(row.get("link"))
        ai_data[key] = row

    comparisons = []

    # Join manual rows against the AI index.
    for manual in robust_read_csv(Path("data/manual_dataset.csv")):
        key = manual.get("id")
        if not key: key = normalize_link(manual.get("link"))

        if key in ai_data:
            ai = ai_data[key]

            # Consistency fix: delegate to the module-level safe_int helper
            # instead of duplicating its digit-stripping logic inline.
            def get_score(d, k):
                return safe_int(d.get(k, 0))

            item = {
                "id": key,
                "link": manual.get("link"),
                "scores": {
                    "visual": {"ai": get_score(ai, "visual_integrity_score"), "manual": get_score(manual, "visual_integrity_score")},
                    "audio": {"ai": get_score(ai, "audio_integrity_score"), "manual": get_score(manual, "audio_integrity_score")},
                    "final": {"ai": get_score(ai, "final_veracity_score"), "manual": get_score(manual, "final_veracity_score")}
                }
            }
            # Calculate Delta (AI - Manual): positive means AI scored higher.
            item["deltas"] = {
                "visual": item["scores"]["visual"]["ai"] - item["scores"]["visual"]["manual"],
                "audio": item["scores"]["audio"]["ai"] - item["scores"]["audio"]["manual"],
                "final": item["scores"]["final"]["ai"] - item["scores"]["final"]["manual"]
            }
            comparisons.append(item)

    return comparisons
|
| 925 |
+
|
| 926 |
+
@app.delete("/manage/delete")
async def delete_data(id: str = "", link: str = ""):
    """Delete a labeled video, matched by id or normalized link, from both
    the AI and manual dataset CSVs, and remove its label/metadata files.

    Returns {"status": "deleted", "count": <rows removed across files>}.
    Raises HTTPException 400 when neither id nor link is given.
    """
    if not id and not link: raise HTTPException(status_code=400, detail="Must provide ID or Link")
    deleted_count = 0
    # Remembered so label/metadata files can be cleaned up even when the
    # caller only supplied a link.
    target_id = id

    def remove_from_csv(path):
        nonlocal deleted_count, target_id
        if not path.exists(): return

        # We need to rewrite, so we read all then write back
        rows = list(robust_read_csv(path))

        # We need headers for DictWriter, infer from first row or file check
        fieldnames = []
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames

        new_rows = []
        found_in_file = False
        for row in rows:
            is_match = False
            if id and row.get('id') == id: is_match = True
            elif link and normalize_link(row.get('link', '')) == normalize_link(link): is_match = True

            if is_match:
                found_in_file = True
                deleted_count += 1
                if not target_id: target_id = row.get('id')
            else:
                new_rows.append(row)

        # Only rewrite when something was actually removed.
        if found_in_file and fieldnames:
            with open(path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(new_rows)

    remove_from_csv(Path("data/dataset.csv"))
    remove_from_csv(Path("data/manual_dataset.csv"))
    # Clean up per-video artifacts (labels, TOON, Croissant metadata).
    if target_id:
        for p in Path("data/labels").glob(f"{target_id}_*"): p.unlink(missing_ok=True)
        for p in Path("metadata").glob(f"{target_id}_*"): p.unlink(missing_ok=True)
    return {"status": "deleted", "count": deleted_count}
|
| 971 |
+
|
| 972 |
+
@app.post("/label_video")
async def label_video_endpoint(
    video_url: str = Form(...), model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    """Label a single video URL, streaming progress as Server-Sent Events."""
    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}

    async def event_source():
        # Relay progress strings; emit a completion marker once the
        # terminal result dict arrives.
        async for update in get_labels_for_link(video_url, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
            if isinstance(update, str):
                yield f"data: {update}\n\n"
            elif isinstance(update, dict) and "csv_row" in update:
                yield "data: Done. Labels generated.\n\n"
        yield "event: close\ndata: Done.\n\n"

    return StreamingResponse(event_source(), media_type="text/event-stream")
|
src/croissant_transformer.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import datetime
|
| 5 |
+
|
| 6 |
+
# input dataframe row
|
| 7 |
+
# output croissant json
|
| 8 |
+
def Croissant_JSON_formating(row):
    """Build an MLCommons Croissant JSON-LD document for one tweet row.

    Args:
        row: Mapping (e.g. a pandas Series) with keys 'twitterlink', 'id',
            'caption', 'likes', 'shares', 'postdatetime', 'collecttime'
            and 'videotranscriptionpath'.

    Returns:
        dict: A Croissant 1.0 "sc:Dataset" structure describing the tweet.
    """
    # The tweet's status id is the last path segment of its URL.
    status_id = row['twitterlink'].rsplit('/', 1)[-1]

    # NaN transcript paths (missing values in the CSV) become empty strings.
    transcript_path = row['videotranscriptionpath'] if pd.notna(row['videotranscriptionpath']) else ""

    def make_field(name, description, data_type, value):
        # One entry of the record set's "field" array.
        return {
            "@type": "cr:Field",
            "name": name,
            "description": description,
            "data_type": data_type,
            "sc:value": value,
        }

    record_fields = [
        make_field("internal_id", "Internal record ID from the source CSV.", "sc:Integer", int(row['id'])),
        make_field("tweet_text", "The full text/caption of the tweet.", "sc:Text", row['caption']),
        make_field("likes_count", "Number of likes/favorites.", "sc:Integer", int(row['likes'])),
        make_field("shares_count", "Number of shares/reposts.", "sc:Integer", int(row['shares'])),
        make_field("post_timestamp", "When the tweet was posted (ISO 8601).", "sc:DateTime", row['postdatetime']),
        make_field("collection_timestamp", "When this data was collected (ISO 8601).", "sc:DateTime", row['collecttime']),
        make_field("video_transcript_path", "Path to the video transcription file.", "sc:Text", transcript_path),
    ]

    return {
        "@context": {
            "@language": "en",
            "@vocab": "https://schema.org/",
            "cr": "http://mlcommons.org/croissant/1.0",
            "sc": "https://schema.org/",
        },
        "@type": "sc:Dataset",
        "name": f"Tweet-ID-{status_id}",
        "description": f"A single social media video post(Tweet ID: {status_id}) by the user, structured as a Croissant Dataset.",
        "url": row['twitterlink'],
        "conformsTo": "http://mlcommons.org/croissant/1.0",
        "datePublished": row['postdatetime'],
        "creator": {
            "@type": "sc:Organization",
            "name": "Social Media Manually Collect Source",
        },
        "distribution": [
            {
                "@type": "cr:FileObject",
                "name": f"source_tweet_url_{status_id}",
                "encodingFormat": "text/html",
                "contentUrl": row['twitterlink'],
            }
        ],
        "recordSet": [
            {
                "@type": "cr:RecordSet",
                "name": "tweet_data_record",
                "description": "The specific data fields for this tweet.",
                "field": record_fields,
            }
        ],
    }
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# input twitterVideo.csv file path
|
| 102 |
+
# output folder "croissant_tweet_metadata" in current directory
|
| 103 |
+
# for every row in csv file
|
| 104 |
+
def croissant_transform(file_path, output_dir="croissant_tweet_metadata"):
    """Convert each row of a tweet CSV into a Croissant JSON-LD file.

    Args:
        file_path: Path to the source CSV (e.g. twitterVideo.csv).
        output_dir: Directory the JSON-LD files are written to; created if
            missing. Defaults to "croissant_tweet_metadata" in the CWD
            (matching the original hard-coded behavior).
    """
    os.makedirs(output_dir, exist_ok=True)

    # read csv
    df = pd.read_csv(file_path)

    # Only the first 9 columns carry useful information (hard-coded schema).
    df = df.iloc[:, 0:9]

    for index, row in df.iterrows():
        croissant_json = Croissant_JSON_formating(row)

        # BUG FIX: the original reused the name `file_path` for the output
        # file, shadowing the input parameter inside the loop.
        out_path = os.path.join(output_dir, f"tweet_metadata_{index + 1}.json")
        with open(out_path, 'w') as f:
            json.dump(croissant_json, f, indent=2)
        print(f"{out_path} generated")

    print("Conversion Finished")
    return
|
src/factuality_logic.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# factuality_logic.py
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import asyncio
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import inference_logic
|
| 9 |
+
from toon_parser import parse_toon_line
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
# --- Enhanced TOON Prompts for Individual Checks ---
|
| 14 |
+
# Using TOON reduces output tokens significantly compared to JSON.
|
| 15 |
+
|
| 16 |
+
# Prompt: visual-manipulation check (deepfakes, editing anomalies). The model
# reasons inside <thinking> and must answer in TOON with a 1-10 score plus a
# justification, which run_factuality_pipeline parses with a regex.
PROMPT_VISUAL_ARTIFACTS = (
    "Analyze the video for visual manipulation (Deepfakes, editing anomalies).\n"
    "Steps inside <thinking>: 1. Scan for artifacts. 2. Check cuts.\n"
    "Output TOON format:\n"
    "visual_analysis: result[2]{score,justification}:\n"
    "Score(1-10),\"Justification text\""
)

# Prompt: content accuracy/logic check; "{transcript}" is filled via .format().
PROMPT_CONTENT_ANALYSIS = (
    "Analyze the content for accuracy and logic.\n"
    "Steps inside <thinking>: 1. Identify claims. 2. Check fallacies. 3. Assess emotion.\n"
    "**Transcript:**\n{transcript}\n"
    "Output TOON format:\n"
    "content_analysis: result[2]{score,justification}:\n"
    "Score(1-10),\"Justification text\""
)

# Prompt: audio synthesis/manipulation check; "{transcript}" is filled via .format().
PROMPT_AUDIO_ANALYSIS = (
    "Analyze audio for synthesis or manipulation.\n"
    "Steps inside <thinking>: 1. Listen for robotic inflections. 2. Check lip-sync.\n"
    "**Transcript:**\n{transcript}\n"
    "Output TOON format:\n"
    "audio_analysis: result[2]{score,justification}:\n"
    "Score(1-10),\"Justification text\""
)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def parse_vtt(file_path: str) -> str:
    """Extract the plain dialogue text from a WebVTT subtitle file.

    Skips the WEBVTT header, cue numbers and timestamp lines, strips inline
    markup tags, and drops consecutive duplicate lines.
    """
    try:
        if not os.path.exists(file_path):
            return "Transcript file not found."

        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()

        collected = []
        for raw in raw_lines:
            candidate = raw.strip()
            # Drop blanks, the file header, cue indices and timing lines.
            if not candidate or candidate.startswith('WEBVTT'):
                continue
            if '-->' in candidate or candidate.isdigit():
                continue
            cleaned = re.sub(r'<[^>]+>', '', candidate)
            # Keep the line unless it repeats the previous kept line verbatim.
            if cleaned and (not collected or cleaned != collected[-1]):
                collected.append(cleaned)

        if collected:
            return "\n".join(collected)
        return "No speech found in transcript."
    except Exception as e:
        logger.error(f"Error parsing VTT file {file_path}: {e}")
        return f"Error reading transcript: {e}"
|
| 64 |
+
|
| 65 |
+
async def run_factuality_pipeline(paths: dict, checks: dict, generation_config: dict):
    """
    Asynchronously runs a pipeline of factuality checks, parses TOON scores, and yields results.

    Args:
        paths: Expects "video" (required) and optionally "transcript" (.vtt file path).
        checks: Boolean flags "visuals", "content", "audio" selecting which analyses run.
        generation_config: Generation kwargs for the local model; copied per step so the
            caller's dict is not mutated. "sampling_fps" / "num_perceptions" are popped
            from the copy before generation.

    Yields:
        str: Human-readable progress lines and structured analysis results, suitable
        for streaming to a client.
    """
    video_path = paths.get("video")
    transcript_path = paths.get("transcript")

    if not video_path:
        yield "ERROR: Video path not found. Cannot start analysis.\n\n"
        return

    yield "Step 1: Processing Transcript...\n"
    await asyncio.sleep(0.1)  # let the event loop flush the stream to the client
    transcript = "No transcript was downloaded for this video."
    if transcript_path and os.path.exists(transcript_path):
        transcript = parse_vtt(transcript_path)
        yield f" - Transcript file found and processed.\n"
    else:
        yield f" - No transcript file was found.\n"

    yield f"\n--- Extracted Transcript ---\n{transcript}\n--------------------------\n\n"
    await asyncio.sleep(0.1)

    # Assemble (title, prompt) pairs for the enabled checks only.
    analysis_steps = []
    if checks.get("visuals"):
        analysis_steps.append(("Visual Integrity", PROMPT_VISUAL_ARTIFACTS))
    if checks.get("content"):
        analysis_steps.append(("Content Veracity", PROMPT_CONTENT_ANALYSIS.format(transcript=transcript)))
    if checks.get("audio"):
        analysis_steps.append(("Audio Forensics", PROMPT_AUDIO_ANALYSIS.format(transcript=transcript)))

    for i, (title, prompt) in enumerate(analysis_steps):
        yield f"--- Step {i + 2}: Running '{title}' Analysis ---\n"
        yield "(Model is generating TOON analysis with scores...)\n\n"
        await asyncio.sleep(0.1)

        try:
            # Work on a copy so keys meant for the pipeline, not the model,
            # can be stripped without affecting the caller.
            current_gen_config = generation_config.copy()
            sampling_fps = current_gen_config.pop("sampling_fps", 2.0)
            current_gen_config.pop("num_perceptions", None)

            # FORCE LOW TEMP for structured TOON analysis
            current_gen_config["temperature"] = 0.1
            current_gen_config["do_sample"] = True

            ans = inference_logic.inference_step(
                video_path=video_path,
                prompt=prompt,
                generation_kwargs=current_gen_config,
                sampling_fps=sampling_fps,
                pred_glue=None
            )

            yield f" - Analysis Complete for '{title}'. Parsing TOON...\n\n"

            # --- Attempt to parse TOON from the model's response ---
            parsed_result = {}
            # Regex to find the TOON data line: key: type[count]{headers}:\nVALUE
            match = re.search(r'(\w+_analysis): result\[2\]\{score,justification\}:\s*\n(.+)', ans, re.MULTILINE)

            # The <thinking> block (if present) is surfaced as the reasoning trace.
            thinking = "No thinking block found."
            think_match = re.search(r'<thinking>(.*?)</thinking>', ans, re.DOTALL)
            if think_match:
                thinking = think_match.group(1).strip()

            if match:
                key, value_line = match.groups()
                parsed_result = parse_toon_line({'key': key, 'headers': ['score', 'justification']}, value_line.strip())
            else:
                # Unparseable output is surfaced raw and the step is skipped.
                logger.warning(f"Could not parse TOON for '{title}'. Raw: {ans}")
                yield f"Warning: Model did not return valid TOON. Raw output:\n{ans}\n"
                continue

            # --- Display the parsed, structured result ---
            score = parsed_result.get('score', 'N/A')
            justification = parsed_result.get('justification', 'No justification provided.')

            yield f"===== ANALYSIS RESULT: {title.upper()} =====\n"
            yield f"SCORE: {score}/10\n"
            yield f"Reasoning (Step-by-Step): {thinking}\n"
            yield f"Final Justification: {justification}\n\n"
            yield f"========================================\n\n"

        except Exception as e:
            # Any model/parsing failure aborts the remaining steps.
            error_message = f"An error occurred during the '{title}' analysis step: {e}"
            logger.error(error_message, exc_info=True)
            yield f"ERROR: {error_message}\n\n"
            break

    yield "Factuality Analysis Pipeline Finished.\n"
|
src/finetune.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import os
|
| 3 |
+
import logging
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
| 8 |
+
from transformers import (
|
| 9 |
+
AutoProcessor,
|
| 10 |
+
QWen3VLForConditionalGeneration,
|
| 11 |
+
TrainingArguments,
|
| 12 |
+
BitsAndBytesConfig,
|
| 13 |
+
)
|
| 14 |
+
from trl import SFTTrainer
|
| 15 |
+
from my_vision_process import process_vision_info
|
| 16 |
+
|
| 17 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# Configuration
MODEL_ID = "OpenGVLab/VideoChat-R1_5"
# This path assumes the script runs from /app via "python src/finetune.py"
DATASET_PATH = "./data/insertlocaldataset.jsonl"
# Where checkpoints and the final LoRA adapters are written.
OUTPUT_DIR = "./lora_adapters"

# LoRA Configuration
LORA_RANK = 16       # adapter rank r (see LoraConfig below)
LORA_ALPHA = 32      # LoRA scaling factor
LORA_DROPOUT = 0.05  # dropout applied inside the adapters
# Attention + MLP projection layers that receive LoRA adapters.
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class MultiModalDataCollator:
    """A custom data collator to handle multimodal inputs (video + text) for the SFTTrainer."""
    processor: AutoProcessor

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        if len(features) != 1:
            logger.warning(f"Data collator expected batch size 1, but got {len(features)}. Processing first item only.")

        example = features[0]
        video_path = example.get("video_path")
        text_prompt = example.get("text")

        if not video_path or not text_prompt:
            raise ValueError("Dataset example missing 'video_path' or 'text' field.")

        # Chat-format message with an empty text slot; the real prompt is
        # appended after templating.
        chat_messages = [{"role": "user", "content": [{"type": "video", "video": video_path}, {"type": "text", "text": ""}]}]

        templated = self.processor.apply_chat_template(chat_messages, tokenize=False, add_generation_prompt=False)
        combined_text = templated + text_prompt

        image_inputs, video_inputs, _ = process_vision_info(chat_messages, return_video_kwargs=True)

        batch = self.processor(
            text=[combined_text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        )

        # Causal-LM objective: labels mirror the input ids.
        batch["labels"] = batch["input_ids"].clone()
        return batch
|
| 68 |
+
|
| 69 |
+
def print_trainable_parameters(model):
    """Log the trainable vs. total parameter counts of *model*.

    Args:
        model: A torch ``nn.Module`` (e.g. the PEFT-wrapped model).
    """
    trainable_params, all_param = 0, 0
    # Idiom fix: the original iterated named_parameters() and discarded the
    # names; parameters() gives the same tensors directly.
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard against a (degenerate) parameter-free model to avoid ZeroDivisionError.
    pct = 100 * trainable_params / all_param if all_param else 0.0
    logger.info(
        f"Trainable params: {trainable_params:,} || All params: {all_param:,} || "
        f"Trainable %: {pct:.2f}"
    )
|
| 80 |
+
|
| 81 |
+
def main():
    """Fine-tune the base model with 4-bit QLoRA on a local JSONL dataset."""
    if not os.path.exists(DATASET_PATH):
        logger.error(f"ERROR: Dataset not found at '{DATASET_PATH}'")
        logger.error("Please create a JSONL file with your training data. See README.md for the format.")
        return

    logger.info(f"Loading base model and processor from {MODEL_ID}...")
    # NOTE(review): `processor` is only needed by the (commented-out)
    # MultiModalDataCollator below — currently loaded but unused.
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    # 4-bit NF4 quantization so the model fits in limited GPU memory.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # NOTE(review): the imported class is spelled 'QWen3VLForConditionalGeneration'
    # here, while inference_logic.py uses 'Qwen3VLForConditionalGeneration' —
    # verify the capitalization against the installed transformers version.
    model = QWen3VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        quantization_config=quantization_config,
        device_map="auto",
    )
    model = prepare_model_for_kbit_training(model)

    logger.info("configuring LoRA...")
    lora_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        target_modules=TARGET_MODULES,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    print_trainable_parameters(model)

    logger.info(f"Loading dataset from {DATASET_PATH}...")
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        logging_steps=5,
        save_strategy="steps",
        save_steps=50,
        save_total_limit=3,
        optim="paged_adamw_8bit",  # 8-bit paged AdamW to cut optimizer memory
        report_to="none",
        bf16=True,
    )

    # init trainer
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=2048,

        # data_collator=MultiModalDataCollator(processor=processor), # Uncomment if needed
    )

    logger.info("fine-tuning stage")

    # Resume from the most recent checkpoint-N directory, if any exists.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir):
        potential_dirs = [os.path.join(training_args.output_dir, d) for d in os.listdir(training_args.output_dir)]
        checkpoints = [d for d in potential_dirs if os.path.isdir(d) and os.path.basename(d).startswith("checkpoint-")]
        if checkpoints:
            # Sort numerically by the step suffix so "checkpoint-100" > "checkpoint-50".
            checkpoints.sort(key=lambda x: int(os.path.basename(x).split('-')[-1]))
            last_checkpoint = checkpoints[-1]
            logger.info(f"Resuming training from checkpoint: {last_checkpoint}")

    trainer.train(resume_from_checkpoint=last_checkpoint)
    logger.info("Training complete.")

    fin_path = os.path.join(OUTPUT_DIR, "final_checkpoint")
    trainer.save_model(fin_path)
    logger.info(f"final LoRA adapters saved to {fin_path}")

if __name__ == "__main__":
    main()
|
src/inference_logic.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import re
|
| 3 |
+
import ast
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
import logging
|
| 7 |
+
import asyncio
|
| 8 |
+
import json
|
| 9 |
+
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 10 |
+
from peft import PeftModel
|
| 11 |
+
from my_vision_process import process_vision_info, client
|
| 12 |
+
from labeling_logic import (
|
| 13 |
+
LABELING_PROMPT_TEMPLATE, SCORE_INSTRUCTIONS_SIMPLE, SCORE_INSTRUCTIONS_REASONING,
|
| 14 |
+
SCHEMA_SIMPLE, SCHEMA_REASONING,
|
| 15 |
+
FCOT_MACRO_PROMPT, FCOT_MESO_PROMPT, FCOT_SYNTHESIS_PROMPT
|
| 16 |
+
)
|
| 17 |
+
from toon_parser import parse_veracity_toon
|
| 18 |
+
from agents import get_video_analysis_system
|
| 19 |
+
|
| 20 |
+
# Google GenAI Imports
|
| 21 |
+
try:
|
| 22 |
+
import google.generativeai as genai_legacy
|
| 23 |
+
from google.generativeai.types import generation_types
|
| 24 |
+
except ImportError:
|
| 25 |
+
genai_legacy = None
|
| 26 |
+
|
| 27 |
+
try:
|
| 28 |
+
# Modern Google GenAI SDK (v1)
|
| 29 |
+
from google import genai
|
| 30 |
+
from google.genai.types import (
|
| 31 |
+
GenerateContentConfig,
|
| 32 |
+
HttpOptions,
|
| 33 |
+
Retrieval,
|
| 34 |
+
Tool,
|
| 35 |
+
VertexAISearch,
|
| 36 |
+
GoogleSearch,
|
| 37 |
+
Part
|
| 38 |
+
)
|
| 39 |
+
import vertexai
|
| 40 |
+
except ImportError:
|
| 41 |
+
genai = None
|
| 42 |
+
vertexai = None
|
| 43 |
+
|
| 44 |
+
# When true, skip loading the local GPU model entirely (API-backed operation only).
LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"
# Populated lazily by load_models(); all remain None until then.
processor = None
base_model = None
peft_model = None
# Points at whichever model (base or PEFT adapter) inference should use.
active_model = None
logger = logging.getLogger(__name__)
|
| 50 |
+
|
| 51 |
+
def load_models():
    """Load the local Qwen3-VL model and processor into module globals.

    No-op when LITE_MODE is set or the model is already loaded. If CUDA is
    unavailable or loading fails, flips LITE_MODE on instead of raising.
    """
    # FIX: Declare globals at the start
    global LITE_MODE, processor, base_model, peft_model, active_model

    if LITE_MODE:
        logger.info("LITE_MODE is enabled. Skipping local model loading.")
        return

    if base_model is not None: return  # already initialized; idempotent

    if not torch.cuda.is_available():
        # In the root container, assume GPU is needed unless forcing LITE mode logic was desired.
        # However, to match the fix:
        logger.warning("CUDA is not available. This application requires a GPU for local models. Switching to LITE_MODE.")
        LITE_MODE = True
        return

    device = torch.device("cuda")
    logger.info(f"CUDA is available. Initializing models on {device}...")
    local_model_path = "/app/local_model"

    # Prefer FlashAttention 2 when the package is installed; otherwise fall
    # back to PyTorch's scaled-dot-product attention implementation.
    try:
        import flash_attn
        attn_implementation = "flash_attention_2"
    except ImportError:
        attn_implementation = "sdpa"

    logger.info(f"Loading base model from {local_model_path}...")
    try:
        base_model = Qwen3VLForConditionalGeneration.from_pretrained(
            local_model_path, dtype=torch.bfloat16, device_map="auto", attn_implementation=attn_implementation
        ).eval()
        processor = AutoProcessor.from_pretrained(local_model_path)
        active_model = base_model
    except Exception as e:
        # Degrade gracefully to API-only mode rather than crashing the app.
        logger.error(f"Failed to load local model: {e}")
        LITE_MODE = True
|
| 88 |
+
|
| 89 |
+
def switch_active_model(model_name: str):
    """Point `active_model` at the PEFT adapter ("custom", if loaded) or the base model."""
    global active_model, base_model, peft_model
    use_custom = model_name == "custom" and peft_model is not None
    active_model = peft_model if use_custom else base_model
|
| 95 |
+
|
| 96 |
+
def inference_step(video_path, prompt, generation_kwargs, sampling_fps, pred_glue=None):
    """Run one generation pass of the active local model over a video.

    Args:
        video_path: Path (or URI) of the video to analyze.
        prompt: User text prompt appended after the video content.
        generation_kwargs: Extra kwargs forwarded to `model.generate`.
        sampling_fps: Frame-sampling rate passed to the vision processor.
        pred_glue: Optional key-time hints from a previous perception pass.

    Returns:
        str: The decoded model response (prompt tokens stripped).

    Raises:
        RuntimeError: If load_models() has not populated the globals yet.
    """
    global processor, active_model
    if active_model is None: raise RuntimeError("Models not loaded.")

    messages = [
        {"role": "user", "content": [
            # total_pixels / min_pixels bound the visual token budget per video.
            {"type": "video", "video": video_path, 'key_time': pred_glue, 'fps': sampling_fps,
             "total_pixels": 128*12 * 28 * 28, "min_pixels": 128 * 28 * 28},
            {"type": "text", "text": prompt},
        ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True, client=client)
    fps_inputs = video_kwargs['fps'][0]
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    # Move all tensors onto the same device as the model.
    inputs = {k: v.to(active_model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = active_model.generate(**inputs, **generation_kwargs, use_cache=True)

    # Strip the echoed prompt tokens, keeping only the newly generated ones.
    generated_ids = [output_ids[i][len(inputs['input_ids'][i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return output_text[0]
|
| 120 |
+
|
| 121 |
+
async def run_inference_pipeline(video_path, question, generation_config, prompts):
    """Iteratively query the local model, refining key-time "glue" hints.

    Runs `num_perceptions` passes: every pass but the last uses the "glue"
    prompt to elicit key timestamps (fed back into the next pass via
    `pred_glue`); the final pass uses the "final" prompt to answer.

    Args:
        video_path: Video file to analyze.
        question: User question substituted for "[QUESTION]" in the prompts.
        generation_config: Generation kwargs; "num_perceptions" and
            "sampling_fps" are popped out (the caller's dict IS mutated,
            matching existing behavior).
        prompts: Mapping with "glue" and "final" prompt templates.

    Yields:
        str: Progress and model-output lines for streaming to the client.
    """
    num_perceptions = int(generation_config.pop("num_perceptions", 3))
    sampling_fps = float(generation_config.pop("sampling_fps", 2.0))
    # Sampling must be enabled explicitly when a nonzero temperature is set.
    if generation_config.get("temperature", 0.0) > 0.0: generation_config["do_sample"] = True
    pred_glue = None
    final_answer = "No answer."

    for perception in range(num_perceptions):
        yield f" Perception {perception + 1}/{num_perceptions} \n"
        current_prompt = prompts["glue"].replace("[QUESTION]", question) if perception < num_perceptions - 1 else prompts["final"].replace("[QUESTION]", question)
        ans = inference_step(video_path, current_prompt, generation_kwargs=generation_config, pred_glue=pred_glue, sampling_fps=sampling_fps)
        yield f"Model Output: {ans}\n"
        final_answer = ans
        match_glue = re.search(r'<glue>(.*?)</glue>', ans, re.DOTALL)
        pred_glue = None
        if match_glue:
            # BUG FIX: the <glue> payload is untrusted model output; a
            # malformed literal used to raise and kill the whole stream here.
            try:
                pred_glue = ast.literal_eval(match_glue.group(1).strip())
            except (ValueError, SyntaxError):
                logger.warning("Could not parse <glue> payload; ignoring key-time hints for next pass.")
    yield f"\n Final Answer \n{final_answer}\n"
|
| 140 |
+
|
| 141 |
+
async def attempt_toon_repair(original_text: str, schema: str, client, model_type: str, config: dict):
    """Ask a second AI call to reformat a malformed model output into TOON.

    Args:
        original_text: The output that failed TOON parsing.
        schema: Target TOON schema the rewrite must follow.
        client: Optional pre-built GenAI client (used on the 'vertex' path).
        model_type: 'gemini' (legacy SDK) or 'vertex' (modern SDK).
        config: Backend settings; 'vertex' reads 'project_id', 'location', 'model_name'.

    Returns:
        str: The repaired text, or `original_text` unchanged if the repair call fails.
    """
    logger.info("Attempting TOON Repair via separate AI call...")
    repair_prompt = (
        f"SYSTEM: You are a data formatting expert. The following output from a previous model "
        f"failed to parse correctly. \n"
        f"YOUR TASK: Rewrite the data strictly into the following TOON schema. Do not add new content, "
        f"just format it. If scores are missing, infer them from the text or default to 0.\n\n"
        f"TARGET SCHEMA:\n{schema}\n\n"
        f"BAD OUTPUT:\n{original_text}\n"
    )

    try:
        loop = asyncio.get_event_loop()
        repaired_text = ""

        if model_type == 'gemini':
            # Legacy SDK path; the blocking call is moved off the event loop.
            model = genai_legacy.GenerativeModel("models/gemini-2.0-flash-exp")
            response = await loop.run_in_executor(
                None,
                lambda: model.generate_content(repair_prompt, generation_config={"temperature": 0.0})
            )
            repaired_text = response.text

        elif model_type == 'vertex':
            # Use separate client instance if needed or passed client
            cl = client if client else genai.Client(vertexai=True, project=config['project_id'], location=config['location'])
            response = await loop.run_in_executor(
                None,
                lambda: cl.models.generate_content(
                    model=config['model_name'],
                    contents=repair_prompt,
                    config=GenerateContentConfig(temperature=0.0)
                )
            )
            repaired_text = response.text

        logger.info(f"Repair successful. New text length: {len(repaired_text)}")
        return repaired_text
    except Exception as e:
        # Fall back to the unparsed text so the caller can still proceed.
        logger.error(f"Repair failed: {e}")
        return original_text
|
| 182 |
+
|
| 183 |
+
async def run_gemini_labeling_pipeline(video_path: str, caption: str, transcript: str, gemini_config: dict, include_comments: bool, reasoning_method: str = "cot"):
    """Label a video for factuality via the legacy Gemini SDK.

    Async generator: yields progress strings while running, then exactly one
    dict — either ``{"raw_toon", "parsed_data", "prompt_used"}`` on success or
    ``{"error": ...}`` on an empty model response. Yields an ``ERROR: ...``
    string (no dict) on missing SDK or any exception.

    Args:
        video_path: Local path of the video to upload to the Files API.
        caption: User caption, interpolated into the prompt.
        transcript: Audio transcript, interpolated into the prompt.
        gemini_config: Must contain ``api_key``; ``model_name`` is read only
            for the agentic path.
        include_comments: Selects the reasoning schema/instructions variant.
        reasoning_method: "cot" (default), "fcot" (currently a no-op stub
            here — falls through to the empty-response guard) or "agentic".
    """
    if genai_legacy is None:
        yield "ERROR: Legacy SDK missing.\n"
        return

    api_key = gemini_config.get("api_key")
    # Silent no-op when unconfigured; callers see an empty generator.
    if not api_key: return

    try:
        genai_legacy.configure(api_key=api_key)
        loop = asyncio.get_event_loop()
        # Files API upload is blocking; run it off the event loop.
        uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.upload_file(path=video_path))
        # Poll until server-side processing finishes.
        # NOTE(review): state is never re-fetched inside the loop — presumably
        # the SDK object updates itself; confirm, otherwise this can spin.
        while uploaded_file.state.name == "PROCESSING": await asyncio.sleep(2)

        model = genai_legacy.GenerativeModel("models/gemini-2.0-flash-exp")
        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE

        raw_text = ""
        prompt_used = ""

        if reasoning_method == "fcot":
            # ... existing fcot logic ...
            pass
        elif reasoning_method == "agentic":
            yield "Starting Multi-Agent Analysis..."
            # Create a clean system instance with the provided API key
            model_name = gemini_config.get("model_name", "models/gemini-2.1-flash-lite")
            system = get_video_analysis_system(api_key=api_key, model_name=model_name)

            results = await loop.run_in_executor(None, lambda: system.run(
                video_path=video_path,
                caption=caption,
                transcript=transcript
            ))
            # Concatenate agent outputs for parsing
            raw_text = "\n\n".join([str(res) for res in results])
            prompt_used = "Multi-Agent System Execution"
        else:
            # --- Standard Chain of Thought (Single Turn) ---
            prompt_text = LABELING_PROMPT_TEMPLATE.format(
                caption=caption,
                transcript=transcript,
                toon_schema=toon_schema,
                score_instructions=score_instructions
            )
            prompt_used = prompt_text
            yield "Generating Labels (Standard CoT)..."
            response = await loop.run_in_executor(None, lambda: model.generate_content([prompt_text, uploaded_file], generation_config={"temperature": 0.1}))
            raw_text = response.text

        if not raw_text:
            yield "Model returned empty response (possibly triggered safety filter)."
            yield {"error": "Empty Response"}
            return

        parsed_data = parse_veracity_toon(raw_text)

        # A '0' visual-integrity score is the parser's "could not parse"
        # sentinel; trigger one repair pass and re-parse.
        is_zero = parsed_data['veracity_vectors']['visual_integrity_score'] == '0'
        if is_zero:
            yield "Parsing incomplete (score 0). Initiating Auto-Repair..."
            raw_text = await attempt_toon_repair(raw_text, toon_schema, None, 'gemini', gemini_config)
            parsed_data = parse_veracity_toon(raw_text)

        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}
        # Best-effort cleanup of the uploaded file; skipped if an earlier step raised.
        await loop.run_in_executor(None, lambda: genai_legacy.delete_file(name=uploaded_file.name))

    except Exception as e:
        yield f"ERROR: {e}"
|
| 252 |
+
|
| 253 |
+
async def run_vertex_labeling_pipeline(video_path: str, caption: str, transcript: str, vertex_config: dict, include_comments: bool, reasoning_method: str = "cot"):
    """Label a video for factuality via Vertex AI (google-genai SDK).

    Async generator mirroring :func:`run_gemini_labeling_pipeline`: yields
    progress strings, then one result dict (``raw_toon`` / ``parsed_data`` /
    ``prompt_used``) or an ``{"error": ...}`` dict; yields ``ERROR: ...``
    strings on SDK absence or exceptions.

    Args:
        video_path: Local video path; sent inline as bytes (no Files API here).
        caption: User caption interpolated into the prompts.
        transcript: Audio transcript interpolated into the prompts.
        vertex_config: Needs ``project_id``; ``location`` and ``model_name``
            have defaults.
        include_comments: Selects the reasoning schema/instructions variant.
        reasoning_method: "fcot" runs the 3-stage chat flow; anything else
            runs single-turn CoT.
    """
    if genai is None:
        yield "ERROR: 'google-genai' not installed.\n"
        return

    project_id = vertex_config.get("project_id")
    location = vertex_config.get("location", "us-central1")
    model_name = vertex_config.get("model_name", "gemini-1.5-pro-preview-0409")

    # Silent no-op when unconfigured; callers see an empty generator.
    if not project_id: return

    try:
        client = genai.Client(vertexai=True, project=project_id, location=location)
        # Whole file read into memory and sent inline; assumes videos fit the
        # inline payload limit — TODO confirm for large uploads.
        with open(video_path, 'rb') as f:
            video_bytes = f.read()
        video_part = Part.from_bytes(data=video_bytes, mime_type="video/mp4")

        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE

        raw_text = ""
        prompt_used = ""

        loop = asyncio.get_event_loop()
        # Shared generation config: low temperature for stable labels, search
        # tool enabled for grounding.
        config = GenerateContentConfig(
            temperature=0.1,
            response_mime_type="text/plain",
            max_output_tokens=8192,
            tools=[Tool(google_search=GoogleSearch())]
        )

        if reasoning_method == "fcot":
            # --- Fractal Chain of Thought (Vertex) ---
            yield "Starting Fractal Chain of Thought (Vertex FCoT)..."
            chat = client.chats.create(model=model_name, config=config)

            # 1. Macro
            yield "FCoT Step 1: Macro-Scale..."
            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
            # First message must contain the video
            res1 = await loop.run_in_executor(None, lambda: chat.send_message([video_part, macro_prompt]))
            macro_hypothesis = res1.text
            yield f"Hypothesis: {macro_hypothesis[:80]}...\n"

            # 2. Meso
            yield "FCoT Step 2: Meso-Scale..."
            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
            # res2's text is not read directly; it stays in the chat history
            # that stage 3 synthesizes from.
            res2 = await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt))

            # 3. Synthesis
            yield "FCoT Step 3: Synthesis..."
            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt))

            raw_text = res3.text
            prompt_used = f"FCoT (Vertex):\nMacro: {macro_prompt}\nMeso: {meso_prompt}\nSynthesis: {synthesis_prompt}"

        else:
            # --- Standard CoT ---
            prompt_text = LABELING_PROMPT_TEMPLATE.format(
                caption=caption,
                transcript=transcript,
                toon_schema=toon_schema,
                score_instructions=score_instructions
            )
            prompt_used = prompt_text
            yield "Generating Labels (Vertex CoT)..."

            response = await loop.run_in_executor(
                None,
                lambda: client.models.generate_content(
                    model=model_name,
                    contents=[video_part, prompt_text],
                    config=config
                )
            )
            raw_text = response.text

        if not raw_text:
            yield "Model returned empty response."
            yield {"error": "Empty Response"}
            return

        parsed_data = parse_veracity_toon(raw_text)

        # '0' visual-integrity score is the parser's "could not parse"
        # sentinel; run one repair pass through the same client and re-parse.
        is_zero = parsed_data['veracity_vectors']['visual_integrity_score'] == '0'
        if is_zero:
            yield "Parsing incomplete (score 0). Initiating Auto-Repair..."
            raw_text = await attempt_toon_repair(raw_text, toon_schema, client, 'vertex', vertex_config)
            parsed_data = parse_veracity_toon(raw_text)

        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}

    except Exception as e:
        yield f"ERROR: {e}"
        logger.error("Vertex Labeling Error", exc_info=True)
|
| 349 |
+
|
| 350 |
+
# Keep legacy pipeline functions for general Q&A compatibility
|
| 351 |
+
async def run_gemini_pipeline(video_path, question, checks, gemini_config, generation_config=None):
    """Legacy single-shot video Q&A via the legacy Gemini SDK.

    Async generator: uploads the video, asks ``question`` about it, yields the
    answer text once, and deletes the uploaded file. ``checks`` and
    ``generation_config`` are accepted for interface compatibility but unused.
    Silently yields nothing when the SDK or API key is missing.
    """
    if genai_legacy is None: return
    api_key = gemini_config.get("api_key")
    if not api_key: return
    genai_legacy.configure(api_key=api_key)
    loop = asyncio.get_event_loop()
    uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.upload_file(path=video_path))
    # BUGFIX: delete the uploaded file even if generation raises — previously
    # any failure after upload leaked the remote file.
    try:
        while uploaded_file.state.name == "PROCESSING": await asyncio.sleep(2)
        model = genai_legacy.GenerativeModel(gemini_config.get("model_name", "models/gemini-1.5-pro-latest"))
        response = await loop.run_in_executor(None, lambda: model.generate_content([question, uploaded_file]))
        yield response.text
    finally:
        await loop.run_in_executor(None, lambda: genai_legacy.delete_file(name=uploaded_file.name))
|
| 363 |
+
|
| 364 |
+
async def run_vertex_pipeline(video_path, question, checks, vertex_config, generation_config=None):
    """Legacy single-shot video Q&A via Vertex AI (google-genai SDK).

    Async generator: sends the video inline with ``question`` and yields the
    model's answer text once. ``checks`` and ``generation_config`` are accepted
    for interface compatibility but unused. Yields nothing if the SDK is absent.
    """
    if genai is None:
        return

    client = genai.Client(vertexai=True, project=vertex_config['project_id'], location=vertex_config['location'])

    with open(video_path, 'rb') as handle:
        payload = handle.read()
    clip_part = Part.from_bytes(data=payload, mime_type="video/mp4")

    def _generate():
        # Blocking SDK call, executed off the event loop below.
        return client.models.generate_content(
            model=vertex_config.get("model_name", "gemini-2.5-flash-lite"),
            contents=[clip_part, question],
        )

    loop = asyncio.get_event_loop()
    response = await loop.run_in_executor(None, _generate)
    yield response.text
|
src/labeling_logic.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Utilizes TOON (Token-Oriented Object Notation) for token efficiency and structured output.
#
# This module is pure data: prompt templates and TOON schemas consumed by the
# labeling pipelines in inference_logic.py. The string bodies are runtime
# payloads sent to the model — do not reflow or "fix" their wording casually.

# Main single-turn (CoT) labeling prompt. Placeholders: {caption},
# {transcript}, {toon_schema}, {score_instructions}. Note the doubled braces
# `{{ }}` inside — literal braces surviving str.format().
LABELING_PROMPT_TEMPLATE = """
You are an AI Factuality Assessment Agent operating under the "Ali Arsanjani Factuality Factors" framework.
Your goal is to mass-label video content, quantifying "Veracity Vectors" and "Modality Alignment".

**INPUT DATA:**
- **User Caption:** "{caption}"
- **Audio Transcript:** "{transcript}"
- **Visuals:** (Provided in video context)

**INSTRUCTIONS:**
1. **Grounding:** Cross-reference claims in the transcript with your internal knowledge base (and tools if active).
2. **Chain of Thought (<thinking>):** You MUST think step-by-step inside a `<thinking>` block before generating output.
* Analyze *Visual Integrity* (Artifacts, edits).
* Analyze *Audio Integrity* (Voice cloning, sync).
* Analyze *Modality Alignment* (Does video match audio? Does caption match content? Does audio match caption?).
* Analyze *Logic* (Fallacies, gaps).
* **Classify Tags:** Identify 3-5 relevant tags (e.g., "political", "celebrity", "targeting", "satire", "news").
* Determine *Disinformation* classification.
3. **Output Format:** Output strictly in **TOON** format (Token-Oriented Object Notation) as defined below.

**CRITICAL CONSTRAINTS:**
- Do NOT repeat the input data.
- START your response IMMEDIATELY with the `<thinking>` tag.
- **DO NOT use Markdown code blocks.** (Output plain text only).
- Use strict `Key : Type [ Count ] {{ Headers }} :` format followed by data lines.
- Strings containing commas MUST be quoted.
- ALL scores must be filled (use 0 if unsure, do not leave blank).
- **MODALITY SCORING:** You must provide 3 distinct alignment scores: Video-Audio, Video-Caption, and Audio-Caption.

**TOON SCHEMA:**
{toon_schema}

{score_instructions}

**RESPONSE:**
<thinking>
"""

# Extra constraint block used when include_comments=True (per-score reasoning).
SCORE_INSTRUCTIONS_REASONING = """
**Constraints:**
1. Provide specific reasoning for EACH score in the `vectors` and `modalities` tables.
2. Ensure strings are properly quoted.
"""

# Lean constraint block used when include_comments=False.
SCORE_INSTRUCTIONS_SIMPLE = """
**Constraint:** Focus on objective measurements. Keep text concise.
"""

# Updated Schema based on user requirements - Ensure explicit newlines
# Compact schema: scores only, no per-score reasoning columns.
SCHEMA_SIMPLE = """summary: text[1]{text}:
"Brief neutral summary of the video events"

tags: list[1]{keywords}:
"political, celebrity, deepfake, viral"

vectors: scores[1]{visual,audio,source,logic,emotion}:
(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10)
*Scale: 1=Fake/Malicious, 10=Authentic/Neutral*

modalities: scores[1]{video_audio_score,video_caption_score,audio_caption_score}:
(Int 1-10),(Int 1-10),(Int 1-10)
*Scale: 1=Mismatch, 10=Perfect Match*

factuality: factors[1]{accuracy,gap,grounding}:
(Verified/Misleading/False),"Missing evidence description","Grounding check results"

disinfo: analysis[1]{class,intent,threat}:
(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)

final: assessment[1]{score,reasoning}:
(Int 1-100),"Final synthesis of why this score was given"
"""

# Verbose schema: each vector/modality row carries a reasoning string.
SCHEMA_REASONING = """
summary: text[1]{text}:
"Brief neutral summary of the video events"

tags: list[1]{keywords}:
"political, celebrity, deepfake, viral"

vectors: details[5]{category,score,reasoning}:
Visual,(Int 1-10),"Reasoning for visual score"
Audio,(Int 1-10),"Reasoning for audio score"
Source,(Int 1-10),"Reasoning for source credibility"
Logic,(Int 1-10),"Reasoning for logical consistency"
Emotion,(Int 1-10),"Reasoning for emotional manipulation"

modalities: details[3]{category,score,reasoning}:
VideoAudio,(Int 1-10),"Reasoning for video-to-audio alignment"
VideoCaption,(Int 1-10),"Reasoning for video-to-caption alignment"
AudioCaption,(Int 1-10),"Reasoning for audio-to-caption alignment"

factuality: factors[1]{accuracy,gap,grounding}:
(Verified/Misleading/False),"Missing evidence description","Grounding check results"

disinfo: analysis[1]{class,intent,threat}:
(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)

final: assessment[1]{score,reasoning}:
(Int 1-100),"Final synthesis of why this score was given"
"""

# ==========================================
# Fractal Chain of Thought (FCoT) Prompts
# ==========================================
# Three-stage chat flow used by the Vertex "fcot" reasoning method:
# macro hypothesis -> meso verification -> synthesis into TOON.

# Stage 1. Placeholders: {caption}, {transcript}.
FCOT_MACRO_PROMPT = """
**Fractal Chain of Thought - Stage 1: Macro-Scale Hypothesis (Wide Aperture)**

You are analyzing a video for factuality.
**Context:** Caption: "{caption}" | Transcript: "{transcript}"

1. **Global Scan**: Observe the video, audio, and caption as a whole entity.
2. **Context Aperture**: Wide. Assess the overall intent (Humor, Information, Political, Social) and the setting.
3. **Macro Hypothesis**: Formulate a high-level hypothesis about the veracity. (e.g., "The video is likely authentic but the caption misrepresents the location" or "The audio quality suggests synthetic generation").

**Objective**: Maximize **Coverage** (broadly explore potential angles of manipulation).

**Output**: A concise paragraph summarizing the "Macro Hypothesis".
"""

# Stage 2. Placeholder: {macro_hypothesis} (the stage-1 answer).
FCOT_MESO_PROMPT = """
**Fractal Chain of Thought - Stage 2: Meso-Scale Expansion (Recursive Verification)**

**Current Macro Hypothesis**: "{macro_hypothesis}"

**Action**: Zoom In. Decompose the hypothesis into specific verification branches.
Perform the following checks recursively:

1. **Visual Branch**: Look for specific artifacts, lighting inconsistencies, cuts, or deepfake signs.
2. **Audio Branch**: Analyze lip-sync, background noise consistency, and voice tonality.
3. **Logical Branch**: Does the visual evidence strictly support the caption's claim? Are there logical fallacies?

**Dual-Objective Self-Correction**:
- **Faithfulness**: Do not hallucinate details not present in the video.
- **Coverage**: Did you miss any subtle cues?

**Output**: Detailed "Micro-Observations" for each branch. If you find contradictions to the Macro Hypothesis, note them explicitly as **"Self-Correction"**.
"""

# Stage 3. Placeholders: {toon_schema}, {score_instructions}.
FCOT_SYNTHESIS_PROMPT = """
**Fractal Chain of Thought - Stage 3: Inter-Scale Consensus & Synthesis**

**Action**: Integrate your Macro Hypothesis and Micro-Observations.
- **Consensus Check**: If Micro-Observations contradict the Macro Hypothesis, prioritize the Micro evidence (Self-Correction).
- **Compression**: Synthesize the findings into the final structured format.
- **Tags**: Assign 3-5 high-level tags (e.g., "political", "fabricated", "humor").

**Output Format**:
Strictly fill out the following TOON schema based on the consensus. Do not include markdown code blocks.

**TOON SCHEMA**:
{toon_schema}

{score_instructions}
"""
|
src/model.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 2 |
+
from my_vision_process import process_vision_info
|
| 3 |
+
import torch
|
| 4 |
+
import re
|
| 5 |
+
import ast
|
| 6 |
+
|
| 7 |
+
# Note: The model class has been updated to Qwen3VLForConditionalGeneration for consistency
|
| 8 |
+
# with the main application and the latest transformers library conventions for this model.
|
| 9 |
+
# Note: The model class has been updated to Qwen3VLForConditionalGeneration for consistency
# with the main application and the latest transformers library conventions for this model.
# WARNING: this module loads the model at import time (GPU/network side effect).
model_path = "OpenGVLab/VideoChat-R1_5"
# default: Load the model on the available device(s)
model = Qwen3VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto",
    attn_implementation="flash_attention_2"
).eval()

# default processer
processor = AutoProcessor.from_pretrained(model_path)

# Example placeholders — replace before running the commented demo below.
# NOTE(review): `question` is set to a .mp4 filename; presumably a copy-paste
# placeholder, should be a question string — confirm before use.
video_path = "your_video.mp4"
question = "your_qa.mp4"
num_percptions = 3  # number of perception rounds in the demo loop (sic: "percptions")

# Prompt variant that also asks for <glue> time spans grounding the answer.
QA_THINK_GLUE = """Answer the question: "[QUESTION]" according to the content of the video.

Output your think process within the <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer to this question in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

# Prompt variant used on the final perception round (answer only).
QA_THINK = """Answer the question: "[QUESTION]" according to the content of the video.

Output your think process within the <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags, output the corresponding letter of the option. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def inference(video_path, prompt, model, processor, max_new_tokens=2048, client=None, pred_glue=None):
    """Run one generation pass of the Qwen-VL model over a video.

    Args:
        video_path: Path/URI of the video; forwarded to process_vision_info.
        prompt: The fully formatted text prompt.
        model: Loaded Qwen3VLForConditionalGeneration instance.
        processor: Matching AutoProcessor.
        max_new_tokens: Generation budget.
        client: Optional storage client passed through to process_vision_info
            (e.g. petrel); None for local files.
        pred_glue: Optional list of (start, end) second spans from a previous
            round, forwarded as the 'key_time' hint for frame sampling.

    Returns:
        The decoded generated text (prompt tokens stripped) as a single string.
    """
    device = model.device
    messages = [
        {"role": "user", "content": [
            {"type": "video",
             "video": video_path,
             'key_time':pred_glue,
             # Pixel budget caps for frame resizing (28*28 is the patch unit).
             "total_pixels": 128*12 * 28 * 28,
             "min_pixels": 128 * 28 * 28,
            },
            {"type": "text", "text": prompt},
            ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Decodes/samples the video frames; returns kwargs including per-video fps.
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True, client=client)
    fps_inputs = video_kwargs['fps'][0]

    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    # Strip the echoed prompt tokens before decoding.
    generated_ids = [output_ids[i][len(inputs['input_ids'][i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# This is example usage code. You should replace the placeholders.
|
| 69 |
+
# For example:
|
| 70 |
+
# item = {"problem": {"question": "What is the person doing in the video?"}}
|
| 71 |
+
# client = None # Or initialize your client
|
| 72 |
+
# pred_glue = None
|
| 73 |
+
# answers = []
|
| 74 |
+
|
| 75 |
+
# for percption in range(num_percptions):
|
| 76 |
+
# if percption == num_percptions - 1:
|
| 77 |
+
# example_prompt = QA_THINK.replace("[QUESTION]", item["problem"]["question"])
|
| 78 |
+
# else:
|
| 79 |
+
# example_prompt = QA_THINK_GLUE.replace("[QUESTION]", item["problem"]["question"])
|
| 80 |
+
|
| 81 |
+
# ans = inference(video_path, example_prompt, model, processor, client=client, pred_glue=pred_glue)
|
| 82 |
+
|
| 83 |
+
# pattern_glue = r'<glue>(.*?)</glue>'
|
| 84 |
+
# match_glue = re.search(pattern_glue, ans, re.DOTALL)
|
| 85 |
+
# answers.append(ans)
|
| 86 |
+
# pred_glue = None
|
| 87 |
+
# try:
|
| 88 |
+
# if match_glue:
|
| 89 |
+
# glue = match_glue.group(1)
|
| 90 |
+
# pred_glue = ast.literal_eval(glue)
|
| 91 |
+
# except Exception as e:
|
| 92 |
+
# pred_glue = None
|
| 93 |
+
# print(ans)
|
src/my_vision_process.py
ADDED
|
@@ -0,0 +1,812 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from torchvision import transforms
|
| 4 |
+
import base64
|
| 5 |
+
import logging
|
| 6 |
+
import math
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
import warnings
|
| 11 |
+
from functools import lru_cache
|
| 12 |
+
from io import BytesIO
|
| 13 |
+
|
| 14 |
+
import requests
|
| 15 |
+
import torch
|
| 16 |
+
import torchvision
|
| 17 |
+
from packaging import version
|
| 18 |
+
from PIL import Image
|
| 19 |
+
from torchvision import io, transforms
|
| 20 |
+
from torchvision.transforms import InterpolationMode
|
| 21 |
+
from typing import Optional
|
| 22 |
+
import random
|
| 23 |
+
import os
|
| 24 |
+
import io
|
| 25 |
+
import av
|
| 26 |
+
import cv2
|
| 27 |
+
import decord
|
| 28 |
+
import imageio
|
| 29 |
+
from decord import VideoReader
|
| 30 |
+
import torch
|
| 31 |
+
import numpy as np
|
| 32 |
+
import math
|
| 33 |
+
import gc
|
| 34 |
+
import torchaudio
|
| 35 |
+
from torchvision.transforms.functional import pil_to_tensor
|
| 36 |
+
import re
|
| 37 |
+
logger = logging.getLogger(__name__)
|
| 38 |
+
|
| 39 |
+
# from models.backbones.beats.BEATs import BEATs, BEATsConfig
|
| 40 |
+
try:
|
| 41 |
+
from petrel_client.client import Client
|
| 42 |
+
client = Client('~/petreloss.conf')
|
| 43 |
+
except (ImportError, FileNotFoundError):
|
| 44 |
+
# This is expected if petrel is not used.
|
| 45 |
+
client = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def get_index(num_frames, num_segments):
    """Pick one frame index from the middle of each of ``num_segments``
    equal spans of a ``num_frames``-long video.

    Returns a numpy int array of length ``num_segments``.
    """
    step = float(num_frames - 1) / num_segments
    base = int(step / 2)
    picks = [base + int(np.round(step * k)) for k in range(num_segments)]
    return np.array(picks)
|
| 55 |
+
|
| 56 |
+
def get_frame_indices(num_frames, vlen, sample='middle', fix_start=None, input_fps=1, min_num_frames=1, max_num_frames=-1, local_num_frames=8):
    """Select `num_frames` frame indices from a video of `vlen` frames.

    Sampling modes:
        - 'middle': uniform sampling, taking the middle frame of each interval.
        - 'rand': uniform sampling, taking a random frame of each interval.
        - 'dynamic_fps1': derive the frame count from the clip duration
          (one group of `local_num_frames` per second-block), then fall
          through to 'middle' sampling.
        - 'fps<k>': not implemented (raises NotImplementedError).

    Args:
        num_frames: requested number of frames (ignored for 'dynamic_fps1').
        vlen: total number of frames in the video.
        sample: sampling strategy (see above).
        fix_start: fixed per-interval offset; overrides 'middle' when set.
        input_fps: original fps of the video.
        min_num_frames / max_num_frames: clamps on the output count
          (max_num_frames <= 0 means unbounded).
        local_num_frames: group size used by 'dynamic_fps1'.

    Returns:
        list of frame indices (padded with the last index when an interval
        split yields fewer than `num_frames`).
    """
    if min_num_frames > vlen:
        if sample == 'dynamic_fps1':
            # Keep the minimum a multiple of the local group size.
            min_num_frames = (vlen // local_num_frames) * local_num_frames
        else:
            min_num_frames = vlen

    if sample == 'dynamic_fps1':
        duration = float(vlen) / input_fps
        num_segments = int(duration // local_num_frames)
        if num_segments == 0:
            num_frames = local_num_frames
        else:
            num_frames = local_num_frames * num_segments

        if max_num_frames > 0:
            num_frames = min(num_frames, max_num_frames)
        sample = "middle"  # NOTE

        num_frames = max(min_num_frames, num_frames)

    if sample in ["rand", "middle"]:  # uniform sampling
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except IndexError:
                # An interval can be empty (x[0] == x[1]) when vlen is small;
                # fall back to a sorted random permutation. (Was a bare
                # `except:` — narrowed to what random.choice actually raises.)
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:  # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif "fps" in sample:  # fps0.5, sequentially sample frames at 0.5 fps
        raise NotImplementedError
        # --- unreachable reference implementation, kept intentionally ---
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps  # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
    else:
        raise ValueError(f"Not support sample type: {sample}")

    return frame_indices
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# NOTE(review): this re-binds the module logger already created above; the
# duplicate assignment is harmless.
logger = logging.getLogger(__name__)

# --- image sizing constants ---
# NOTE(review): 28 appears to be the vision patch size (Qwen-VL convention);
# pixel budgets are expressed in multiples of 28x28 patches — confirm.
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 768 * 28 * 28
MAX_RATIO = 200  # reject images with a more extreme aspect ratio than this

# --- video sizing / frame-sampling constants ---
VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
FRAME_FACTOR = 2      # sampled frame counts are snapped to multiples of this
FPS = 2.0             # default target sampling fps
FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 512

# Set the maximum number of video token inputs.
# Here, 128K represents the maximum number of input tokens for the VLLM model.
# Remember to adjust it according to your own configuration.
VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))
# logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
|
| 146 |
+
# logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of `factor` closest to `number`.

    Ties follow Python's banker's rounding (round-half-to-even).
    """
    multiples = round(number / factor)
    return multiples * factor
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of `factor` that is >= `number`."""
    steps = math.ceil(number / factor)
    return steps * factor
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of `factor` that is <= `number`."""
    steps = math.floor(number / factor)
    return steps * factor
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def smart_resize(
    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
) -> tuple[int, int]:
    """Pick an output (height, width) for resizing such that:

    1. both sides are divisible by `factor`;
    2. the total pixel count lands inside [`min_pixels`, `max_pixels`];
    3. the aspect ratio is preserved as closely as possible.

    A zero-sized input maps to (0, 0); aspect ratios beyond MAX_RATIO raise.
    """
    if min(height, width) == 0:
        return 0, 0
    aspect = max(height, width) / min(height, width)
    if aspect > MAX_RATIO:
        raise ValueError(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {aspect}"
        )
    # Snap each side to the factor grid, never below one factor.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    area = h_bar * w_bar
    if area > max_pixels:
        # Too large: shrink both sides by a common ratio, rounding down.
        shrink = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / shrink, factor)
        w_bar = floor_by_factor(width / shrink, factor)
    elif area < min_pixels:
        # Too small: grow both sides by a common ratio, rounding up.
        grow = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * grow, factor)
        w_bar = ceil_by_factor(width * grow, factor)
    return h_bar, w_bar
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def to_rgb(pil_image: Image.Image) -> Image.Image:
    """Convert a PIL image to RGB, compositing RGBA images onto white."""
    if pil_image.mode != 'RGBA':
        return pil_image.convert("RGB")
    # Flatten transparency against a white canvas, using alpha as the mask.
    canvas = Image.new("RGB", pil_image.size, (255, 255, 255))
    canvas.paste(pil_image, mask=pil_image.split()[3])
    return canvas
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def fetch_image(ele: dict[str, str | Image.Image], client= None, size_factor: int = IMAGE_FACTOR) -> Image.Image:
    """Load and resize a single image described by a content element.

    Accepted sources, checked in this order: a PIL.Image (or a list whose
    first entry is one), an http(s) URL, a file:// URI, a data:image base64
    URI, an s3-style path (when a petrel `client` is supplied), or a plain
    local path. The result is converted to RGB and resized so both sides are
    multiples of `size_factor` (via smart_resize).

    Raises:
        ValueError: when the input cannot be recognized as any supported form.
    """
    # The element may carry the image under either key.
    if "image" in ele:
        image = ele["image"]
    else:
        image = ele["image_url"]
    image_obj = None
    if isinstance(image, Image.Image):
        image_obj = image
    elif isinstance(image, list) and isinstance(image[0], Image.Image):
        # Only the first image of a list is used.
        image_obj = image[0]
    elif image.startswith("http://") or image.startswith("https://"):
        # NOTE(review): no HTTP status check — a failed download surfaces as a
        # PIL decode error; confirm this is acceptable upstream.
        response = requests.get(image, stream=True)
        image_obj = Image.open(BytesIO(response.content))
    elif image.startswith("file://"):
        image_obj = Image.open(image[7:])
    elif image.startswith("data:image"):
        if "base64," in image:
            _, base64_data = image.split("base64,", 1)
            data = base64.b64decode(base64_data)
            image_obj = Image.open(BytesIO(data))
    elif 's3' in image and client is not None:
        # Object-store path fetched through the petrel client.
        # NOTE: `io` here is the stdlib io module (the later `import io` at the
        # top of the file shadows torchvision's io).
        file_content = client.get(image)
        image_obj = Image.open(io.BytesIO(file_content))
    else:
        image_obj = Image.open(image)
    if image_obj is None:
        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
    image = to_rgb(image_obj)
    ## resize
    if "resized_height" in ele and "resized_width" in ele:
        # Caller-specified output size, snapped to the factor grid.
        resized_height, resized_width = smart_resize(
            ele["resized_height"],
            ele["resized_width"],
            factor=size_factor,
        )
    else:
        # Derive the output size from the image itself and the pixel budget.
        width, height = image.size
        min_pixels = ele.get("min_pixels", MIN_PIXELS)
        max_pixels = ele.get("max_pixels", MAX_PIXELS)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=size_factor,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )
    image = image.resize((resized_width, resized_height))

    return image
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def smart_nframes(
    ele: dict,
    total_frames: int,
    video_fps: int | float,
) -> int:
    """Calculate the number of frames for video used for model inputs.

    Args:
        ele (dict): a dict contains the configuration of video.
            support either `fps` or `nframes`:
                - nframes: the number of frames to extract for model inputs.
                - fps: the fps to extract frames for model inputs.
                - min_frames: the minimum number of frames of the video, only used when fps is provided.
                - max_frames: the maximum number of frames of the video, only used when fps is provided.
        total_frames (int): the original total number of frames of the video.
        video_fps (int | float): the original fps of the video.

    Raises:
        AssertionError: when both `fps` and `nframes` are given.

    Returns:
        int: the number of frames for video used for model inputs (a multiple
        of FRAME_FACTOR, clamped to [FRAME_FACTOR, total_frames] when the
        video is long enough).
    """
    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
    if "nframes" in ele:
        # Explicit frame count: just snap it to the FRAME_FACTOR grid.
        nframes = round_by_factor(ele["nframes"], FRAME_FACTOR)
    else:
        fps = ele.get("fps", FPS)
        min_frames = ceil_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR)
        max_frames = floor_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR)
        if video_fps > 0:
            # Target count = duration (s) * desired sampling fps.
            nframes = total_frames / video_fps * fps
        else:  # if video_fps is 0, use total_frames as nframes
            nframes = total_frames
        if nframes > total_frames:
            logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
        # Clamp into [min_frames, max_frames], never beyond the real length,
        # then snap DOWN to a multiple of FRAME_FACTOR. The order of these
        # clamps is deliberate — do not reorder.
        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
        nframes = floor_by_factor(nframes, FRAME_FACTOR)
    if not (FRAME_FACTOR <= nframes and nframes <= total_frames) and total_frames > FRAME_FACTOR:
        # Out-of-range result on a long-enough video: warn and force it back.
        logger.warning(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
        nframes = max(FRAME_FACTOR, min(nframes, total_frames))
    return nframes
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _read_video_torchvision(
|
| 300 |
+
ele: dict,
|
| 301 |
+
) -> (torch.Tensor, float):
|
| 302 |
+
raise NotImplementedError("Torchvision reader is not fully supported in this context.")
|
| 303 |
+
"""read video using torchvision.io.read_video
|
| 304 |
+
|
| 305 |
+
Args:
|
| 306 |
+
ele (dict): a dict contains the configuration of video.
|
| 307 |
+
support keys:
|
| 308 |
+
- video: the path of video. support "file://", "http://", "https://" and local path.
|
| 309 |
+
- video_start: the start time of video.
|
| 310 |
+
- video_end: the end time of video.
|
| 311 |
+
Returns:
|
| 312 |
+
torch.Tensor: the video tensor with shape (T, C, H, W).
|
| 313 |
+
"""
|
| 314 |
+
video_path = ele["video"]
|
| 315 |
+
if version.parse(torchvision.__version__) < version.parse("0.19.0"):
|
| 316 |
+
if "http://" in video_path or "https://" in video_path:
|
| 317 |
+
warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
|
| 318 |
+
if "file://" in video_path:
|
| 319 |
+
video_path = video_path[7:]
|
| 320 |
+
st = time.time()
|
| 321 |
+
video, audio, info = io.read_video(
|
| 322 |
+
video_path,
|
| 323 |
+
start_pts=ele.get("video_start", 0.0),
|
| 324 |
+
end_pts=ele.get("video_end", None),
|
| 325 |
+
pts_unit="sec",
|
| 326 |
+
output_format="TCHW",
|
| 327 |
+
)
|
| 328 |
+
total_frames, video_fps = video.size(0), info["video_fps"]
|
| 329 |
+
logger.info(f"torchvision: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
|
| 330 |
+
nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
|
| 331 |
+
idx = torch.linspace(0, total_frames - 1, nframes).round().long()
|
| 332 |
+
sample_fps = nframes / max(total_frames, 1e-6) * video_fps
|
| 333 |
+
video = video[idx]
|
| 334 |
+
return video, sample_fps
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def is_decord_available() -> bool:
    """Return True when the optional `decord` package is importable."""
    from importlib.util import find_spec

    return find_spec("decord") is not None
|
| 341 |
+
|
| 342 |
+
def pts_to_secs(pts: int, time_base: float, start_pts: int) -> float:
    """Convert a presentation timestamp to seconds.

    The timestamp is offset by `start_pts` and scaled by `time_base`.
    An infinite pts passes through unchanged. Mirrors pytorchvideo:
    https://github.com/facebookresearch/pytorchvideo/blob/main/pytorchvideo/data/utils.py#L54-L64
    """
    return math.inf if pts == math.inf else int(pts - start_pts) * time_base
|
| 355 |
+
|
| 356 |
+
def get_pyav_video_duration(video_reader):
    """Return the duration in seconds of a PyAV container's first video stream."""
    stream = video_reader.streams.video[0]
    seconds = pts_to_secs(stream.duration, stream.time_base, stream.start_time)
    return float(seconds)
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def _read_video_img(
    ele: dict,
    client
) -> (torch.Tensor, float):
    """Read a "video" stored as a directory of per-frame image files.

    Frame files are listed (locally, or via the petrel `client` for s3 paths),
    sorted by the numeric suffix in their filenames, subsampled, decoded with
    OpenCV, and stacked into a (T, C, H, W) uint8 tensor.

    Args:
        ele (dict): a dict contains the configuration of video.
            support keys:
                - video: path of the frame directory (local or s3).
                - media_dict: optional; may carry 'start'/'end' clip times
                  in seconds.
    Returns:
        (torch.Tensor, float): the (T, C, H, W) uint8 video tensor and the
        effective sample fps.
    """
    import re

    def extract_frame_number(filename):
        # Extract the trailing _<digits> number from the frame filename.
        if filename.endswith('.jpg'):
            match = re.search(r'_(\d+).jpg$', filename)
        elif filename.endswith('.jpeg'):
            match = re.search(r'_(\d+).jpeg$', filename)
        elif filename.endswith('.png'):
            match = re.search(r'_(\d+).png$', filename)
        else:
            # NOTE(review): this message carries no placeholder, so the
            # offending filename is not reported — looks like a lost f-string
            # interpolation; confirm upstream.
            raise NotImplementedError(f"Wrong filename: (unknown)")

        return int(match.group(1)) if match else -1


    def sort_frames(frame_paths):
        # Extract filenames from each path and sort by their numeric part.
        return sorted(frame_paths, key=lambda x: extract_frame_number(os.path.basename(x)))
    video_path = ele["video"]
    if 'media_dict' in ele:
        media_dict = ele["media_dict"]
    else:
        media_dict = {}
    if "s3://" in video_path and client is not None:
        img_list = sort_frames(client.list(video_path))
    else:
        img_list = sort_frames(list(os.listdir(video_path)))

    if "start" in media_dict.keys():
        clip = [media_dict['start'], media_dict['end']]
    else:
        clip = None

    if 'tvqa' in video_path.lower():
        fps = 3.0  # TVQA is 3fps
    else:
        fps = 1.0  # NOTE: Treat unknown data as 1fps by default

    if clip is not None:
        start = float(clip[0])
        end = float(clip[1])
        start = max(0, start)
        end = min(len(img_list) / fps, end)  # prevent end from exceeding video end
        vlen = (end - start) * fps
    else:
        vlen = len(img_list)

    duration = vlen / fps

    num_frames = smart_nframes(ele, total_frames=vlen, video_fps=fps)
    sample = 'middle'
    if clip is not None:
        def _get_index_by_time(start_sec, end_sec, num_segments=8, fps=1., max_frame=9999):
            # Evenly spaced indices between the clip's start/end seconds.
            start_idx = max(1, round(start_sec * fps))
            end_idx = min(round(end_sec * fps), max_frame)
            seg_size = float(end_idx - start_idx) / (num_segments - 1)
            offsets = np.array([start_idx + int(np.round(seg_size * idx)) for idx in range(num_segments)])
            return offsets

        frame_indices = _get_index_by_time(float(clip[0]), float(clip[1]), num_segments=num_frames, fps=fps, max_frame=len(img_list)-1)
    else:
        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, local_num_frames=1, input_fps=fps
        )

    imgs = []
    for idx in frame_indices:
        frame_fname = os.path.join(video_path, img_list[idx])
        if "s3://" in video_path and client is not None:
            img_bytes = client.get(frame_fname)
        else:
            with open(frame_fname, 'rb') as f:
                img_bytes = f.read()
        img_np = np.frombuffer(img_bytes, np.uint8)
        img = cv2.imdecode(img_np, cv2.IMREAD_COLOR)
        # In-place BGR -> RGB conversion (OpenCV decodes as BGR).
        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
        imgs.append(img)

    frames = np.array(imgs, dtype=np.uint8)

    frames = torch.tensor(np.array(imgs), dtype=torch.uint8).permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8
    sample_fps = num_frames / max(vlen, 1e-6) * fps

    return frames, sample_fps
|
| 471 |
+
|
| 472 |
+
def _read_video_av(
    ele: dict,
    client
) -> (torch.Tensor, float):
    """Read a video file with PyAV and return sampled frames.

    Decodes every frame of the container up front (no clip support), then
    subsamples with get_frame_indices ('middle' strategy).

    Args:
        ele (dict): a dict contains the configuration of video.
            support keys:
                - video: the path of video (local, or s3 via the petrel client).
                - media_dict: optional; 'start'/'end' clips raise
                  NotImplementedError for this backend.
    Returns:
        (torch.Tensor, float): the (T, C, H, W) uint8 video tensor and the
        effective sample fps.
    """
    video_path = ele["video"]
    if 'media_dict' in ele:
        media_dict = ele["media_dict"]
    else:
        media_dict = {}
    if "start" in media_dict.keys():
        clip = [media_dict['start'], media_dict['end']]
    else:
        clip = None

    if clip is not None:
        raise NotImplementedError("av don't support clip!!!")
    if 's3://' in video_path and client is not None:
        video_bytes = client.get(video_path)
        byteio = io.BytesIO(video_bytes)
        byteio.seek(0)
        reader = av.open(byteio)
    else:
        byteio = None
        reader = av.open(video_path)
    # Decode the whole stream eagerly; memory-heavy for long videos.
    frames = [f.to_rgb().to_ndarray() for f in reader.decode(video=0)]
    vlen = len(frames)
    sample = 'middle'
    duration = get_pyav_video_duration(reader)
    fps = vlen / float(duration)
    num_frames = smart_nframes(ele, total_frames=vlen, video_fps=fps)
    frame_indices = get_frame_indices(
        num_frames, vlen, sample=sample,
        input_fps=fps, local_num_frames=1
    )
    frames = np.stack([frames[idx] for idx in frame_indices])  # (T, H, W, C), uint8
    frames = torch.tensor(frames).permute(0, 3, 1, 2)  # (T, C, H, W), torch.uint8
    if byteio != None:
        byteio.close()

    reader.close()
    sample_fps = num_frames / max(vlen, 1e-6) * fps

    return frames, sample_fps
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
import numpy as np
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
def sample_frames(key_timestamps, total_frames, num_frames, key_ratio, fps):
    """Sample `num_frames` indices from `total_frames`, oversampling key spans.

    Frames inside `key_timestamps` are sampled `key_ratio` times as densely
    as frames outside them.

    Parameters:
        key_timestamps (list of tuple): (start_sec, end_sec) pairs.
        total_frames (int): total number of frames.
        num_frames (int): number of frames to sample.
        key_ratio (float): density multiplier for the key regions.
        fps (int): video frame rate (frames per second).

    Returns:
        list: sorted, de-duplicated sampled frame indices.
    """
    # Step 1: convert key time spans (seconds) into inclusive frame ranges.
    ranges = []
    for begin_sec, finish_sec in key_timestamps:
        lo = int(begin_sec * fps)
        hi = min(int(finish_sec * fps), total_frames - 1)
        if lo <= hi:
            ranges.append((lo, hi))

    # Step 2: count frames in key vs. non-key regions.
    n_key = sum(hi - lo + 1 for lo, hi in ranges)
    n_plain = total_frames - n_key

    if n_key == 0 and n_plain == 0:
        return []
    if n_plain == 0 and key_ratio == 0:
        raise ValueError("No frames available for sampling.")

    # Step 3: split the sampling budget by weight (key regions weigh
    # key_ratio, everything else weighs 1).
    weighted_total = n_key * key_ratio + n_plain * 1.0
    if weighted_total == 0:
        budget_key, budget_plain = 0, num_frames
    else:
        budget_key = round(num_frames * (n_key * key_ratio) / weighted_total)
        budget_plain = num_frames - budget_key

    # Step 4: spread the key budget over the ranges, proportional to length
    # (every non-empty range gets at least one pick).
    picks_key = []
    for lo, hi in ranges:
        if n_key > 0:
            share = max(1, round(budget_key * (hi - lo + 1) / n_key))
            picks_key.extend(np.linspace(lo, hi, num=share, dtype=int))

    # Step 5: uniformly cover the frames outside every key range.
    covered = set()
    for lo, hi in ranges:
        covered.update(range(lo, hi + 1))
    plain_frames = [i for i in range(total_frames) if i not in covered]

    picks_plain = []
    if budget_plain > 0 and len(plain_frames) > 0:
        spots = np.linspace(0, len(plain_frames) - 1, num=budget_plain, dtype=int)
        picks_plain = [plain_frames[s] for s in spots]

    # Merge, de-duplicate, and sort.
    return sorted(set(picks_key + picks_plain))
|
| 608 |
+
|
| 609 |
+
def _read_video_decord(
    ele: dict,
    client = None,
) -> (torch.Tensor, float):
    """Read a video file with decord.VideoReader and return sampled frames.

    Falls back to the PyAV reader for .avi containers. Optionally oversamples
    frames inside ele['key_time'] spans via sample_frames.

    Args:
        ele (dict): a dict contains the configuration of video.
            support keys:
                - video: the path of video. support "file://", "http://", "https://" and local path.
                - video_start: the start time of video (not supported — raises).
                - video_end: the end time of video (not supported — raises).
    Returns:
        (torch.Tensor, float): the (T, C, H, W) video tensor and the effective
        sample fps. Returns an empty (0, 3, 224, 224) tensor and 0.0 fps when
        the video yields no frames at all.
    """
    import decord
    video_path = ele["video"]

    if video_path.endswith('.avi'):
        # decord's AVI handling is unreliable; delegate to PyAV.
        return _read_video_av(ele, client)
    st = time.time()

    if 's3://' in video_path and client is not None:
        video_bytes = client.get(video_path)
        if video_bytes is None or len(video_bytes) == 0:
            raise ValueError(f"Can't read byte from {video_path}!")
        byteio = BytesIO(video_bytes)
        vr = decord.VideoReader(byteio, num_threads=1)
    else:
        byteio = None
        vr = decord.VideoReader(video_path, num_threads=1)

    # TODO: support start_pts and end_pts
    if 'video_start' in ele or 'video_end' in ele:
        raise NotImplementedError("not support start_pts and end_pts in decord for now.")
    total_frames, video_fps = len(vr), vr.get_avg_fps()
    logger.info(f"decord: {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
    if 'key_time' in ele and ele['key_time'] is not None:
        try:
            # Oversample frames inside the given key spans (1.5x density);
            # fall back to uniform sampling on any failure.
            idx = sample_frames(ele['key_time'], total_frames-1, nframes, 1.5, vr.get_avg_fps())
        except Exception as e:
            idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
    else:
        idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()

    if not idx:
        if total_frames > 0:
            idx = [0] * nframes  # Fallback if sampling returns empty
        else:  # Cannot get any frames from video
            return torch.empty(0, 3, 224, 224), 0.0

    video = vr.get_batch(idx).asnumpy()
    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
    return video, sample_fps
|
| 666 |
+
|
| 667 |
+
|
| 668 |
+
# Dispatch table: backend name -> reader function. "img"/"frame" both read
# directories of per-frame image files; the others read container files.
VIDEO_READER_BACKENDS = {
    "decord": _read_video_decord,
    'img': _read_video_img,
    'frame': _read_video_img,
    'av': _read_video_av,
    'torchvision':_read_video_torchvision
}

# Environment override for backend selection (e.g. FORCE_QWENVL_VIDEO_READER=av).
FORCE_QWENVL_VIDEO_READER = os.getenv("FORCE_QWENVL_VIDEO_READER", None)
|
| 677 |
+
|
| 678 |
+
|
| 679 |
+
@lru_cache(maxsize=1)
def get_video_reader_backend() -> str:
    """Choose the video reader backend once and cache the answer.

    Priority: the FORCE_QWENVL_VIDEO_READER environment override, then
    decord when importable, then torchvision.
    """
    if FORCE_QWENVL_VIDEO_READER is not None:
        backend = FORCE_QWENVL_VIDEO_READER
    else:
        backend = "decord" if is_decord_available() else "torchvision"
    logger.info(f"qwen-vl-utils using {backend} to read video.")
    return backend
|
| 689 |
+
|
| 690 |
+
def fetch_video(ele: dict, client = None, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
    """Load a video content element as model-ready frames.

    When ele['video'] is a path string, frames are read through the selected
    backend (falling back to the PyAV backend on failure) and resized with
    smart_resize. When it is a list of images, each entry is loaded with
    fetch_image and the list is padded to a multiple of FRAME_FACTOR.

    Returns either a (T, C, H, W) float tensor or a list of PIL images; when
    `return_video_sample_fps` is True, a (frames, sample_fps) tuple instead.
    """
    video_reader_backend = get_video_reader_backend()
    if isinstance(ele["video"], list):
        # A [path, options] pair may override the backend for this sample.
        if len(ele["video"]) > 1 and isinstance(ele["video"][1], dict) and 'video_read_type' in ele['video'][1]:
            video_reader_backend = ele['video'][1]['video_read_type']
            ele['video'] = ele['video'][0]

    if isinstance(ele["video"], str):
        try:
            video, sample_fps = VIDEO_READER_BACKENDS[video_reader_backend](ele, client=client)
        except Exception as e:
            # Any backend failure falls back to PyAV before giving up.
            logger.warning(f"video_reader_backend {video_reader_backend} error, trying other backends. msg: {e}")
            try:
                video, sample_fps = VIDEO_READER_BACKENDS["av"](ele, client=client)
            except Exception as e2:
                logger.error(f"All video backends failed. Last error: {e2}")
                raise e2


        nframes, _, height, width = video.shape if video.ndim == 4 else (0,0,0,0)

        if nframes == 0:  # Handle empty video
            if return_video_sample_fps:
                return video, 0.0
            return video

        # Budget per-frame pixels so the whole clip stays under the model's
        # total pixel (token) budget.
        min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
        total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
        max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
        max_pixels_supposed = ele.get("max_pixels", max_pixels)
        if max_pixels_supposed > max_pixels:
            logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
        max_pixels = min(max_pixels_supposed, max_pixels)

        if "resized_height" in ele and "resized_width" in ele:
            # Caller-specified output size, snapped to the factor grid.
            resized_height, resized_width = smart_resize(
                ele["resized_height"],
                ele["resized_width"],
                factor=image_factor,
            )
        else:
            resized_height, resized_width = smart_resize(
                height,
                width,
                factor=image_factor,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
            )
        if resized_height > 0 and resized_width > 0:
            video = transforms.functional.resize(
                video,
                [resized_height, resized_width],
                interpolation=InterpolationMode.BICUBIC,
                antialias=True,
            ).float()
        if return_video_sample_fps:
            return video, sample_fps
        return video
    else:
        # List-of-images form: load each element as a still image.
        assert isinstance(ele["video"], (list, tuple))
        process_info = ele.copy()
        process_info.pop("type", None)
        process_info.pop("video", None)
        images = [
            fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
            for video_element in ele["video"]
        ]
        # Pad with the last frame up to a multiple of FRAME_FACTOR.
        nframes = ceil_by_factor(len(images), FRAME_FACTOR)
        if len(images) < nframes:
            images.extend([images[-1]] * (nframes - len(images)))
        if return_video_sample_fps:
            return images, process_info.pop("fps", 2.0)
        return images
|
| 767 |
+
|
| 768 |
+
def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
|
| 769 |
+
vision_infos = []
|
| 770 |
+
if not conversations or not isinstance(conversations[0], (list, dict)):
|
| 771 |
+
return vision_infos
|
| 772 |
+
|
| 773 |
+
if isinstance(conversations[0], dict):
|
| 774 |
+
conversations = [conversations]
|
| 775 |
+
for conversation in conversations:
|
| 776 |
+
for message in conversation:
|
| 777 |
+
if isinstance(message.get("content"), list):
|
| 778 |
+
for ele in message["content"]:
|
| 779 |
+
if (
|
| 780 |
+
ele.get("type") in ("image", "image_url", "video")
|
| 781 |
+
):
|
| 782 |
+
vision_infos.append(ele)
|
| 783 |
+
return vision_infos
|
| 784 |
+
|
| 785 |
+
|
| 786 |
+
def process_vision_info(
    conversations: list[dict] | list[list[dict]],
    return_video_kwargs: bool = False,
    client = None
) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
    """Fetch every image and video referenced by the given conversations.

    Walks the vision elements extracted from the messages and materializes
    them via ``fetch_image`` / ``fetch_video``. Empty result lists are
    normalized to ``None``. When ``return_video_kwargs`` is True, a third
    element carrying the per-video sample fps is appended to the return.
    """
    image_inputs = []
    video_inputs = []
    sample_fps_per_video = []

    for info in extract_vision_info(conversations):
        kind = info.get("type")
        if kind in ("image", "image_url"):
            image_inputs.append(fetch_image(info, client=client))
        elif kind == "video":
            frames, sample_fps = fetch_video(info, return_video_sample_fps=True, client=client)
            sample_fps_per_video.append(sample_fps)
            video_inputs.append(frames)
        else:
            raise ValueError("`image`, `image_url` or `video` type not found in content.")

    # Callers expect None (not []) when a modality is absent.
    if not image_inputs:
        image_inputs = None
    if not video_inputs:
        video_inputs = None

    if return_video_kwargs:
        return image_inputs, video_inputs, {'fps': sample_fps_per_video}
    return image_inputs, video_inputs
|
src/run_inference.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import re
|
| 3 |
+
import ast
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
import argparse
|
| 7 |
+
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 8 |
+
from my_vision_process import process_vision_info, client
|
| 9 |
+
|
| 10 |
+
# --- Prompt Definitions ---
# Prompt for intermediate perception passes: in addition to <think>/<answer>,
# the model must emit <glue> time spans that the next pass uses as key_time
# frame-sampling hints. "[QUESTION]" is substituted by the caller.
QA_THINK_GLUE = """Answer the question: "[QUESTION]" according to the content of the video.

Output your think process within the <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags. At the same time, in the <glue> </glue> tags, present the precise time period in seconds of the video clips on which you base your answer in the format of [(s1, e1), (s2, e2), ...]. For example: <think>...</think><answer>A</answer><glue>[(5.2, 10.4)]</glue>.
"""

# Prompt for the final perception pass: answer only, no <glue> spans requested.
QA_THINK = """Answer the question: "[QUESTION]" according to the content of the video.

Output your think process within the <think> </think> tags.

Then, provide your answer within the <answer> </answer> tags. For example: <think>...</think><answer>A</answer>.
"""
|
| 24 |
+
|
| 25 |
+
def setup_model():
    """Load the VideoChat-R1_5 model and its processor onto the GPU.

    Returns:
        tuple: ``(model, processor)`` — the model in eval mode on CUDA.
    """
    from importlib.util import find_spec

    model_path = "OpenGVLab/VideoChat-R1_5"
    print(f"Loading model from {model_path} onto GPU...")

    # Probe for flash-attn without importing the whole package (importing it
    # just to test availability is slow and can have side effects); fall back
    # to PyTorch's native scaled-dot-product attention otherwise.
    if find_spec("flash_attn") is not None:
        attn_implementation = "flash_attention_2"
        print("flash-attn is available, using 'flash_attention_2'.")
    else:
        print("flash-attn not installed. Falling back to 'sdpa' (PyTorch's native attention).")
        attn_implementation = "sdpa"

    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda",  # Explicitly load the model on the GPU
        attn_implementation=attn_implementation,
    ).eval()
    processor = AutoProcessor.from_pretrained(model_path)
    print("Model and processor loaded successfully onto GPU.")
    return model, processor
|
| 47 |
+
|
| 48 |
+
def inference(video_path, prompt, model, processor, max_new_tokens=2048, client=None, pred_glue=None):
    """Run a single inference pass on the model.

    Args:
        video_path: Path (or URI) of the video to analyze.
        prompt: Fully formatted user prompt text.
        model: Loaded Qwen3VL model (CUDA, eval mode).
        processor: Matching AutoProcessor.
        max_new_tokens: Generation token budget.
        client: Optional client forwarded to the vision pre-processor.
        pred_glue: Optional [(start, end), ...] key-time spans from a previous
            perception pass, used to focus frame sampling.

    Returns:
        str: The decoded completion (special tokens stripped).
    """
    messages = [
        {"role": "user", "content": [
            {"type": "video",
             "video": video_path,
             'key_time': pred_glue,
             "total_pixels": 128*12 * 28 * 28,
             "min_pixels": 128 * 28 * 28,
             },
            {"type": "text", "text": prompt},
        ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True, client=client)
    fps_inputs = video_kwargs['fps'][0]
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, fps=fps_inputs, padding=True, return_tensors="pt")
    # NOTE: this comprehension converts the processor's BatchFeature into a
    # plain dict, so fields must be read by key below, not by attribute.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)

    # BUGFIX: `inputs` is a plain dict after the device move above, so the
    # original attribute access `inputs.input_ids` raised AttributeError.
    # Slice off the prompt tokens to keep only the newly generated ones.
    generated_ids = [output_ids[i][len(inputs["input_ids"][i]):] for i in range(len(output_ids))]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]
|
| 74 |
+
|
| 75 |
+
def main(args):
    """Main function to orchestrate the multi-perception inference process.

    Runs ``args.num_perceptions`` passes over the video. Every pass except the
    last uses the glue-requesting prompt; the <glue> spans parsed from one
    pass's output are fed to the next pass as ``pred_glue`` so frame sampling
    can focus on the relevant clip. The final pass answers directly.
    """
    # Hard requirement: setup_model() loads with device_map="cuda".
    if not torch.cuda.is_available():
        print("Error: CUDA is not available. This script requires a GPU to run.", file=sys.stderr)
        sys.exit(1)
    print("CUDA is available. Proceeding with GPU setup.")

    if not os.path.exists(args.video_path):
        print(f"Error: Video file not found at '{args.video_path}'", file=sys.stderr)
        sys.exit(1)

    model, processor = setup_model()

    answers = []
    pred_glue = None  # key-time spans carried between perception iterations

    print(f"\nStarting inference for video: '{args.video_path}'")
    print(f"Question: '{args.question}'")
    print(f"Number of perception iterations: {args.num_perceptions}\n")

    for perception in range(args.num_perceptions):
        print(f"--- Perception Iteration {perception + 1}/{args.num_perceptions} ---")

        # All but the final iteration ask for <glue> spans; the last asks for
        # the answer only.
        if perception < args.num_perceptions - 1:
            current_prompt = QA_THINK_GLUE.replace("[QUESTION]", args.question)
        else:
            current_prompt = QA_THINK.replace("[QUESTION]", args.question)

        ans = inference(
            args.video_path, current_prompt, model, processor,
            client=client, pred_glue=pred_glue
        )

        print(f"Model Raw Output: {ans}")
        answers.append(ans)

        # Extract the <glue>[(s1, e1), ...]</glue> spans (if present) to seed
        # the next iteration; any parse failure simply disables the hint.
        pred_glue = None
        try:
            pattern_glue = r'<glue>(.*?)</glue>'
            match_glue = re.search(pattern_glue, ans, re.DOTALL)
            if match_glue:
                glue_str = match_glue.group(1).strip()
                # literal_eval safely parses the "[(s1, e1), ...]" list form.
                pred_glue = ast.literal_eval(glue_str)
                print(f"Found glue for next iteration: {pred_glue}\n")
            else:
                print("No glue found for next iteration.\n")
        except Exception as e:
            print(f"Could not parse glue from output: {e}\n")
            pred_glue = None

    print("\n--- Final Answer ---")
    final_answer = answers[-1] if answers else "No answer was generated."
    print(final_answer)
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
    # CLI entry point: positional video path + question, optional pass count.
    parser = argparse.ArgumentParser(description="Run video chat inference from the command line.")
    parser.add_argument("video_path", type=str, help="Path to the video file.")
    parser.add_argument("question", type=str, help="Question to ask about the video.")
    parser.add_argument("--num_perceptions", type=int, default=3, help="Number of perception iterations to run.")
    args = parser.parse_args()
    main(args)
|
src/toon_parser.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import logging
|
| 3 |
+
import csv
|
| 4 |
+
from io import StringIO
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
def parse_toon_line(line_def, data_line):
    """
    Parse a single TOON data row into a dict keyed by the block's headers.

    Uses the csv module so quoted text fields may contain commas. Numeric
    fields written as '9/10' or '(9)' are normalized to the bare digit.
    Returns {} for blank input or on any parsing error.
    """
    if not data_line or data_line.isspace():
        return {}

    try:
        # csv handles CSV-style quoting; skipinitialspace trims after commas.
        try:
            fields = next(csv.reader(StringIO(data_line), skipinitialspace=True))
        except StopIteration:
            fields = []

        normalized = []
        for raw in fields:
            token = raw.strip()
            # Drop decorative parentheses: (9) -> 9
            token = token.replace('(', '').replace(')', '')
            # Fractional scores: 9/10 -> 9 (only when the numerator is numeric)
            if '/' in token and any(ch.isdigit() for ch in token):
                numerator = token.split('/')[0].strip()
                if numerator.isdigit():
                    token = numerator
            normalized.append(token)

        headers = line_def.get('headers', [])

        # Pad short rows with empty strings; truncate rows longer than headers.
        if len(normalized) < len(headers):
            normalized = normalized + [""] * (len(headers) - len(normalized))
        else:
            normalized = normalized[:len(headers)]

        return dict(zip(headers, normalized))
    except Exception as e:
        logger.error(f"Error parsing TOON line '{data_line}': {e}")
        return {}
|
| 50 |
+
|
| 51 |
+
def fuzzy_extract_scores(text: str) -> dict:
    """
    Fallback score extraction: scan free text for known metric labels followed
    closely by a 0-10 number. Handles "Visual: 9", "Visual - 9", "Visual: 9/10",
    "Accuracy: 9/10". Unmatched metrics stay at '0'.
    """
    results = {
        'visual': '0', 'audio': '0', 'source': '0', 'logic': '0', 'emotion': '0',
        'video_audio': '0', 'video_caption': '0', 'audio_caption': '0'
    }

    # (label regex fragment, target score key) — earlier entries win because a
    # key is only filled while it is still '0'.
    probes = [
        ('visual', 'visual'),
        ('visual.*?integrity', 'visual'),
        ('accuracy', 'visual'),  # Fallback
        ('audio', 'audio'),
        ('source', 'source'),
        ('logic', 'logic'),
        ('emotion', 'emotion'),
        (r'video.*?audio', 'video_audio'),
        (r'video.*?caption', 'video_caption'),
        (r'audio.*?caption', 'audio_caption')
    ]

    for label_fragment, target in probes:
        # Case-insensitive: label, any separator run (:, =, -, space, '('),
        # then a single digit or 10, optionally suffixed with '/10'.
        rx = re.compile(fr'(?i){label_fragment}.*?[:=\-\s\(]+(\b10\b|\b\d\b)(?:/10)?')
        hit = rx.search(text)
        if hit and results[target] == '0':
            results[target] = hit.group(1)

    return results
|
| 83 |
+
|
| 84 |
+
def parse_veracity_toon(text: str) -> dict:
    """
    Parses the Veracity Vector TOON output into a standardized dictionary.
    Handles "Simple" (one-row dict) and "Reasoning" (multi-row list) schemas,
    plus the newer "Modalities" blocks. Robust against Markdown formatting
    artifacts and nested reports; falls back to fuzzy regex extraction when
    structured score blocks are missing or all-zero.
    """
    if not text:
        return {}

    # 1. Cleanup: strip Markdown code fences the model may wrap output in.
    text = re.sub(r'```\w*', '', text)
    text = re.sub(r'```', '', text)
    text = text.strip()

    parsed_sections = {}

    # 2. Robust Regex for TOON Block Headers
    # Matches: key : type [ count ] { headers } :
    # (type and [count] are both optional in the model's output)
    block_pattern = re.compile(
        r'([a-zA-Z0-9_]+)\s*:\s*(?:\w+\s*)?(?:\[\s*(\d+)\s*\])?\s*\{\s*(.*?)\s*\}\s*:\s*',
        re.MULTILINE
    )

    matches = list(block_pattern.finditer(text))

    for i, match in enumerate(matches):
        key = match.group(1).lower()
        # Default to 1 if count is missing
        count = int(match.group(2)) if match.group(2) else 1
        headers_str = match.group(3)
        headers = [h.strip().lower() for h in headers_str.split(',')]

        start_idx = match.end()
        # End at next match or end of text
        end_idx = matches[i+1].start() if i + 1 < len(matches) else len(text)
        block_content = text[start_idx:end_idx].strip()

        lines = [line.strip() for line in block_content.splitlines() if line.strip()]

        data_items = []
        # Drop single-character noise lines before consuming `count` rows.
        valid_lines = [l for l in lines if len(l) > 1]

        for line in valid_lines[:count]:
            item = parse_toon_line({'key': key, 'headers': headers}, line)
            data_items.append(item)

        # count == 1 blocks collapse to a single dict ("Simple" schema);
        # multi-row blocks stay as lists ("Reasoning" schema).
        if count == 1 and data_items:
            parsed_sections[key] = data_items[0]
        else:
            parsed_sections[key] = data_items

    # --- Flatten logic to standardized structure ---
    # All scores default to '0' so downstream code can rely on key presence.
    flat_result = {
        'veracity_vectors': {
            'visual_integrity_score': '0',
            'audio_integrity_score': '0',
            'source_credibility_score': '0',
            'logical_consistency_score': '0',
            'emotional_manipulation_score': '0'
        },
        'modalities': {
            'video_audio_score': '0',
            'video_caption_score': '0',
            'audio_caption_score': '0'
        },
        'video_context_summary': '',
        'political_bias': {},
        'criticism_level': {},
        'sentiment_and_bias': '',
        'tags': [],
        'factuality_factors': {},
        'disinformation_analysis': {},
        'final_assessment': {}
    }

    # Track whether structured parsing produced any non-zero score; if not,
    # the fuzzy fallback below takes over.
    got_vectors = False
    got_modalities = False

    # 1. Process 'vectors'
    vectors_data = parsed_sections.get('vectors', [])
    if isinstance(vectors_data, dict):  # Simple schema
        v = vectors_data
        if any(val and val != '0' for val in v.values()):
            if 'visual' in v: flat_result['veracity_vectors']['visual_integrity_score'] = v['visual']
            if 'audio' in v: flat_result['veracity_vectors']['audio_integrity_score'] = v['audio']
            if 'source' in v: flat_result['veracity_vectors']['source_credibility_score'] = v['source']
            if 'logic' in v: flat_result['veracity_vectors']['logical_consistency_score'] = v['logic']
            if 'emotion' in v: flat_result['veracity_vectors']['emotional_manipulation_score'] = v['emotion']
            got_vectors = True

    elif isinstance(vectors_data, list):  # Reasoning schema
        for item in vectors_data:
            cat = item.get('category', '').lower()
            score = item.get('score', '0')
            if score and score != '0':
                got_vectors = True
                if 'visual' in cat: flat_result['veracity_vectors']['visual_integrity_score'] = score
                elif 'audio' in cat: flat_result['veracity_vectors']['audio_integrity_score'] = score
                elif 'source' in cat: flat_result['veracity_vectors']['source_credibility_score'] = score
                elif 'logic' in cat: flat_result['veracity_vectors']['logical_consistency_score'] = score
                elif 'emotion' in cat: flat_result['veracity_vectors']['emotional_manipulation_score'] = score

    # 2. Process 'modalities'
    modalities_data = parsed_sections.get('modalities', [])
    if isinstance(modalities_data, dict):  # Simple schema
        m = modalities_data
        for k, v in m.items():
            # Normalize header spelling ("video-audio", "video_audio", ...)
            k_clean = k.lower().replace(' ', '').replace('-', '').replace('_', '')
            if 'videoaudio' in k_clean: flat_result['modalities']['video_audio_score'] = v
            elif 'videocaption' in k_clean: flat_result['modalities']['video_caption_score'] = v
            elif 'audiocaption' in k_clean: flat_result['modalities']['audio_caption_score'] = v
            if v and v != '0': got_modalities = True

    elif isinstance(modalities_data, list):  # Reasoning schema
        for item in modalities_data:
            cat = item.get('category', '').lower().replace(' ', '').replace('-', '').replace('_', '')
            score = item.get('score', '0')
            if score and score != '0':
                got_modalities = True
                if 'videoaudio' in cat: flat_result['modalities']['video_audio_score'] = score
                elif 'videocaption' in cat: flat_result['modalities']['video_caption_score'] = score
                elif 'audiocaption' in cat: flat_result['modalities']['audio_caption_score'] = score

    # --- FUZZY FALLBACK ---
    # Structured parsing yielded nothing usable; scan raw text for scores.
    if not got_vectors or not got_modalities:
        fuzzy_scores = fuzzy_extract_scores(text)
        if not got_vectors:
            flat_result['veracity_vectors']['visual_integrity_score'] = fuzzy_scores['visual']
            flat_result['veracity_vectors']['audio_integrity_score'] = fuzzy_scores['audio']
            flat_result['veracity_vectors']['source_credibility_score'] = fuzzy_scores['source']
            flat_result['veracity_vectors']['logical_consistency_score'] = fuzzy_scores['logic']
            flat_result['veracity_vectors']['emotional_manipulation_score'] = fuzzy_scores['emotion']
        if not got_modalities:
            flat_result['modalities']['video_audio_score'] = fuzzy_scores['video_audio']
            flat_result['modalities']['video_caption_score'] = fuzzy_scores['video_caption']
            flat_result['modalities']['audio_caption_score'] = fuzzy_scores['audio_caption']

    # 3. Factuality (lists collapse to their first row throughout 3-10)
    f = parsed_sections.get('factuality', {})
    if isinstance(f, list): f = f[0] if f else {}
    flat_result['factuality_factors'] = {
        'claim_accuracy': f.get('accuracy', 'Unverifiable'),
        'evidence_gap': f.get('gap', ''),
        'grounding_check': f.get('grounding', '')
    }

    # 4. Disinfo
    d = parsed_sections.get('disinfo', {})
    if isinstance(d, list): d = d[0] if d else {}
    flat_result['disinformation_analysis'] = {
        'classification': d.get('class', 'None'),
        'intent': d.get('intent', 'None'),
        'threat_vector': d.get('threat', 'None')
    }

    # 5. Final Assessment
    fn = parsed_sections.get('final', {})
    if isinstance(fn, list): fn = fn[0] if fn else {}
    flat_result['final_assessment'] = {
        'veracity_score_total': fn.get('score', '0'),
        'reasoning': fn.get('reasoning', '')
    }

    # 6. Tags (New) — comma-separated keyword string split into a list
    t = parsed_sections.get('tags', {})
    if isinstance(t, list): t = t[0] if t else {}
    raw_tags = t.get('keywords', '')
    if raw_tags:
        flat_result['tags'] = [x.strip() for x in raw_tags.split(',')]

    # 7. Summary
    s = parsed_sections.get('summary', {})
    if isinstance(s, list): s = s[0] if s else {}
    flat_result['video_context_summary'] = s.get('text', '')

    # 8. Political Bias (New)
    pb = parsed_sections.get('political_bias', {})
    if isinstance(pb, list): pb = pb[0] if pb else {}
    flat_result['political_bias'] = {
        'score': pb.get('score', '0'),
        'reasoning': pb.get('reasoning', '')
    }

    # 9. Criticism Level (New)
    cl = parsed_sections.get('criticism_level', {})
    if isinstance(cl, list): cl = cl[0] if cl else {}
    flat_result['criticism_level'] = {
        'score': cl.get('score', '0'),
        'reasoning': cl.get('reasoning', '')
    }

    # 10. Sentiment and Bias (New)
    sb = parsed_sections.get('sentiment_and_bias', {})
    if isinstance(sb, list): sb = sb[0] if sb else {}
    flat_result['sentiment_and_bias'] = sb.get('text', '')

    # Expose the raw section map for debugging / downstream inspection.
    flat_result['raw_parsed_structure'] = parsed_sections

    return flat_result
|
src/transcription.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import whisper
|
| 2 |
+
import logging
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# When LITE_MODE=true, heavy model loading is skipped (constrained deployments).
LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"

logger = logging.getLogger(__name__)
# Lazily populated module-level Whisper model; stays None until load_model()
# succeeds (or forever, in LITE_MODE / on load failure).
transcription_model = None
|
| 10 |
+
|
| 11 |
+
def load_model():
    """Load the Whisper transcription model into the module-level cache.

    No-op in LITE_MODE or when the model is already loaded. On failure the
    cache stays None and the app continues without transcription support.
    """
    global transcription_model

    if LITE_MODE:
        logger.info("LITE_MODE is enabled. Skipping Whisper model loading.")
        return
    if transcription_model is not None:
        return  # already loaded

    try:
        # Using 'base.en' for a good balance of speed and accuracy with English
        # videos. For multilingual support, 'base' could be used.
        logger.info("Loading 'base.en' Whisper model for transcription...")
        transcription_model = whisper.load_model("base.en")
        logger.info("Whisper model loaded successfully.")
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
        # The app can continue without transcription, but we should log the error.
        transcription_model = None
|
| 29 |
+
|
| 30 |
+
def generate_transcript(audio_path_str: str) -> str:
    """
    Transcribes the given audio file and saves the transcript as a .vtt file.
    Returns the path to the generated .vtt file, or None when the model is
    unavailable or transcription fails.
    """
    if transcription_model is None:
        logger.warning("Transcription model is not available. Cannot generate transcript.")
        return None

    try:
        audio_path = Path(audio_path_str)
        logger.info(f"Starting transcription for: {audio_path.name}")

        # The result object contains the transcript in various formats.
        result = transcription_model.transcribe(audio_path_str, verbose=False)

        # Define the output VTT path (same directory and stem as the audio).
        vtt_path = audio_path.with_suffix('.vtt')

        # Use whisper's built-in VTT writer; local import defers whisper.utils
        # until a transcript is actually requested.
        from whisper.utils import get_writer
        writer = get_writer("vtt", str(vtt_path.parent))
        writer(result, str(audio_path.name))

        logger.info(f"Transcription complete. VTT file saved to: {vtt_path}")
        return str(vtt_path)

    except Exception as e:
        logger.error(f"An error occurred during transcription for {audio_path_str}: {e}", exc_info=True)
        return None
|
start.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash

# 1. Start Python FastAPI in the background
# We set PYTHONPATH to include the /app/src directory so imports work correctly
# We run from /app so that data/ and videos/ directories are created in the root volume mount
echo "Starting Python Inference Engine..."
export PYTHONPATH=$PYTHONPATH:/app/src
# Bind to loopback only: the Go server in front is the public entry point.
python -m uvicorn src.app:app --host 127.0.0.1 --port 8001 &

# Wait for Python to initialize
sleep 5

# 2. Start Golang Web Server (foreground — keeps the container alive)
echo "Starting Go Web Server..."
# FIXED: Run the binary from the system path
/usr/local/bin/vchat-server
|