Commit ·
7b1d1ea
1
Parent(s): bb1cad5
wow
Browse files- .gitattributes +0 -35
- Dockerfile +0 -89
- README.md +0 -10
- frontend/src/App.tsx +2 -2
- main.go +0 -60
- requirements.txt +0 -42
- src/app.py +0 -808
- src/factuality_logic.py +0 -143
- src/inference_logic.py +0 -303
- src/labeling_logic.py +0 -145
- src/my_vision_process.py +0 -17
- src/toon_parser.py +0 -220
- src/transcription.py +0 -48
- start.sh +0 -23
.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
# ==========================================
|
| 2 |
-
# Stage 1: Build Frontend (React/TS/Vite)
|
| 3 |
-
# ==========================================
|
| 4 |
-
FROM node:20-slim AS frontend-builder
|
| 5 |
-
WORKDIR /app/frontend
|
| 6 |
-
|
| 7 |
-
# Copy frontend definitions
|
| 8 |
-
COPY frontend/package.json frontend/package-lock.json* ./
|
| 9 |
-
RUN npm install
|
| 10 |
-
|
| 11 |
-
# Copy source and build
|
| 12 |
-
COPY frontend/ ./
|
| 13 |
-
RUN npm run build
|
| 14 |
-
|
| 15 |
-
# ==========================================
|
| 16 |
-
# Stage 2: Build Backend (Golang)
|
| 17 |
-
# ==========================================
|
| 18 |
-
FROM golang:1.23 AS backend-builder
|
| 19 |
-
WORKDIR /app/backend
|
| 20 |
-
|
| 21 |
-
# Copy Go source
|
| 22 |
-
COPY main.go .
|
| 23 |
-
|
| 24 |
-
# Build static binary
|
| 25 |
-
RUN go mod init vchat-server && \
|
| 26 |
-
go mod tidy && \
|
| 27 |
-
CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o vchat-server main.go
|
| 28 |
-
|
| 29 |
-
# ==========================================
|
| 30 |
-
# Stage 3: Final Runtime (Hugging Face Space)
|
| 31 |
-
# ==========================================
|
| 32 |
-
FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-runtime
|
| 33 |
-
|
| 34 |
-
# Default to LITE_MODE=true for HF Spaces (API Only)
|
| 35 |
-
ENV PYTHONUNBUFFERED=1 \
|
| 36 |
-
DEBIAN_FRONTEND=noninteractive \
|
| 37 |
-
LITE_MODE=true \
|
| 38 |
-
PATH="/home/user/.local/bin:$PATH" \
|
| 39 |
-
PIP_NO_CACHE_DIR=1
|
| 40 |
-
|
| 41 |
-
# Create a non-root user (Required for HF Spaces)
|
| 42 |
-
RUN useradd -m -u 1000 user
|
| 43 |
-
|
| 44 |
-
WORKDIR /app
|
| 45 |
-
|
| 46 |
-
# 1. Fix FFmpeg Conflict (Critical Step)
|
| 47 |
-
RUN conda uninstall -y ffmpeg || true
|
| 48 |
-
|
| 49 |
-
# 2. Install System Dependencies
|
| 50 |
-
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 51 |
-
ffmpeg \
|
| 52 |
-
git \
|
| 53 |
-
curl \
|
| 54 |
-
gnupg \
|
| 55 |
-
ca-certificates \
|
| 56 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 57 |
-
|
| 58 |
-
# 3. Install Python Dependencies
|
| 59 |
-
RUN pip install uv
|
| 60 |
-
COPY requirements.txt ./
|
| 61 |
-
RUN uv pip install --system -r requirements.txt
|
| 62 |
-
# Explicitly force latest yt-dlp to handle Twitter/X API changes
|
| 63 |
-
RUN uv pip install --system --upgrade "yt-dlp[default]"
|
| 64 |
-
|
| 65 |
-
# 4. Copy Python Application Code
|
| 66 |
-
COPY --chown=user src/ ./src/
|
| 67 |
-
|
| 68 |
-
# 5. Install Built Artifacts
|
| 69 |
-
COPY --from=backend-builder --chown=user /app/backend/vchat-server /app/vchat-server
|
| 70 |
-
RUN mkdir -p /app/static
|
| 71 |
-
COPY --from=frontend-builder --chown=user /app/frontend/dist /app/static
|
| 72 |
-
|
| 73 |
-
# 6. Setup Directories and Permissions
|
| 74 |
-
RUN mkdir -p /app/data /app/data/videos /app/data/labels /app/data/prompts /app/data/responses /app/metadata \
|
| 75 |
-
&& chown -R user:user /app/data /app/metadata
|
| 76 |
-
|
| 77 |
-
# 7. Setup Entrypoint
|
| 78 |
-
COPY --chown=user start.sh /app/start.sh
|
| 79 |
-
RUN sed -i 's/\r$//' /app/start.sh && \
|
| 80 |
-
chmod +x /app/start.sh
|
| 81 |
-
|
| 82 |
-
# Switch to non-root user
|
| 83 |
-
USER user
|
| 84 |
-
|
| 85 |
-
# Expose the HF Space port
|
| 86 |
-
EXPOSE 7860
|
| 87 |
-
|
| 88 |
-
# Run the Orchestrator
|
| 89 |
-
CMD ["/app/start.sh"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
DELETED
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: VFacts
|
| 3 |
-
emoji: 😻
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: gray
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
---
|
| 9 |
-
|
| 10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
frontend/src/App.tsx
CHANGED
|
@@ -143,7 +143,7 @@ function App() {
|
|
| 143 |
<div className="w-8 h-8 rounded-lg bg-indigo-600 flex items-center justify-center">
|
| 144 |
<Bot className="w-5 h-5 text-white" />
|
| 145 |
</div>
|
| 146 |
-
<h1 className="text-sm font-bold text-white">vChat <span className="text-slate-500">
|
| 147 |
</div>
|
| 148 |
</div>
|
| 149 |
|
|
@@ -322,7 +322,7 @@ function App() {
|
|
| 322 |
</h2>
|
| 323 |
<p className="text-slate-400 text-sm">
|
| 324 |
Switch to the <strong>Queue</strong> tab, upload your CSV file, and click <strong>Start Batch</strong>.
|
| 325 |
-
The system will download videos
|
| 326 |
</p>
|
| 327 |
</section>
|
| 328 |
</div>
|
|
|
|
| 143 |
<div className="w-8 h-8 rounded-lg bg-indigo-600 flex items-center justify-center">
|
| 144 |
<Bot className="w-5 h-5 text-white" />
|
| 145 |
</div>
|
| 146 |
+
<h1 className="text-sm font-bold text-white">vChat <span className="text-slate-500">API Lite</span></h1>
|
| 147 |
</div>
|
| 148 |
</div>
|
| 149 |
|
|
|
|
| 322 |
</h2>
|
| 323 |
<p className="text-slate-400 text-sm">
|
| 324 |
Switch to the <strong>Queue</strong> tab, upload your CSV file, and click <strong>Start Batch</strong>.
|
| 325 |
+
The system will download videos and run the selected AI model to generate factuality labels natively.
|
| 326 |
</p>
|
| 327 |
</section>
|
| 328 |
</div>
|
main.go
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
package main
|
| 2 |
-
|
| 3 |
-
import (
|
| 4 |
-
"log"
|
| 5 |
-
"net/http"
|
| 6 |
-
"net/http/httputil"
|
| 7 |
-
"net/url"
|
| 8 |
-
"os"
|
| 9 |
-
"strings"
|
| 10 |
-
)
|
| 11 |
-
|
| 12 |
-
func main() {
|
| 13 |
-
// Target Python FastAPI server (running locally in the container)
|
| 14 |
-
pythonTarget := "http://127.0.0.1:8001"
|
| 15 |
-
pythonURL, err := url.Parse(pythonTarget)
|
| 16 |
-
if err != nil {
|
| 17 |
-
log.Fatalf("Invalid Python target URL: %v", err)
|
| 18 |
-
}
|
| 19 |
-
|
| 20 |
-
// Create Reverse Proxy
|
| 21 |
-
proxy := httputil.NewSingleHostReverseProxy(pythonURL)
|
| 22 |
-
|
| 23 |
-
// HF Spaces: Files are copied to /app/static in Dockerfile
|
| 24 |
-
staticPath := "/app/static"
|
| 25 |
-
fs := http.FileServer(http.Dir(staticPath))
|
| 26 |
-
|
| 27 |
-
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
|
| 28 |
-
// Proxy API requests to Python
|
| 29 |
-
if strings.HasPrefix(r.URL.Path, "/process") ||
|
| 30 |
-
strings.HasPrefix(r.URL.Path, "/label_video") ||
|
| 31 |
-
strings.HasPrefix(r.URL.Path, "/batch_label") ||
|
| 32 |
-
strings.HasPrefix(r.URL.Path, "/model-architecture") ||
|
| 33 |
-
strings.HasPrefix(r.URL.Path, "/download-dataset") ||
|
| 34 |
-
strings.HasPrefix(r.URL.Path, "/extension") ||
|
| 35 |
-
strings.HasPrefix(r.URL.Path, "/manage") ||
|
| 36 |
-
strings.HasPrefix(r.URL.Path, "/queue") {
|
| 37 |
-
|
| 38 |
-
log.Printf("Proxying %s to Python Backend...", r.URL.Path)
|
| 39 |
-
proxy.ServeHTTP(w, r)
|
| 40 |
-
return
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
// Check if file exists in static dir, otherwise serve index.html (SPA Routing)
|
| 44 |
-
path := staticPath + r.URL.Path
|
| 45 |
-
if _, err := os.Stat(path); os.IsNotExist(err) {
|
| 46 |
-
http.ServeFile(w, r, staticPath+"/index.html")
|
| 47 |
-
return
|
| 48 |
-
}
|
| 49 |
-
|
| 50 |
-
fs.ServeHTTP(w, r)
|
| 51 |
-
})
|
| 52 |
-
|
| 53 |
-
// HF Spaces requires listening on port 7860
|
| 54 |
-
port := "7860"
|
| 55 |
-
log.Printf("vChat HF Server listening on port %s", port)
|
| 56 |
-
log.Printf("Serving static files from %s", staticPath)
|
| 57 |
-
if err := http.ListenAndServe(":"+port, nil); err != nil {
|
| 58 |
-
log.Fatal(err)
|
| 59 |
-
}
|
| 60 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
torch
|
| 2 |
-
torchvision
|
| 3 |
-
torchaudio
|
| 4 |
-
# --- Core Server ---
|
| 5 |
-
fastapi
|
| 6 |
-
uvicorn[standard]
|
| 7 |
-
python-multipart
|
| 8 |
-
requests
|
| 9 |
-
aiofiles
|
| 10 |
-
jinja2
|
| 11 |
-
python-dotenv
|
| 12 |
-
|
| 13 |
-
# --- AI & Vision Processing ---
|
| 14 |
-
transformers
|
| 15 |
-
accelerate
|
| 16 |
-
Pillow
|
| 17 |
-
packaging
|
| 18 |
-
av
|
| 19 |
-
# Use headless to avoid installing X11/GL libraries in Docker
|
| 20 |
-
opencv-python-headless
|
| 21 |
-
decord
|
| 22 |
-
imageio
|
| 23 |
-
numpy
|
| 24 |
-
einops
|
| 25 |
-
|
| 26 |
-
# --- Google Cloud & APIs ---
|
| 27 |
-
google-generativeai>=0.4.0
|
| 28 |
-
google-cloud-aiplatform
|
| 29 |
-
google-genai
|
| 30 |
-
mlcroissant
|
| 31 |
-
|
| 32 |
-
# --- Fine-Tuning (LoRA/QLoRA) ---
|
| 33 |
-
peft
|
| 34 |
-
bitsandbytes
|
| 35 |
-
trl
|
| 36 |
-
datasets
|
| 37 |
-
|
| 38 |
-
# --- Audio ---
|
| 39 |
-
openai-whisper
|
| 40 |
-
# FORCE LATEST YT-DLP (Often required for X/Twitter)
|
| 41 |
-
yt-dlp>=2024.11.18
|
| 42 |
-
ffmpeg-python
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/app.py
DELETED
|
@@ -1,808 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
import asyncio
|
| 4 |
-
import subprocess
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
import logging
|
| 7 |
-
import csv
|
| 8 |
-
import io
|
| 9 |
-
import datetime
|
| 10 |
-
import json
|
| 11 |
-
import hashlib
|
| 12 |
-
import re
|
| 13 |
-
import glob
|
| 14 |
-
import shutil
|
| 15 |
-
import time
|
| 16 |
-
from fastapi import FastAPI, Request, Form, UploadFile, File, Body, HTTPException
|
| 17 |
-
from fastapi.responses import HTMLResponse, StreamingResponse, PlainTextResponse, Response, FileResponse, JSONResponse
|
| 18 |
-
from fastapi.templating import Jinja2Templates
|
| 19 |
-
from fastapi.staticfiles import StaticFiles
|
| 20 |
-
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
-
import yt_dlp
|
| 22 |
-
import inference_logic
|
| 23 |
-
import factuality_logic
|
| 24 |
-
import transcription
|
| 25 |
-
from factuality_logic import parse_vtt
|
| 26 |
-
from toon_parser import parse_veracity_toon
|
| 27 |
-
|
| 28 |
-
try:
|
| 29 |
-
import mlcroissant as mlc
|
| 30 |
-
CROISSANT_AVAILABLE = True
|
| 31 |
-
except ImportError:
|
| 32 |
-
try:
|
| 33 |
-
import croissant as mlc
|
| 34 |
-
CROISSANT_AVAILABLE = True
|
| 35 |
-
except ImportError:
|
| 36 |
-
mlc = None
|
| 37 |
-
CROISSANT_AVAILABLE = False
|
| 38 |
-
|
| 39 |
-
# Configure Logging with High Verbosity
|
| 40 |
-
logging.basicConfig(
|
| 41 |
-
level=logging.INFO,
|
| 42 |
-
format="%(asctime)s - %(levelname)s - %(message)s",
|
| 43 |
-
handlers=[logging.StreamHandler(sys.stdout)]
|
| 44 |
-
)
|
| 45 |
-
logger = logging.getLogger("vChat")
|
| 46 |
-
|
| 47 |
-
LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"
|
| 48 |
-
|
| 49 |
-
app = FastAPI()
|
| 50 |
-
|
| 51 |
-
app.add_middleware(
|
| 52 |
-
CORSMiddleware,
|
| 53 |
-
allow_origins=["*"],
|
| 54 |
-
allow_credentials=True,
|
| 55 |
-
allow_methods=["*"],
|
| 56 |
-
allow_headers=["*"],
|
| 57 |
-
)
|
| 58 |
-
|
| 59 |
-
# HF Spaces specific path
|
| 60 |
-
STATIC_DIR = "/app/static"
|
| 61 |
-
if not os.path.isdir(STATIC_DIR):
|
| 62 |
-
# Fallback if running locally
|
| 63 |
-
STATIC_DIR = "static"
|
| 64 |
-
os.makedirs(STATIC_DIR, exist_ok=True)
|
| 65 |
-
|
| 66 |
-
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
|
| 67 |
-
templates = Jinja2Templates(directory=STATIC_DIR)
|
| 68 |
-
|
| 69 |
-
# Ensure data directories exist (HF Spaces writable locations)
|
| 70 |
-
os.makedirs("data/videos", exist_ok=True)
|
| 71 |
-
os.makedirs("data", exist_ok=True)
|
| 72 |
-
os.makedirs("data/labels", exist_ok=True)
|
| 73 |
-
os.makedirs("data/prompts", exist_ok=True)
|
| 74 |
-
os.makedirs("data/responses", exist_ok=True)
|
| 75 |
-
os.makedirs("metadata", exist_ok=True)
|
| 76 |
-
|
| 77 |
-
STOP_QUEUE_SIGNAL = False
|
| 78 |
-
|
| 79 |
-
@app.on_event("startup")
|
| 80 |
-
async def startup_event():
|
| 81 |
-
logger.info("Application starting up...")
|
| 82 |
-
try:
|
| 83 |
-
transcription.load_model()
|
| 84 |
-
except Exception as e:
|
| 85 |
-
logger.warning(f"Could not load Whisper model: {e}")
|
| 86 |
-
|
| 87 |
-
if not LITE_MODE:
|
| 88 |
-
try:
|
| 89 |
-
inference_logic.load_models()
|
| 90 |
-
except Exception as e:
|
| 91 |
-
logger.fatal(f"Could not load local inference models. Error: {e}", exc_info=True)
|
| 92 |
-
else:
|
| 93 |
-
logger.info("Running in LITE mode (API Only).")
|
| 94 |
-
|
| 95 |
-
@app.get("/", response_class=HTMLResponse)
|
| 96 |
-
async def read_root(request: Request):
|
| 97 |
-
custom_model_available = False
|
| 98 |
-
if not LITE_MODE:
|
| 99 |
-
custom_model_available = inference_logic.peft_model is not None
|
| 100 |
-
if not (Path(STATIC_DIR) / "index.html").exists():
|
| 101 |
-
return HTMLResponse(content="Frontend not found.", status_code=404)
|
| 102 |
-
return templates.TemplateResponse("index.html", {
|
| 103 |
-
"request": request,
|
| 104 |
-
"custom_model_available": custom_model_available,
|
| 105 |
-
"lite_mode": LITE_MODE
|
| 106 |
-
})
|
| 107 |
-
|
| 108 |
-
@app.get("/model-architecture", response_class=PlainTextResponse)
|
| 109 |
-
async def get_model_architecture():
|
| 110 |
-
if LITE_MODE: return "Running in LITE mode."
|
| 111 |
-
if inference_logic.base_model: return str(inference_logic.base_model)
|
| 112 |
-
return "Model not loaded."
|
| 113 |
-
|
| 114 |
-
@app.get("/download-dataset")
|
| 115 |
-
async def download_dataset():
|
| 116 |
-
file_path = Path("data/dataset.csv")
|
| 117 |
-
if file_path.exists():
|
| 118 |
-
return FileResponse(path=file_path, filename="dataset.csv", media_type='text/csv')
|
| 119 |
-
return Response("Dataset not found.", status_code=404)
|
| 120 |
-
|
| 121 |
-
progress_message = ""
|
| 122 |
-
def progress_hook(d):
|
| 123 |
-
global progress_message
|
| 124 |
-
if d['status'] == 'downloading':
|
| 125 |
-
progress_message = f"Downloading: {d.get('_percent_str', 'N/A')} at {d.get('_speed_str', 'N/A')}\r"
|
| 126 |
-
elif d['status'] == 'finished':
|
| 127 |
-
progress_message = f"\nDownload finished. Preparing video assets...\n"
|
| 128 |
-
|
| 129 |
-
def get_cookies_path():
|
| 130 |
-
"""Look for cookies file in known locations for better yt-dlp support."""
|
| 131 |
-
candidates = ["cookies.txt", "data/cookies.txt", "/app/cookies.txt"]
|
| 132 |
-
for c in candidates:
|
| 133 |
-
if os.path.exists(c):
|
| 134 |
-
return os.path.abspath(c)
|
| 135 |
-
return None
|
| 136 |
-
|
| 137 |
-
async def run_subprocess_async(command: list[str]):
|
| 138 |
-
cmd_str = ' '.join(command)
|
| 139 |
-
logger.info(f"[Subprocess] Running: {cmd_str}")
|
| 140 |
-
process = await asyncio.create_subprocess_exec(*command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 141 |
-
stdout, stderr = await process.communicate()
|
| 142 |
-
|
| 143 |
-
if process.returncode != 0:
|
| 144 |
-
err_msg = stderr.decode()
|
| 145 |
-
logger.error(f"[Subprocess] Failed ({process.returncode}): {err_msg}")
|
| 146 |
-
raise RuntimeError(f"Command failed: {err_msg}")
|
| 147 |
-
logger.info(f"[Subprocess] Success.")
|
| 148 |
-
return stdout.decode()
|
| 149 |
-
|
| 150 |
-
def extract_tweet_id(url: str) -> str | None:
|
| 151 |
-
match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
|
| 152 |
-
if match: return match.group(1)
|
| 153 |
-
return None
|
| 154 |
-
|
| 155 |
-
def check_if_processed(link: str) -> bool:
|
| 156 |
-
target_id = extract_tweet_id(link)
|
| 157 |
-
link_clean = link.split('?')[0].strip().rstrip('/')
|
| 158 |
-
|
| 159 |
-
for filename in ["data/dataset.csv", "data/manual_dataset.csv"]:
|
| 160 |
-
path = Path(filename)
|
| 161 |
-
if not path.exists(): continue
|
| 162 |
-
try:
|
| 163 |
-
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
|
| 164 |
-
sample = f.read(2048)
|
| 165 |
-
f.seek(0)
|
| 166 |
-
try: has_header = csv.Sniffer().has_header(sample)
|
| 167 |
-
except: has_header = True
|
| 168 |
-
|
| 169 |
-
if has_header:
|
| 170 |
-
reader = csv.DictReader(f)
|
| 171 |
-
for row in reader:
|
| 172 |
-
row_link = row.get('link', '').split('?')[0].strip().rstrip('/')
|
| 173 |
-
if row_link == link_clean: return True
|
| 174 |
-
row_id = row.get('id', '')
|
| 175 |
-
if target_id and row_id == target_id: return True
|
| 176 |
-
else:
|
| 177 |
-
reader = csv.reader(f)
|
| 178 |
-
for row in reader:
|
| 179 |
-
if not row: continue
|
| 180 |
-
if link_clean in row: return True
|
| 181 |
-
if target_id and target_id in row: return True
|
| 182 |
-
except Exception:
|
| 183 |
-
continue
|
| 184 |
-
return False
|
| 185 |
-
|
| 186 |
-
async def prepare_video_assets_async(url: str) -> dict:
|
| 187 |
-
global progress_message
|
| 188 |
-
loop = asyncio.get_event_loop()
|
| 189 |
-
is_local = not (url.startswith("http://") or url.startswith("https://"))
|
| 190 |
-
video_id = "unknown"
|
| 191 |
-
transcript_path = None
|
| 192 |
-
|
| 193 |
-
logger.info(f"Preparing assets for URL: {url}")
|
| 194 |
-
|
| 195 |
-
if is_local:
|
| 196 |
-
original_path = Path(url)
|
| 197 |
-
if not original_path.exists(): raise FileNotFoundError(f"File not found: {url}")
|
| 198 |
-
video_id = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:16]
|
| 199 |
-
metadata = {"id": video_id, "link": url, "caption": original_path.stem}
|
| 200 |
-
else:
|
| 201 |
-
tweet_id = extract_tweet_id(url)
|
| 202 |
-
video_id = tweet_id if tweet_id else hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
|
| 203 |
-
sanitized_check = Path(f"data/videos/{video_id}_fixed.mp4")
|
| 204 |
-
|
| 205 |
-
cookies_path = get_cookies_path()
|
| 206 |
-
ydl_opts = {
|
| 207 |
-
'format': 'best[ext=mp4]/best',
|
| 208 |
-
'outtmpl': 'data/videos/%(id)s.%(ext)s',
|
| 209 |
-
'progress_hooks': [progress_hook],
|
| 210 |
-
'quiet': False,
|
| 211 |
-
'no_warnings': False,
|
| 212 |
-
'noplaylist': True,
|
| 213 |
-
'no_overwrites': True,
|
| 214 |
-
'writesubtitles': True,
|
| 215 |
-
'writeautomaticsub': True,
|
| 216 |
-
'subtitleslangs': ['en'],
|
| 217 |
-
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 218 |
-
}
|
| 219 |
-
|
| 220 |
-
if cookies_path:
|
| 221 |
-
ydl_opts['cookiefile'] = cookies_path
|
| 222 |
-
logger.info(f"Using cookies from {cookies_path}")
|
| 223 |
-
|
| 224 |
-
if sanitized_check.exists():
|
| 225 |
-
logger.info(f"Video {video_id} already cached at {sanitized_check}")
|
| 226 |
-
original_path = sanitized_check
|
| 227 |
-
metadata = {"id": video_id, "link": url, "caption": "Cached Video"}
|
| 228 |
-
else:
|
| 229 |
-
try:
|
| 230 |
-
logger.info(f"Starting yt-dlp download for {video_id}...")
|
| 231 |
-
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 232 |
-
info = await loop.run_in_executor(None, lambda: ydl.extract_info(url, download=True))
|
| 233 |
-
original_path = Path(ydl.prepare_filename(info))
|
| 234 |
-
metadata = {
|
| 235 |
-
"id": info.get("id", video_id), "link": info.get("webpage_url", url),
|
| 236 |
-
"caption": info.get("description", info.get("title", "N/A")).encode('ascii', 'ignore').decode('ascii').strip()[:500],
|
| 237 |
-
"postdatetime": info.get("upload_date", "N/A")
|
| 238 |
-
}
|
| 239 |
-
video_id = info.get("id", video_id)
|
| 240 |
-
logger.info("yt-dlp download successful.")
|
| 241 |
-
except yt_dlp.utils.DownloadError as e:
|
| 242 |
-
logger.error(f"yt-dlp download error: {e}")
|
| 243 |
-
if "No video could be found" in str(e):
|
| 244 |
-
raise ValueError(f"No video content found at {url}")
|
| 245 |
-
raise RuntimeError(f"Download failed: {str(e)}")
|
| 246 |
-
except Exception as e:
|
| 247 |
-
logger.error(f"Unexpected yt-dlp error: {e}")
|
| 248 |
-
raise RuntimeError(f"Download failed: {str(e)}")
|
| 249 |
-
|
| 250 |
-
transcript_path = next(Path("data/videos").glob(f"{video_id}*.en.vtt"), None)
|
| 251 |
-
if not transcript_path: transcript_path = next(Path("data/videos").glob(f"{video_id}*.vtt"), None)
|
| 252 |
-
|
| 253 |
-
sanitized_path = Path(f"data/videos/{video_id}_fixed.mp4")
|
| 254 |
-
|
| 255 |
-
# --- FFmpeg Sanitization Logic with Robust Fallback ---
|
| 256 |
-
if not sanitized_path.exists() and original_path.exists():
|
| 257 |
-
logger.info(f"Sanitizing video {video_id} (Original: {original_path})...")
|
| 258 |
-
ffmpeg_bin = shutil.which('ffmpeg')
|
| 259 |
-
if not ffmpeg_bin: raise RuntimeError("FFmpeg binary not found in system path!")
|
| 260 |
-
|
| 261 |
-
try:
|
| 262 |
-
await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", "-y", str(sanitized_path)])
|
| 263 |
-
logger.info("Sanitization (re-encode) successful.")
|
| 264 |
-
except Exception as e:
|
| 265 |
-
logger.warning(f"Re-encode failed ({e}). Attempting Stream Copy...")
|
| 266 |
-
try:
|
| 267 |
-
await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c", "copy", "-y", str(sanitized_path)])
|
| 268 |
-
logger.info("Sanitization (copy) successful.")
|
| 269 |
-
except Exception as e2:
|
| 270 |
-
logger.error(f"Sanitization failed completely: {e2}")
|
| 271 |
-
if original_path.suffix == '.mp4':
|
| 272 |
-
logger.warning("Using original file as sanitized file.")
|
| 273 |
-
shutil.copy(original_path, sanitized_path)
|
| 274 |
-
else:
|
| 275 |
-
raise RuntimeError("Could not produce a valid MP4 file.")
|
| 276 |
-
|
| 277 |
-
# --- Audio Extraction ---
|
| 278 |
-
audio_path = sanitized_path.with_suffix('.wav')
|
| 279 |
-
if not audio_path.exists() and sanitized_path.exists():
|
| 280 |
-
logger.info(f"Extracting audio to {audio_path}...")
|
| 281 |
-
try:
|
| 282 |
-
await run_subprocess_async(["ffmpeg", "-i", str(sanitized_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", str(audio_path)])
|
| 283 |
-
logger.info("Audio extraction successful.")
|
| 284 |
-
except Exception as e:
|
| 285 |
-
logger.error(f"Audio extraction failed: {e}")
|
| 286 |
-
|
| 287 |
-
# --- Transcription ---
|
| 288 |
-
if not transcript_path and audio_path.exists() and transcription.transcription_model is not None:
|
| 289 |
-
logger.info("Generating transcript via Whisper...")
|
| 290 |
-
transcript_path = await loop.run_in_executor(None, transcription.generate_transcript, str(audio_path))
|
| 291 |
-
elif not transcript_path:
|
| 292 |
-
logger.info("Skipping local transcription (Whisper not loaded or audio missing).")
|
| 293 |
-
|
| 294 |
-
return {"video": str(sanitized_path), "transcript": str(transcript_path) if transcript_path else None, "metadata": metadata}
|
| 295 |
-
|
| 296 |
-
def safe_int(value):
|
| 297 |
-
try:
|
| 298 |
-
clean = re.sub(r'[^\d]', '', str(value))
|
| 299 |
-
return int(clean) if clean else 0
|
| 300 |
-
except Exception:
|
| 301 |
-
return 0
|
| 302 |
-
|
| 303 |
-
async def generate_and_save_croissant_metadata(row_data: dict) -> str:
|
| 304 |
-
try:
|
| 305 |
-
sanitized_data = {
|
| 306 |
-
"id": str(row_data.get("id", "")),
|
| 307 |
-
"link": str(row_data.get("link", "")),
|
| 308 |
-
"visual_integrity_score": safe_int(row_data.get("visual_integrity_score")),
|
| 309 |
-
"final_veracity_score": safe_int(row_data.get("final_veracity_score"))
|
| 310 |
-
}
|
| 311 |
-
video_id = sanitized_data["id"]
|
| 312 |
-
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
|
| 313 |
-
croissant_json = {
|
| 314 |
-
"@context": "https://schema.org/",
|
| 315 |
-
"@type": "Dataset",
|
| 316 |
-
"name": f"vchat-label-{video_id}",
|
| 317 |
-
"description": f"Veracity analysis labels for video {video_id}",
|
| 318 |
-
"url": sanitized_data["link"],
|
| 319 |
-
"variableMeasured": sanitized_data
|
| 320 |
-
}
|
| 321 |
-
path = Path("metadata") / f"{video_id}_{timestamp}.json"
|
| 322 |
-
path.write_text(json.dumps(croissant_json, indent=2))
|
| 323 |
-
return str(path)
|
| 324 |
-
except Exception:
|
| 325 |
-
return "N/A (Error)"
|
| 326 |
-
|
| 327 |
-
async def get_labels_for_link(video_url: str, gemini_config: dict, vertex_config: dict, model_selection: str, include_comments: bool, reasoning_method: str = "cot"):
|
| 328 |
-
try:
|
| 329 |
-
yield f"Downloading assets for {video_url}..."
|
| 330 |
-
|
| 331 |
-
try:
|
| 332 |
-
paths = await prepare_video_assets_async(video_url)
|
| 333 |
-
except ValueError as ve:
|
| 334 |
-
yield f"Skipped: {str(ve)}"
|
| 335 |
-
logger.warning(f"Skipping {video_url}: {ve}")
|
| 336 |
-
return
|
| 337 |
-
except Exception as e:
|
| 338 |
-
yield f"Error preparing assets: {str(e)}"
|
| 339 |
-
logger.error(f"Asset prep failed for {video_url}: {e}")
|
| 340 |
-
return
|
| 341 |
-
|
| 342 |
-
video_path = paths["video"]
|
| 343 |
-
transcript_text = parse_vtt(paths["transcript"]) if paths["transcript"] else "No transcript (Audio/Video Analysis only)."
|
| 344 |
-
caption = paths["metadata"].get("caption", "")
|
| 345 |
-
|
| 346 |
-
yield f"Assets ready. Running inference ({model_selection}, {reasoning_method.upper()})..."
|
| 347 |
-
logger.info(f"Starting inference pipeline for {video_url} (Transcript len: {len(transcript_text)})")
|
| 348 |
-
|
| 349 |
-
final_labels = None
|
| 350 |
-
raw_toon = ""
|
| 351 |
-
prompt_used = ""
|
| 352 |
-
|
| 353 |
-
pipeline = inference_logic.run_gemini_labeling_pipeline if model_selection == 'gemini' else inference_logic.run_vertex_labeling_pipeline
|
| 354 |
-
config = gemini_config if model_selection == 'gemini' else vertex_config
|
| 355 |
-
|
| 356 |
-
# Add timeout protection for inference
|
| 357 |
-
try:
|
| 358 |
-
async for msg in pipeline(video_path, caption, transcript_text, config, include_comments, reasoning_method):
|
| 359 |
-
if isinstance(msg, dict) and "parsed_data" in msg:
|
| 360 |
-
final_labels = msg["parsed_data"]
|
| 361 |
-
raw_toon = msg.get("raw_toon", "")
|
| 362 |
-
prompt_used = msg.get("prompt_used", "")
|
| 363 |
-
logger.info("Inference successful. Data parsed.")
|
| 364 |
-
elif isinstance(msg, str):
|
| 365 |
-
yield msg
|
| 366 |
-
elif isinstance(msg, dict) and "error" in msg:
|
| 367 |
-
yield f"API Error: {msg['error']}"
|
| 368 |
-
except Exception as pipe_err:
|
| 369 |
-
logger.error(f"Pipeline crashed: {pipe_err}")
|
| 370 |
-
yield f"Critical Pipeline Failure: {pipe_err}"
|
| 371 |
-
return
|
| 372 |
-
|
| 373 |
-
if not final_labels:
|
| 374 |
-
logger.error(f"Inference pipeline completed but returned no labels for {video_url}")
|
| 375 |
-
yield "No labels generated. Check logs."
|
| 376 |
-
return
|
| 377 |
-
|
| 378 |
-
final_labels["meta_info"] = {
|
| 379 |
-
"prompt_used": prompt_used,
|
| 380 |
-
"model_selection": model_selection,
|
| 381 |
-
"reasoning_method": reasoning_method
|
| 382 |
-
}
|
| 383 |
-
|
| 384 |
-
vec = final_labels.get("veracity_vectors", {})
|
| 385 |
-
mod = final_labels.get("modalities", {})
|
| 386 |
-
fin = final_labels.get("final_assessment", {})
|
| 387 |
-
|
| 388 |
-
row = {
|
| 389 |
-
"id": paths["metadata"]["id"],
|
| 390 |
-
"link": paths["metadata"]["link"],
|
| 391 |
-
"caption": caption,
|
| 392 |
-
"postdatetime": paths["metadata"].get("postdatetime", ""),
|
| 393 |
-
"collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 394 |
-
"videotranscriptionpath": paths["transcript"] or "",
|
| 395 |
-
"visual_integrity_score": vec.get("visual_integrity_score", "0"),
|
| 396 |
-
"audio_integrity_score": vec.get("audio_integrity_score", "0"),
|
| 397 |
-
"source_credibility_score": vec.get("source_credibility_score", "0"),
|
| 398 |
-
"logical_consistency_score": vec.get("logical_consistency_score", "0"),
|
| 399 |
-
"emotional_manipulation_score": vec.get("emotional_manipulation_score", "0"),
|
| 400 |
-
"video_audio_score": mod.get("video_audio_score", "0"),
|
| 401 |
-
"video_caption_score": mod.get("video_caption_score", "0"),
|
| 402 |
-
"audio_caption_score": mod.get("audio_caption_score", "0"),
|
| 403 |
-
"final_veracity_score": fin.get("veracity_score_total", "0"),
|
| 404 |
-
"final_reasoning": fin.get("reasoning", "")
|
| 405 |
-
}
|
| 406 |
-
yield {"csv_row": row, "full_json": final_labels, "raw_toon": raw_toon}
|
| 407 |
-
|
| 408 |
-
except Exception as e:
|
| 409 |
-
logger.error(f"Fatal error in get_labels_for_link: {e}", exc_info=True)
|
| 410 |
-
yield {"error": str(e)}
|
| 411 |
-
|
| 412 |
-
@app.get("/queue/list")
|
| 413 |
-
async def get_queue_list():
|
| 414 |
-
queue_path = Path("data/batch_queue.csv")
|
| 415 |
-
if not queue_path.exists(): return []
|
| 416 |
-
items = []
|
| 417 |
-
with open(queue_path, 'r', encoding='utf-8') as f:
|
| 418 |
-
reader = csv.reader(f)
|
| 419 |
-
try: next(reader)
|
| 420 |
-
except: pass
|
| 421 |
-
for row in reader:
|
| 422 |
-
if len(row) > 0:
|
| 423 |
-
link = row[0]
|
| 424 |
-
status = "Processed" if check_if_processed(link) else "Pending"
|
| 425 |
-
items.append({
|
| 426 |
-
"link": link,
|
| 427 |
-
"timestamp": row[1] if len(row) > 1 else "",
|
| 428 |
-
"status": status
|
| 429 |
-
})
|
| 430 |
-
return items
|
| 431 |
-
|
| 432 |
-
@app.delete("/queue/delete")
async def delete_queue_item(link: str):
    """Remove every queue row whose first column equals ``link``.

    Args:
        link: exact link value to remove from ``data/batch_queue.csv``.

    Returns:
        dict: ``{"status": "success", "link": link}`` when at least one row
        was removed, ``{"status": "not_found", ...}`` when no row matched, or
        ``{"status": "error", "message": ...}`` when the queue file is
        missing or an exception occurred.
    """
    queue_path = Path("data/batch_queue.csv")
    if not queue_path.exists():
        return {"status": "error", "message": "Queue file not found"}

    try:
        with open(queue_path, 'r', encoding='utf-8', newline='') as f:
            rows = list(csv.reader(f))

        kept_rows = []
        # rows[0] is an empty list when the file starts with a blank line,
        # so make sure it has a first cell before indexing into it.
        if rows and rows[0] and rows[0][0] == "link":
            kept_rows.append(rows[0])
            rows = rows[1:]

        deleted = False
        for row in rows:
            if not row:
                continue
            if row[0] == link:
                deleted = True
            else:
                kept_rows.append(row)

        if not deleted:
            # Nothing matched: leave the file untouched rather than rewriting it.
            return {"status": "not_found", "message": "Link not found in queue"}

        with open(queue_path, 'w', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(kept_rows)
        return {"status": "success", "link": link}

    except Exception as e:
        return {"status": "error", "message": str(e)}
|
| 468 |
-
|
| 469 |
-
@app.post("/queue/stop")
async def stop_queue_processing():
    """Raise the module-level stop flag and acknowledge the request.

    Sets ``STOP_QUEUE_SIGNAL`` to ``True`` and logs that the stop request
    was received.

    Returns:
        dict: ``{"status": "stopping"}``.
    """
    global STOP_QUEUE_SIGNAL
    STOP_QUEUE_SIGNAL = True
    logger.info("Received Stop Signal from User.")
    response = {"status": "stopping"}
    return response
|
| 475 |
-
|
| 476 |
-
@app.post("/queue/upload_csv")
async def upload_csv_to_queue(file: UploadFile = File(...)):
    """Append the links found in an uploaded CSV to ``data/batch_queue.csv``.

    The link column is the one named ``link`` or ``url`` (case-insensitive);
    otherwise column 0 is used, and if the first cell of the first row starts
    with ``http`` that row is treated as data rather than as a header.
    Links already present in the queue, or repeated within the upload, are
    skipped using an exact comparison on the link value.

    Args:
        file: uploaded CSV; decoded as UTF-8 (BOM tolerated) with a
            latin-1 fallback.

    Returns:
        ``{"status": "success", "added": n}`` on success,
        ``{"status": "empty file"}`` when there is no first row, or a
        400 ``JSONResponse`` describing the failure.
    """
    try:
        content = await file.read()
        try:
            # utf-8-sig drops a leading BOM, which would otherwise glue itself
            # to the first header cell and defeat the "link"/"url" lookup.
            text = content.decode('utf-8-sig')
        except UnicodeDecodeError:
            text = content.decode('latin-1')

        reader = csv.reader(text.splitlines())
        header = next(reader, None)
        if not header:
            return {"status": "empty file"}

        links_to_add = []
        link_idx = 0
        header_lower = [h.strip().lower() for h in header]

        if "link" in header_lower:
            link_idx = header_lower.index("link")
        elif "url" in header_lower:
            link_idx = header_lower.index("url")
        elif header[0].strip().startswith("http"):
            # No header row: the first line is already a link.
            links_to_add.append(header[0].strip())

        for row in reader:
            if len(row) > link_idx and row[link_idx].strip():
                links_to_add.append(row[link_idx].strip())

        queue_path = Path("data/batch_queue.csv")
        queue_path.parent.mkdir(parents=True, exist_ok=True)

        # Compare against the parsed link column, not raw file lines: a
        # substring test would treat ".../status/1" as a duplicate of
        # ".../status/12".
        existing_links = set()
        if queue_path.exists():
            with open(queue_path, 'r', encoding='utf-8', newline='') as f:
                existing_links = {row[0].strip() for row in csv.reader(f) if row}

        # Decide this before opening in append mode, which creates the file.
        needs_header = not queue_path.exists() or queue_path.stat().st_size == 0

        added_count = 0
        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(["link", "ingest_timestamp"])

            for link in links_to_add:
                if link in existing_links:
                    continue
                # Remember it so a link repeated inside the upload is added once.
                existing_links.add(link)
                writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
                added_count += 1

        return {"status": "success", "added": added_count}
    except Exception as e:
        logger.error(f"Upload CSV error: {e}")
        return JSONResponse(status_code=400, content={"error": str(e), "status": "failed"})
|
| 530 |
-
|
| 531 |
-
@app.post("/queue/run")
async def run_queue_processing(
    model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    """Label every pending link in the batch queue, streaming progress as SSE.

    Clears ``STOP_QUEUE_SIGNAL``, then walks ``data/batch_queue.csv``. Links
    for which ``check_if_processed`` is true are skipped. For each remaining
    link the messages yielded by ``get_labels_for_link`` are forwarded to the
    client; when a result containing ``csv_row`` arrives, the label JSON, raw
    TOON, prompt and raw response are written under ``data/`` and the row is
    appended to ``data/dataset.csv``.

    Returns:
        StreamingResponse: a ``text/event-stream`` that ends with an
        ``event: close`` message.
    """
    global STOP_QUEUE_SIGNAL
    STOP_QUEUE_SIGNAL = False
    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}

    async def queue_stream():
        queue_path = Path("data/batch_queue.csv")
        if not queue_path.exists():
            yield "data: Queue empty.\n\n"
            return

        with open(queue_path, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row; no-op on an empty file
            items = [row[0] for row in reader if row]

        processed_count = 0
        total = len(items)

        logger.info(f"Starting batch queue processing for {total} items.")

        for i, link in enumerate(items):
            if STOP_QUEUE_SIGNAL:
                yield "data: [SYSTEM] Stopped by user.\n\n"
                logger.info("Stopping queue loop.")
                break

            if check_if_processed(link):
                yield f"data: [SKIP] {link} processed.\n\n"
                continue

            yield f"data: [START] {i+1}/{total}: {link}\n\n"
            final_data = None

            # Streaming results from pipeline
            async for res in get_labels_for_link(link, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
                if isinstance(res, str):
                    msg = res.replace('\n', ' ')
                    yield f"data: {msg}\n\n"
                elif isinstance(res, dict):
                    if "error" in res:
                        # A raw newline would terminate the SSE event early.
                        detail = str(res['error']).replace('\n', ' ')
                        yield f"data: [ERROR DETAIL] {detail}\n\n"
                    if "csv_row" in res:
                        final_data = res

            if not final_data:
                yield "data: [FAIL] Failed to label. Check logs.\n\n"
                continue

            try:
                row = final_data["csv_row"]
                vid_id = row["id"]
                ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

                # Save artifacts. The encoding is explicit so model output
                # containing non-ASCII text does not depend on the platform
                # default encoding.
                json_path = f"data/labels/{vid_id}_{ts}_labels.json"
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(final_data["full_json"], f, indent=2)
                with open(f"data/labels/{vid_id}_{ts}.toon", 'w', encoding='utf-8') as f:
                    f.write(final_data["raw_toon"])

                prompt_content = final_data.get("full_json", {}).get("meta_info", {}).get("prompt_used", "")
                if prompt_content:
                    with open(f"data/prompts/{vid_id}_{ts}_prompt.txt", 'w', encoding='utf-8') as f:
                        f.write(prompt_content)

                raw_response = final_data.get("raw_toon", "")
                if raw_response:
                    with open(f"data/responses/{vid_id}.txt", 'w', encoding='utf-8') as f:
                        f.write(raw_response)

                row["metadatapath"] = await generate_and_save_croissant_metadata(row)
                row["json_path"] = json_path

                dpath = Path("data/dataset.csv")
                exists = dpath.exists()
                with open(dpath, 'a', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=list(row.keys()), extrasaction='ignore')
                    if not exists:
                        writer.writeheader()
                    writer.writerow(row)
            except Exception as e:
                # One item failing to persist must not kill the whole stream
                # (the client would never receive the close event).
                logger.error(f"Failed to save results for {link}: {e}", exc_info=True)
                detail = str(e).replace('\n', ' ')
                yield f"data: [FAIL] Could not save results: {detail}\n\n"
                continue

            processed_count += 1
            yield "data: [SUCCESS] Labeled.\n\n"

        yield f"data: Batch Complete. +{processed_count} videos labeled.\n\n"
        yield "event: close\ndata: Done\n\n"

    return StreamingResponse(queue_stream(), media_type="text/event-stream")
|
| 626 |
-
|
| 627 |
-
@app.post("/extension/ingest")
async def extension_ingest(request: Request):
    """Queue a single link posted by the browser extension.

    Expects a JSON body with a ``link`` key. The link is appended to
    ``data/batch_queue.csv`` with the current timestamp unless a row with
    exactly that link is already queued.

    Returns:
        dict: ``{"status": "queued", "link": link}`` when appended, or
        ``{"status": "queued", "msg": "Duplicate"}`` when already present.

    Raises:
        HTTPException: 400 when ``link`` is missing or empty; 500 for any
            other failure.
    """
    try:
        data = await request.json()
        link = data.get("link")
        if not link:
            raise HTTPException(status_code=400, detail="No link")

        queue_path = Path("data/batch_queue.csv")
        needs_header = not queue_path.exists() or queue_path.stat().st_size == 0

        if not needs_header:
            with open(queue_path, 'r', encoding='utf-8', newline='') as f:
                # Exact match on the link column: a substring search over the
                # whole file would reject any link that is a prefix of one
                # already queued.
                if any(row and row[0] == link for row in csv.reader(f)):
                    return {"status": "queued", "msg": "Duplicate"}

        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(["link", "ingest_timestamp"])
            writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])

        return {"status": "queued", "link": link}
    except HTTPException:
        # Let the deliberate 400 through; the generic handler below would
        # otherwise turn it into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 649 |
-
|
| 650 |
-
@app.post("/extension/save_comments")
async def extension_save_comments(request: Request):
    """Append comments scraped by the extension to ``data/comments.csv``.

    Expects a JSON body with ``link`` and a non-empty ``comments`` list whose
    items are either dicts (``author``, ``text``) or plain values that are
    stringified. Comments whose text is empty after stripping are not written.

    Returns:
        dict: ``{"status": "saved", "count": len(comments)}``.

    Raises:
        HTTPException: 400 when ``link`` or ``comments`` is missing; 500 for
            any other failure.
    """
    try:
        data = await request.json()
        link = data.get("link")
        comments = data.get("comments", [])
        if not link or not comments:
            raise HTTPException(status_code=400, detail="Missing data")

        csv_path = Path("data/comments.csv")
        exists = csv_path.exists()
        fieldnames = ["link", "author", "comment_text", "timestamp"]

        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists:
                writer.writeheader()

            ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for c in comments:
                row = {"link": link, "timestamp": ts}
                if isinstance(c, dict):
                    row["author"] = c.get("author", "Unknown")
                    # "text" may be present but null; treat that as empty
                    # instead of crashing on None.strip().
                    row["comment_text"] = str(c.get("text") or "").strip()
                else:
                    row["author"] = "Unknown"
                    row["comment_text"] = str(c).strip()

                if row["comment_text"]:
                    writer.writerow(row)

        return {"status": "saved", "count": len(comments)}
    except HTTPException:
        # Preserve the intended 400 instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 682 |
-
|
| 683 |
-
@app.post("/extension/save_manual")
async def extension_save_manual(request: Request):
    """Append a manually labelled entry to ``data/manual_dataset.csv``.

    Expects a JSON body with ``link`` and optional ``caption``, ``labels``
    and ``stats`` objects. The row id is the value returned by
    ``extract_tweet_id(link)`` or, failing that, the first 16 hex characters
    of the link's MD5 digest. Missing scores default to ``0``.

    Returns:
        dict: ``{"status": "saved"}``.

    Raises:
        HTTPException: 400 when ``link`` is missing; 500 for any other failure.
    """
    try:
        data = await request.json()
        link = data.get("link")
        # ``or {}`` also covers an explicit JSON null for these objects.
        labels = data.get("labels") or {}
        stats = data.get("stats") or {}
        if not link:
            raise HTTPException(status_code=400, detail="No link")

        video_id = extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:16]

        row_data = {
            "id": video_id,
            "link": link,
            "caption": data.get("caption", ""),
            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "source": "manual_extension",
            "visual_integrity_score": labels.get("visual_integrity_score", 0),
            "audio_integrity_score": labels.get("audio_integrity_score", 0),
            "source_credibility_score": labels.get("source_credibility_score", 0),
            "logical_consistency_score": labels.get("logical_consistency_score", 0),
            "emotional_manipulation_score": labels.get("emotional_manipulation_score", 0),
            "video_audio_score": labels.get("video_audio_score", 0),
            "video_caption_score": labels.get("video_caption_score", 0),
            "audio_caption_score": labels.get("audio_caption_score", 0),
            "final_veracity_score": labels.get("final_veracity_score", 0),
            "final_reasoning": labels.get("reasoning", ""),
            "stats_likes": stats.get("likes", 0),
            "stats_shares": stats.get("shares", 0),
            "stats_comments": stats.get("comments", 0),
            "stats_platform": stats.get("platform", "unknown")
        }

        dpath = Path("data/manual_dataset.csv")
        exists = dpath.exists()
        fieldnames = list(row_data.keys())

        with open(dpath, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists:
                writer.writeheader()
            writer.writerow(row_data)

        return {"status": "saved"}
    except HTTPException:
        # Keep the 400 for a missing link; do not convert it into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 728 |
-
|
| 729 |
-
@app.get("/manage/list")
async def list_data():
    """Return all rows of the automatic and manual datasets, newest first.

    Rows come from ``data/dataset.csv`` (``source_type`` ``"auto"``) and
    ``data/manual_dataset.csv`` (``"manual"``). A row without an ``id`` gets
    one derived from its link. When ``json_path`` points at an existing file
    its parsed content is attached as ``json_data``; otherwise ``json_data``
    is ``None``.

    Returns:
        list[dict]: rows sorted by ``collecttime`` in descending order.
    """
    data = []

    def read_csv(path, source_type):
        if not path.exists():
            return
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if not row.get('id') or row['id'].strip() == "":
                    # DictReader fills missing trailing cells with None.
                    link = row.get('link') or ''
                    tid = extract_tweet_id(link)
                    row['id'] = tid if tid else hashlib.md5(link.encode()).hexdigest()[:16]

                json_content = None
                if row.get('json_path') and os.path.exists(row['json_path']):
                    try:
                        with open(row['json_path'], 'r', encoding='utf-8') as jf:
                            json_content = json.load(jf)
                    except (OSError, ValueError) as e:
                        # Best effort: an unreadable or corrupt label file
                        # should not hide the row itself.
                        logger.warning(f"Could not load label JSON {row['json_path']}: {e}")

                row['source_type'] = source_type
                row['json_data'] = json_content
                data.append(row)

    read_csv(Path("data/dataset.csv"), "auto")
    read_csv(Path("data/manual_dataset.csv"), "manual")
    # ``or ''`` keeps the sort key a string when the column is None (short
    # row); mixing None and str would raise TypeError.
    data.sort(key=lambda x: x.get('collecttime') or '', reverse=True)
    return data
|
| 756 |
-
|
| 757 |
-
@app.delete("/manage/delete")
async def delete_data(id: str = "", link: str = ""):
    """Delete dataset rows matching ``id`` or ``link`` plus their artefact files.

    Matching rows are removed from ``data/dataset.csv`` and
    ``data/manual_dataset.csv``. Files named ``<id>_*`` under ``data/labels``
    and ``metadata`` are then unlinked, where ``<id>`` is the ``id`` argument
    or, when only ``link`` was given, the id of the first matching row.

    Args:
        id: exact row id to delete.
        link: exact link to delete (used when ``id`` does not match).

    Returns:
        dict: ``{"status": "deleted", "count": <rows removed>}``.

    Raises:
        HTTPException: 400 when neither ``id`` nor ``link`` is provided.
    """
    import glob  # local import: only needed for glob.escape below

    if not id and not link:
        raise HTTPException(status_code=400, detail="Must provide ID or Link")
    deleted_count = 0
    target_id = id

    def remove_from_csv(path):
        nonlocal deleted_count, target_id
        if not path.exists():
            return
        rows = []
        found_in_file = False
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames
            for row in reader:
                is_match = False
                if id and row.get('id') == id:
                    is_match = True
                elif link and row.get('link') == link:
                    is_match = True
                if is_match:
                    found_in_file = True
                    deleted_count += 1
                    if not target_id:
                        target_id = row.get('id')
                else:
                    rows.append(row)
        if found_in_file:
            with open(path, 'w', newline='', encoding='utf-8') as f:
                # extrasaction='ignore': a row with more cells than the header
                # carries a ``None`` key, which would make DictWriter raise
                # after the file has already been truncated.
                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(rows)

    remove_from_csv(Path("data/dataset.csv"))
    remove_from_csv(Path("data/manual_dataset.csv"))

    # ``target_id`` comes from the request, so neutralise glob metacharacters
    # (an id of "*" would otherwise match every file) and refuse ids that
    # contain a path separator.
    if target_id and "/" not in target_id and "\\" not in target_id:
        pattern = f"{glob.escape(target_id)}_*"
        for folder in ("data/labels", "metadata"):
            for p in Path(folder).glob(pattern):
                p.unlink(missing_ok=True)
    return {"status": "deleted", "count": deleted_count}
|
| 792 |
-
|
| 793 |
-
@app.post("/label_video")
async def label_video_endpoint(
    video_url: str = Form(...), model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    """Label a single video and stream the pipeline's progress as SSE.

    Forwards every string yielded by ``get_labels_for_link`` as a ``data:``
    event, reports error results, emits a completion message when a result
    containing ``csv_row`` arrives, and finishes with an ``event: close``
    message.

    Returns:
        StreamingResponse: a ``text/event-stream`` response.
    """
    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}

    async def stream():
        async for msg in get_labels_for_link(video_url, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
            if isinstance(msg, str):
                # Same sanitising as /queue/run: a raw newline inside the
                # payload would split or terminate the SSE event.
                text = msg.replace('\n', ' ')
                yield f"data: {text}\n\n"
            elif isinstance(msg, dict):
                if "error" in msg:
                    # Surface pipeline failures instead of dropping them.
                    detail = str(msg["error"]).replace('\n', ' ')
                    yield f"data: [ERROR DETAIL] {detail}\n\n"
                if "csv_row" in msg:
                    yield "data: Done. Labels generated.\n\n"
        yield "event: close\ndata: Done.\n\n"

    return StreamingResponse(stream(), media_type="text/event-stream")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/factuality_logic.py
DELETED
|
@@ -1,143 +0,0 @@
|
|
| 1 |
-
# factuality_logic.py
|
| 2 |
-
import os
|
| 3 |
-
import re
|
| 4 |
-
import json
|
| 5 |
-
import logging
|
| 6 |
-
import asyncio
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
import inference_logic
|
| 9 |
-
from toon_parser import parse_toon_line
|
| 10 |
-
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
# Prompt templates for the three analysis passes. Each asks for a TOON reply
# of the form "<name>_analysis: result[2]{score,justification}:" followed by
# one "score,justification" line.
# NOTE: the templates contain literal braces ("{score,justification}"), so the
# "{transcript}" placeholder must be filled with str.replace; str.format would
# try to resolve "score,justification" as a field name.
PROMPT_VISUAL_ARTIFACTS = "\n".join([
    "Analyze the video for visual manipulation (Deepfakes, editing anomalies).",
    "Steps inside <thinking>: 1. Scan for artifacts. 2. Check cuts.",
    "Output TOON format:",
    "visual_analysis: result[2]{score,justification}:",
    "Score(1-10),\"Justification text\"",
])

PROMPT_CONTENT_ANALYSIS = "\n".join([
    "Analyze the content for accuracy and logic.",
    "Steps inside <thinking>: 1. Identify claims. 2. Check fallacies. 3. Assess emotion.",
    "**Transcript:**",
    "{transcript}",
    "Output TOON format:",
    "content_analysis: result[2]{score,justification}:",
    "Score(1-10),\"Justification text\"",
])

PROMPT_AUDIO_ANALYSIS = "\n".join([
    "Analyze audio for synthesis or manipulation.",
    "Steps inside <thinking>: 1. Listen for robotic inflections. 2. Check lip-sync.",
    "**Transcript:**",
    "{transcript}",
    "Output TOON format:",
    "audio_analysis: result[2]{score,justification}:",
    "Score(1-10),\"Justification text\"",
])
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def parse_vtt(file_path: str) -> str:
    """Extract the spoken text from a WebVTT subtitle file.

    Dropped from the output: the ``WEBVTT`` header block (including metadata
    lines such as ``Kind:`` / ``Language:``), ``NOTE`` / ``STYLE`` /
    ``REGION`` blocks, cue timing lines (containing ``-->``), purely numeric
    lines, inline ``<...>`` tags, and a line identical to the one kept just
    before it.

    Args:
        file_path: path to the ``.vtt`` file.

    Returns:
        The remaining lines joined with newlines, or one of the messages
        ``"Transcript file not found."``, ``"No speech found in transcript."``
        or ``"Error reading transcript: ..."``.
    """
    try:
        if not os.path.exists(file_path):
            return "Transcript file not found."

        # utf-8-sig strips a leading BOM so the WEBVTT signature is recognised.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()

        text_lines = []
        at_block_start = True   # blocks are separated by blank lines
        skipping_block = False  # inside a header / NOTE / STYLE / REGION block
        for line in lines:
            line = line.strip()
            if not line:
                at_block_start = True
                skipping_block = False
                continue

            if at_block_start:
                at_block_start = False
                # These keywords open non-cue blocks only when they are the
                # first line of a block and stand alone or precede whitespace.
                skipping_block = re.match(r'(WEBVTT|NOTE|STYLE|REGION)(\s|$)', line) is not None
            if skipping_block:
                continue

            if '-->' in line or line.isdigit():
                continue

            clean_line = re.sub(r'<[^>]+>', '', line)
            # Rolling captions repeat the previous line; keep it once.
            if clean_line and (not text_lines or clean_line != text_lines[-1]):
                text_lines.append(clean_line)

        return "\n".join(text_lines) if text_lines else "No speech found in transcript."
    except Exception as e:
        logger.error(f"Error parsing VTT file {file_path}: {e}")
        return f"Error reading transcript: {e}"
|
| 60 |
-
|
| 61 |
-
async def run_factuality_pipeline(paths: dict, checks: dict, generation_config: dict):
    """Run the selected factuality checks on a video, yielding progress text.

    Args:
        paths: dict read for ``"video"`` (required) and ``"transcript"``
            (optional path to a VTT file).
        checks: dict whose truthy ``"visuals"``, ``"content"`` and
            ``"audio"`` entries enable the corresponding analysis step.
        generation_config: generation kwargs; ``sampling_fps`` (default 2.0)
            and ``num_perceptions`` are removed from a copy, and
            ``temperature`` / ``do_sample`` are overridden before the copy
            is passed to ``inference_logic.inference_step``.

    Yields:
        str: human-readable progress and result messages. A step whose
        output cannot be parsed is reported and skipped; an exception in a
        step is reported and stops the remaining steps.
    """
    video_path = paths.get("video")
    transcript_path = paths.get("transcript")

    if not video_path:
        yield "ERROR: Video path not found. Cannot start analysis.\n\n"
        return

    yield "Step 1: Processing Transcript...\n"
    await asyncio.sleep(0.1)
    transcript = "No transcript was downloaded for this video."
    if transcript_path and os.path.exists(transcript_path):
        transcript = parse_vtt(transcript_path)
        yield " - Transcript file found and processed.\n"
    else:
        yield " - No transcript file was found.\n"

    yield f"\n--- Extracted Transcript ---\n{transcript}\n--------------------------\n\n"
    await asyncio.sleep(0.1)

    # The templates contain a literal "{score,justification}", which
    # str.format would treat as a replacement field and fail on with
    # KeyError. Substitute the single placeholder textually instead.
    analysis_steps = []
    if checks.get("visuals"):
        analysis_steps.append(("Visual Integrity", PROMPT_VISUAL_ARTIFACTS))
    if checks.get("content"):
        analysis_steps.append(("Content Veracity", PROMPT_CONTENT_ANALYSIS.replace("{transcript}", transcript)))
    if checks.get("audio"):
        analysis_steps.append(("Audio Forensics", PROMPT_AUDIO_ANALYSIS.replace("{transcript}", transcript)))

    for i, (title, prompt) in enumerate(analysis_steps):
        # Step numbering starts at 2 because transcript processing was step 1.
        yield f"--- Step {i + 2}: Running '{title}' Analysis ---\n"
        yield "(Model is generating TOON analysis with scores...)\n\n"
        await asyncio.sleep(0.1)

        try:
            current_gen_config = generation_config.copy()
            sampling_fps = current_gen_config.pop("sampling_fps", 2.0)
            current_gen_config.pop("num_perceptions", None)

            current_gen_config["temperature"] = 0.1
            current_gen_config["do_sample"] = True

            ans = inference_logic.inference_step(
                video_path=video_path,
                prompt=prompt,
                generation_kwargs=current_gen_config,
                sampling_fps=sampling_fps,
                pred_glue=None
            )

            yield f" - Analysis Complete for '{title}'. Parsing TOON...\n\n"

            # Capture the "<name>_analysis" key and the value line after it.
            match = re.search(r'(\w+_analysis): result\[2\]\{score,justification\}:\s*\n(.+)', ans, re.MULTILINE)

            thinking = "No thinking block found."
            think_match = re.search(r'<thinking>(.*?)</thinking>', ans, re.DOTALL)
            if think_match:
                thinking = think_match.group(1).strip()

            if not match:
                logger.warning(f"Could not parse TOON for '{title}'. Raw: {ans}")
                yield f"Warning: Model did not return valid TOON. Raw output:\n{ans}\n"
                continue

            key, value_line = match.groups()
            parsed_result = parse_toon_line({'key': key, 'headers': ['score', 'justification']}, value_line.strip())

            score = parsed_result.get('score', 'N/A')
            justification = parsed_result.get('justification', 'No justification provided.')

            yield f"===== ANALYSIS RESULT: {title.upper()} =====\n"
            yield f"SCORE: {score}/10\n"
            yield f"Reasoning (Step-by-Step): {thinking}\n"
            yield f"Final Justification: {justification}\n\n"
            yield "========================================\n\n"

        except Exception as e:
            error_message = f"An error occurred during the '{title}' analysis step: {e}"
            logger.error(error_message, exc_info=True)
            yield f"ERROR: {error_message}\n\n"
            break

    yield "Factuality Analysis Pipeline Finished.\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/inference_logic.py
DELETED
|
@@ -1,303 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
import re
|
| 3 |
-
import ast
|
| 4 |
-
import sys
|
| 5 |
-
import os
|
| 6 |
-
import time
|
| 7 |
-
import logging
|
| 8 |
-
import asyncio
|
| 9 |
-
import json
|
| 10 |
-
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 11 |
-
from peft import PeftModel
|
| 12 |
-
from labeling_logic import (
|
| 13 |
-
LABELING_PROMPT_TEMPLATE, SCORE_INSTRUCTIONS_SIMPLE, SCORE_INSTRUCTIONS_REASONING,
|
| 14 |
-
SCHEMA_SIMPLE, SCHEMA_REASONING,
|
| 15 |
-
FCOT_MACRO_PROMPT, FCOT_MESO_PROMPT, FCOT_SYNTHESIS_PROMPT
|
| 16 |
-
)
|
| 17 |
-
from toon_parser import parse_veracity_toon
|
| 18 |
-
|
| 19 |
-
# Optional local imports
|
| 20 |
-
try:
|
| 21 |
-
from my_vision_process import process_vision_info, client
|
| 22 |
-
except ImportError:
|
| 23 |
-
process_vision_info = None
|
| 24 |
-
client = None
|
| 25 |
-
|
| 26 |
-
# Google GenAI Imports
|
| 27 |
-
try:
|
| 28 |
-
import google.generativeai as genai_legacy
|
| 29 |
-
from google.generativeai.types import generation_types, HarmCategory, HarmBlockThreshold
|
| 30 |
-
except ImportError:
|
| 31 |
-
genai_legacy = None
|
| 32 |
-
|
| 33 |
-
try:
|
| 34 |
-
# Modern Google GenAI SDK (v1)
|
| 35 |
-
from google import genai
|
| 36 |
-
from google.genai.types import (
|
| 37 |
-
GenerateContentConfig,
|
| 38 |
-
HttpOptions,
|
| 39 |
-
Retrieval,
|
| 40 |
-
Tool,
|
| 41 |
-
VertexAISearch,
|
| 42 |
-
GoogleSearch,
|
| 43 |
-
Part,
|
| 44 |
-
SafetySetting
|
| 45 |
-
)
|
| 46 |
-
import vertexai
|
| 47 |
-
except ImportError:
|
| 48 |
-
genai = None
|
| 49 |
-
vertexai = None
|
| 50 |
-
|
| 51 |
-
# Logger named after this module.
logger = logging.getLogger(__name__)

# True only when the LITE_MODE environment variable is "true" (any letter case);
# an unset variable counts as "false".
LITE_MODE = os.environ.get("LITE_MODE", "false").lower() == "true"

# Module-level model handles; all of them start out as None.
processor = base_model = peft_model = active_model = None
|
| 57 |
-
|
| 58 |
-
def load_models():
    """Placeholder hook: performs no work and returns ``None``."""
    return None
|
| 60 |
-
|
| 61 |
-
async def attempt_toon_repair(original_text: str, schema: str, client, model_type: str, config: dict):
    """Ask a model to reformat ``original_text`` into the given TOON schema.

    Args:
        original_text: model output that failed to parse.
        schema: TOON schema text embedded in the repair prompt.
        client: existing client used for the ``'vertex'`` path; when falsy a
            new ``genai.Client`` is built from ``config``.
        model_type: ``'gemini'`` or ``'vertex'``.
        config: for ``'vertex'``, read for ``model_name`` and, when no client
            is supplied, ``project_id`` and ``location``.

    Returns:
        str: the repaired text, or ``original_text`` unchanged when the model
        type is not recognised, the model returns no text, or the call fails.
    """
    logger.info("Attempting TOON Repair...")
    repair_prompt = f"SYSTEM: Reformat the following text into strict TOON schema. Infer missing scores as 0.\n\nSCHEMA:\n{schema}\n\nINPUT:\n{original_text}\n"
    try:
        # We are inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()
        if model_type == 'gemini':
            model = genai_legacy.GenerativeModel("models/gemini-2.0-flash-exp")
            # The SDK call is blocking; run it in the default executor.
            response = await loop.run_in_executor(None, lambda: model.generate_content(repair_prompt))
        elif model_type == 'vertex':
            cl = client if client else genai.Client(vertexai=True, project=config['project_id'], location=config['location'])
            response = await loop.run_in_executor(None, lambda: cl.models.generate_content(model=config['model_name'], contents=repair_prompt))
        else:
            # Previously this fell through and returned "", discarding the
            # caller's text; use the same fallback as the failure path.
            logger.warning(f"Unknown model type for TOON repair: {model_type}")
            return original_text
        return response.text or original_text
    except Exception as e:
        logger.error(f"Repair failed: {e}")
        return original_text
|
| 79 |
-
|
| 80 |
-
async def run_gemini_labeling_pipeline(video_path: str, caption: str, transcript: str, gemini_config: dict, include_comments: bool, reasoning_method: str = "cot"):
    """Label a video through the legacy Gemini SDK, streaming progress.

    Async generator. Yields status strings while working and, on success, a
    final dict with the keys ``raw_toon``, ``parsed_data`` and ``prompt_used``.
    Problems are reported as yielded ``"ERROR..."`` strings (or
    ``{"error": ...}`` for an empty model response) instead of being raised.

    The uploaded remote file is deleted in a ``finally`` block, so it is
    cleaned up on every exit path (failure, timeout, exception, or the
    consumer closing the generator early), not only after a full success.

    Args:
        video_path: Local path of the video; uploaded as ``video/mp4``.
        caption: Caption text substituted into the prompt.
        transcript: Transcript text substituted into the prompt.
        gemini_config: Reads ``api_key`` (required) and ``model_name``
            (optional, defaults to ``models/gemini-2.0-flash-exp``).
        include_comments: Selects the reasoning schema/instructions when
            true, the simple ones otherwise.
        reasoning_method: ``"fcot"`` runs the three-turn chat flow; any other
            value sends a single prompt.
    """
    if genai_legacy is None:
        yield "ERROR: Legacy SDK missing.\n"
        return

    api_key = gemini_config.get("api_key")
    if not api_key:
        yield "ERROR: No Gemini API Key provided."
        return

    logger.info(f"[Gemini] Initializing with model {gemini_config.get('model_name')}")

    safety_settings = [
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]

    # An async generator body always executes inside a running loop.
    loop = asyncio.get_running_loop()
    # Stays None until the upload succeeds; the finally block keys off it.
    uploaded_file = None

    try:
        genai_legacy.configure(api_key=api_key)

        # 1. Upload File (blocking SDK calls run in the default executor)
        logger.info(f"[Gemini] Uploading video file: {video_path}...")
        yield "Uploading video to Gemini..."

        uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.upload_file(path=video_path, mime_type="video/mp4"))
        logger.info(f"[Gemini] Upload complete. URI: {uploaded_file.uri} | State: {uploaded_file.state.name}")

        # 2. Wait for Processing: re-fetch the file each round so the state is fresh
        wait_start = time.time()
        while True:
            remote_name = uploaded_file.name
            uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.get_file(remote_name))
            state_name = uploaded_file.state.name

            if state_name == "ACTIVE":
                logger.info("[Gemini] Video processing complete. Ready for inference.")
                break
            elif state_name == "FAILED":
                logger.error("[Gemini] Video processing failed on server side.")
                yield "ERROR: Google failed to process video."
                return

            if time.time() - wait_start > 300:  # 5 minute timeout
                logger.error("[Gemini] Video processing timed out.")
                yield "ERROR: Video processing timed out."
                return

            logger.info(f"[Gemini] Processing video... (State: {state_name})")
            yield "Processing video on Google servers..."
            await asyncio.sleep(5)

        # 3. Prepare Inference
        model_name = gemini_config.get("model_name") or "models/gemini-2.0-flash-exp"
        model = genai_legacy.GenerativeModel(model_name)
        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE

        raw_text = ""
        prompt_used = ""
        # NOTE(review): only the single-prompt path passes this config; the
        # FCoT chat turns run with the SDK's default generation settings.
        gen_config = {"temperature": 0.1}

        logger.info(f"[Gemini] Starting inference with method: {reasoning_method}")

        if reasoning_method == "fcot":
            yield "Starting FCoT (Gemini)..."
            chat = model.start_chat(history=[])

            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
            logger.info("[Gemini] Sending Macro Prompt...")
            res1 = await loop.run_in_executor(None, lambda: chat.send_message([uploaded_file, macro_prompt], safety_settings=safety_settings))
            macro_hypothesis = res1.text
            yield f"Hypothesis: {macro_hypothesis[:100]}...\n"

            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
            logger.info("[Gemini] Sending Meso Prompt...")
            # The reply is not read here; the turn is sent on the same chat
            # object that the synthesis prompt below uses.
            await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt, safety_settings=safety_settings))

            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
            logger.info("[Gemini] Sending Synthesis Prompt...")
            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt, safety_settings=safety_settings))

            raw_text = res3.text
            prompt_used = f"FCoT:\n{macro_prompt}\n..."
        else:
            prompt_text = LABELING_PROMPT_TEMPLATE.format(caption=caption, transcript=transcript, toon_schema=toon_schema, score_instructions=score_instructions)
            prompt_used = prompt_text
            yield f"Generating Labels ({model_name})..."
            logger.info("[Gemini] Sending standard generation request...")
            response = await loop.run_in_executor(
                None,
                lambda: model.generate_content([prompt_text, uploaded_file], generation_config=gen_config, safety_settings=safety_settings)
            )
            raw_text = response.text

        # Normalise a missing body so the length log and empty check are safe.
        raw_text = raw_text or ""
        logger.info(f"[Gemini] Response received. Length: {len(raw_text)}")
        if not raw_text:
            yield "Model returned empty response (Check API quota or safety)."
            yield {"error": "Empty Response - likely safety block"}
            return

        parsed_data = parse_veracity_toon(raw_text)
        # A visual score of '0' is treated as "structured parse failed".
        if parsed_data['veracity_vectors']['visual_integrity_score'] == '0':
            yield "Auto-Repairing output..."
            raw_text = await attempt_toon_repair(raw_text, toon_schema, None, 'gemini', gemini_config)
            parsed_data = parse_veracity_toon(raw_text)

        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}

    except Exception as e:
        logger.error(f"Gemini Pipeline Error: {e}", exc_info=True)
        yield f"ERROR (Gemini): {e}"
    finally:
        # Cleanup: best-effort removal of the remote upload on every exit path.
        if uploaded_file is not None:
            try:
                logger.info(f"[Gemini] Deleting remote file {uploaded_file.name}")
                await loop.run_in_executor(None, lambda: genai_legacy.delete_file(name=uploaded_file.name))
            except Exception as cleanup_err:
                logger.warning(f"Failed to cleanup file: {cleanup_err}")
|
| 202 |
-
|
| 203 |
-
async def run_vertex_labeling_pipeline(video_path: str, caption: str, transcript: str, vertex_config: dict, include_comments: bool, reasoning_method: str = "cot"):
    """Label a video through Vertex AI (google-genai SDK), streaming progress.

    Async generator. Yields status strings while working and, on success, a
    final dict with the keys ``raw_toon``, ``parsed_data`` and ``prompt_used``.
    Problems are reported as yielded ``"ERROR..."`` strings (or
    ``{"error": "Empty Response"}`` when the model returns no text) instead
    of being raised.

    Args:
        video_path: Local path of the video; its bytes are sent inline as
            ``video/mp4``.
        caption: Caption text substituted into the prompt.
        transcript: Transcript text substituted into the prompt.
        vertex_config: Reads ``project_id`` (required), ``location``
            (defaults to ``us-central1``) and ``model_name`` (defaults to
            ``gemini-2.5-flash-lite``). Empty values fall back to the default.
        include_comments: Selects the reasoning schema/instructions when
            true, the simple ones otherwise.
        reasoning_method: ``"fcot"`` runs the three-turn chat flow; any other
            value sends a single prompt.
    """
    if genai is None:
        yield "ERROR: 'google-genai' not installed.\n"
        return

    project_id = vertex_config.get("project_id")
    if not project_id:
        yield "ERROR: No Vertex Project ID."
        return

    logger.info(f"[Vertex] Initializing for project {project_id}")

    safety_settings = [
        SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="BLOCK_ONLY_HIGH"),
        SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH"),
        SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="BLOCK_ONLY_HIGH"),
        SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="BLOCK_ONLY_HIGH"),
    ]

    try:
        # An async generator body always executes inside a running loop.
        loop = asyncio.get_running_loop()
        # `or` (not a .get default) so present-but-empty values also fall back.
        location = vertex_config.get("location") or "us-central1"
        client = genai.Client(vertexai=True, project=project_id, location=location)

        # For Vertex, we send bytes directly (up to a limit) or use Cloud Storage.
        # Part.from_bytes is the simplest route for small/medium videos; larger
        # ones may time out without GCS. Assuming direct upload for now.
        logger.info(f"[Vertex] Reading local video file: {video_path}")

        def _read_video() -> bytes:
            with open(video_path, 'rb') as f:
                return f.read()

        # Disk reads are blocking; keep them off the event loop.
        video_bytes = await loop.run_in_executor(None, _read_video)
        video_part = Part.from_bytes(data=video_bytes, mime_type="video/mp4")

        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE
        model_name = vertex_config.get("model_name") or "gemini-2.5-flash-lite"

        raw_text = ""
        prompt_used = ""
        config = GenerateContentConfig(
            temperature=0.1,
            response_mime_type="text/plain",
            tools=[Tool(google_search=GoogleSearch())],
            safety_settings=safety_settings
        )

        logger.info(f"[Vertex] Starting inference with {model_name}")

        if reasoning_method == "fcot":
            yield "Starting FCoT (Vertex)..."
            chat = client.chats.create(model=model_name, config=config)

            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
            logger.info("[Vertex] Sending Macro Prompt...")
            res1 = await loop.run_in_executor(None, lambda: chat.send_message([video_part, macro_prompt]))
            # `.text` may be None when the response has no text part.
            macro_hypothesis = res1.text or ""
            yield f"Hypothesis: {macro_hypothesis[:80]}...\n"

            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
            logger.info("[Vertex] Sending Meso Prompt...")
            # The reply is not read here; the turn is sent on the same chat
            # object that the synthesis prompt below uses.
            await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt))

            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
            logger.info("[Vertex] Sending Synthesis Prompt...")
            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt))

            raw_text = res3.text
            prompt_used = f"FCoT (Vertex):\n{macro_prompt}..."

        else:
            prompt_text = LABELING_PROMPT_TEMPLATE.format(caption=caption, transcript=transcript, toon_schema=toon_schema, score_instructions=score_instructions)
            prompt_used = prompt_text
            yield f"Generating Labels ({model_name})..."
            logger.info("[Vertex] Sending standard generation request...")
            response = await loop.run_in_executor(
                None,
                lambda: client.models.generate_content(model=model_name, contents=[video_part, prompt_text], config=config)
            )
            raw_text = response.text

        # Normalise None to "" first: len(None) would otherwise raise and turn
        # the intended "Empty Response" result into a generic error string.
        raw_text = raw_text or ""
        logger.info(f"[Vertex] Response Length: {len(raw_text)}")
        if not raw_text:
            yield "Model returned empty response."
            yield {"error": "Empty Response"}
            return

        parsed_data = parse_veracity_toon(raw_text)
        # A visual score of '0' is treated as "structured parse failed".
        if parsed_data['veracity_vectors']['visual_integrity_score'] == '0':
            yield "Auto-Repairing output..."
            raw_text = await attempt_toon_repair(raw_text, toon_schema, client, 'vertex', vertex_config)
            parsed_data = parse_veracity_toon(raw_text)

        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}

    except Exception as e:
        # Log before yielding so the traceback is recorded even if the
        # consumer stops iterating as soon as it sees the error string.
        logger.error("Vertex Labeling Error", exc_info=True)
        yield f"ERROR (Vertex): {e}"
|
| 298 |
-
|
| 299 |
-
async def run_gemini_pipeline(video_path, question, checks, gemini_config, generation_config=None):
    """Stub async generator: ignores its arguments and yields one notice string."""
    unsupported_notice = "Legacy pipeline not fully supported in HF Space."
    yield unsupported_notice
|
| 301 |
-
|
| 302 |
-
async def run_vertex_pipeline(video_path, question, checks, vertex_config, generation_config=None):
    """Stub async generator: ignores its arguments and yields one notice string."""
    unsupported_notice = "Legacy pipeline not fully supported in HF Space."
    yield unsupported_notice
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/labeling_logic.py
DELETED
|
@@ -1,145 +0,0 @@
|
|
| 1 |
-
# labeling_logic.py
|
| 2 |
-
|
| 3 |
-
LABELING_PROMPT_TEMPLATE = """
|
| 4 |
-
You are an AI Factuality Assessment Agent operating under the "Ali Arsanjani Factuality Factors" framework.
|
| 5 |
-
Your goal is to mass-label video content, quantifying "Veracity Vectors" and "Modality Alignment".
|
| 6 |
-
|
| 7 |
-
**INPUT DATA:**
|
| 8 |
-
- **User Caption:** "{caption}"
|
| 9 |
-
- **Audio Transcript:** "{transcript}"
|
| 10 |
-
- **Visuals:** (Provided in video context)
|
| 11 |
-
|
| 12 |
-
**INSTRUCTIONS:**
|
| 13 |
-
1. **Grounding:** Cross-reference claims in the transcript with your internal knowledge base (and tools if active).
|
| 14 |
-
2. **Chain of Thought (<thinking>):** You MUST think step-by-step inside a `<thinking>` block before generating output.
|
| 15 |
-
* Analyze *Visual Integrity* (Artifacts, edits).
|
| 16 |
-
* Analyze *Audio Integrity* (Voice cloning, sync).
|
| 17 |
-
* Analyze *Modality Alignment* (Does video match audio? Does caption match content? Does audio match caption?).
|
| 18 |
-
* Analyze *Logic* (Fallacies, gaps).
|
| 19 |
-
* Determine *Disinformation* classification.
|
| 20 |
-
3. **Output Format:** Output strictly in **TOON** format (Token-Oriented Object Notation) as defined below.
|
| 21 |
-
|
| 22 |
-
**CRITICAL CONSTRAINTS:**
|
| 23 |
-
- Do NOT repeat the input data.
|
| 24 |
-
- START your response IMMEDIATELY with the `<thinking>` tag.
|
| 25 |
-
- **DO NOT use Markdown code blocks.** (Output plain text only).
|
| 26 |
-
- Use strict `Key : Type [ Count ] {{ Headers }} :` format followed by data lines.
|
| 27 |
-
- Strings containing commas MUST be quoted.
|
| 28 |
-
- ALL scores must be filled (use 0 if unsure, do not leave blank).
|
| 29 |
-
- **MODALITY SCORING:** You must provide 3 distinct alignment scores: Video-Audio, Video-Caption, and Audio-Caption.
|
| 30 |
-
|
| 31 |
-
**TOON SCHEMA:**
|
| 32 |
-
{toon_schema}
|
| 33 |
-
|
| 34 |
-
{score_instructions}
|
| 35 |
-
|
| 36 |
-
**RESPONSE:**
|
| 37 |
-
<thinking>
|
| 38 |
-
"""
|
| 39 |
-
|
| 40 |
-
SCORE_INSTRUCTIONS_REASONING = """
|
| 41 |
-
**Constraints:**
|
| 42 |
-
1. Provide specific reasoning for EACH score in the `vectors` and `modalities` tables.
|
| 43 |
-
2. Ensure strings are properly quoted.
|
| 44 |
-
"""
|
| 45 |
-
|
| 46 |
-
SCORE_INSTRUCTIONS_SIMPLE = """
|
| 47 |
-
**Constraint:** Focus on objective measurements. Keep text concise.
|
| 48 |
-
"""
|
| 49 |
-
|
| 50 |
-
SCHEMA_SIMPLE = """summary: text[1]{text}:
|
| 51 |
-
"Brief neutral summary of the video events"
|
| 52 |
-
|
| 53 |
-
vectors: scores[1]{visual,audio,source,logic,emotion}:
|
| 54 |
-
(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10)
|
| 55 |
-
*Scale: 1=Fake/Malicious, 10=Authentic/Neutral*
|
| 56 |
-
|
| 57 |
-
modalities: scores[1]{video_audio_score,video_caption_score,audio_caption_score}:
|
| 58 |
-
(Int 1-10),(Int 1-10),(Int 1-10)
|
| 59 |
-
*Scale: 1=Mismatch, 10=Perfect Match*
|
| 60 |
-
|
| 61 |
-
factuality: factors[1]{accuracy,gap,grounding}:
|
| 62 |
-
(Verified/Misleading/False),"Missing evidence description","Grounding check results"
|
| 63 |
-
|
| 64 |
-
disinfo: analysis[1]{class,intent,threat}:
|
| 65 |
-
(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)
|
| 66 |
-
|
| 67 |
-
final: assessment[1]{score,reasoning}:
|
| 68 |
-
(Int 1-100),"Final synthesis of why this score was given"
|
| 69 |
-
"""
|
| 70 |
-
|
| 71 |
-
SCHEMA_REASONING = """
|
| 72 |
-
summary: text[1]{text}:
|
| 73 |
-
"Brief neutral summary of the video events"
|
| 74 |
-
|
| 75 |
-
vectors: details[5]{category,score,reasoning}:
|
| 76 |
-
Visual,(Int 1-10),"Reasoning for visual score"
|
| 77 |
-
Audio,(Int 1-10),"Reasoning for audio score"
|
| 78 |
-
Source,(Int 1-10),"Reasoning for source credibility"
|
| 79 |
-
Logic,(Int 1-10),"Reasoning for logical consistency"
|
| 80 |
-
Emotion,(Int 1-10),"Reasoning for emotional manipulation"
|
| 81 |
-
|
| 82 |
-
modalities: details[3]{category,score,reasoning}:
|
| 83 |
-
VideoAudio,(Int 1-10),"Reasoning for video-to-audio alignment"
|
| 84 |
-
VideoCaption,(Int 1-10),"Reasoning for video-to-caption alignment"
|
| 85 |
-
AudioCaption,(Int 1-10),"Reasoning for audio-to-caption alignment"
|
| 86 |
-
|
| 87 |
-
factuality: factors[1]{accuracy,gap,grounding}:
|
| 88 |
-
(Verified/Misleading/False),"Missing evidence description","Grounding check results"
|
| 89 |
-
|
| 90 |
-
disinfo: analysis[1]{class,intent,threat}:
|
| 91 |
-
(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)
|
| 92 |
-
|
| 93 |
-
final: assessment[1]{score,reasoning}:
|
| 94 |
-
(Int 1-100),"Final synthesis of why this score was given"
|
| 95 |
-
"""
|
| 96 |
-
|
| 97 |
-
FCOT_MACRO_PROMPT = """
|
| 98 |
-
**Fractal Chain of Thought - Stage 1: Macro-Scale Hypothesis (Wide Aperture)**
|
| 99 |
-
|
| 100 |
-
You are analyzing a video for factuality.
|
| 101 |
-
**Context:** Caption: "{caption}" | Transcript: "{transcript}"
|
| 102 |
-
|
| 103 |
-
1. **Global Scan**: Observe the video, audio, and caption as a whole entity.
|
| 104 |
-
2. **Context Aperture**: Wide. Assess the overall intent (Humor, Information, Political, Social) and the setting.
|
| 105 |
-
3. **Macro Hypothesis**: Formulate a high-level hypothesis about the veracity. (e.g., "The video is likely authentic but the caption misrepresents the location" or "The audio quality suggests synthetic generation").
|
| 106 |
-
|
| 107 |
-
**Objective**: Maximize **Coverage** (broadly explore potential angles of manipulation).
|
| 108 |
-
|
| 109 |
-
**Output**: A concise paragraph summarizing the "Macro Hypothesis".
|
| 110 |
-
"""
|
| 111 |
-
|
| 112 |
-
FCOT_MESO_PROMPT = """
|
| 113 |
-
**Fractal Chain of Thought - Stage 2: Meso-Scale Expansion (Recursive Verification)**
|
| 114 |
-
|
| 115 |
-
**Current Macro Hypothesis**: "{macro_hypothesis}"
|
| 116 |
-
|
| 117 |
-
**Action**: Zoom In. Decompose the hypothesis into specific verification branches.
|
| 118 |
-
Perform the following checks recursively:
|
| 119 |
-
|
| 120 |
-
1. **Visual Branch**: Look for specific artifacts, lighting inconsistencies, cuts, or deepfake signs.
|
| 121 |
-
2. **Audio Branch**: Analyze lip-sync, background noise consistency, and voice tonality.
|
| 122 |
-
3. **Logical Branch**: Does the visual evidence strictly support the caption's claim? Are there logical fallacies?
|
| 123 |
-
|
| 124 |
-
**Dual-Objective Self-Correction**:
|
| 125 |
-
- **Faithfulness**: Do not hallucinate details not present in the video.
|
| 126 |
-
- **Coverage**: Did you miss any subtle cues?
|
| 127 |
-
|
| 128 |
-
**Output**: Detailed "Micro-Observations" for each branch. If you find contradictions to the Macro Hypothesis, note them explicitly as **"Self-Correction"**.
|
| 129 |
-
"""
|
| 130 |
-
|
| 131 |
-
FCOT_SYNTHESIS_PROMPT = """
|
| 132 |
-
**Fractal Chain of Thought - Stage 3: Inter-Scale Consensus & Synthesis**
|
| 133 |
-
|
| 134 |
-
**Action**: Integrate your Macro Hypothesis and Micro-Observations.
|
| 135 |
-
- **Consensus Check**: If Micro-Observations contradict the Macro Hypothesis, prioritize the Micro evidence (Self-Correction).
|
| 136 |
-
- **Compression**: Synthesize the findings into the final structured format.
|
| 137 |
-
|
| 138 |
-
**Output Format**:
|
| 139 |
-
Strictly fill out the following TOON schema based on the consensus. Do not include markdown code blocks.
|
| 140 |
-
|
| 141 |
-
**TOON SCHEMA**:
|
| 142 |
-
{toon_schema}
|
| 143 |
-
|
| 144 |
-
{score_instructions}
|
| 145 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/my_vision_process.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
# my_vision_process.py (Stub for HF Spaces / Lite Mode)
|
| 2 |
-
import logging
|
| 3 |
-
|
| 4 |
-
logger = logging.getLogger(__name__)
|
| 5 |
-
|
| 6 |
-
# Dummy client
|
| 7 |
-
client = None
|
| 8 |
-
|
| 9 |
-
def process_vision_info(messages, return_video_kwargs=False, client=None):
    """
    Placeholder that keeps imports working in API-only mode.

    Logs a warning and returns empty placeholders without inspecting
    `messages` or `client`: `(None, None)` by default, or
    `(None, None, {"fps": [0]})` when `return_video_kwargs` is truthy.
    """
    logger.warning("process_vision_info called in LITE/API environment. Returning empty placeholders.")
    placeholders = (None, None)
    if not return_video_kwargs:
        return placeholders
    return placeholders + ({"fps": [0]},)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/toon_parser.py
DELETED
|
@@ -1,220 +0,0 @@
|
|
| 1 |
-
# toon_parser.py
|
| 2 |
-
import re
|
| 3 |
-
import logging
|
| 4 |
-
import csv
|
| 5 |
-
from io import StringIO
|
| 6 |
-
|
| 7 |
-
logger = logging.getLogger(__name__)
|
| 8 |
-
|
| 9 |
-
def parse_toon_line(line_def, data_line):
    """Parse one comma-separated TOON data row into a header -> value dict.

    Args:
        line_def: Mapping whose ``'headers'`` entry lists the column names
            (a missing entry is treated as no columns).
        data_line: The raw row text; parsed with CSV quoting rules.

    Returns:
        A dict pairing each header with its cleaned value. Values have
        parentheses removed and ``N/M`` collapsed to ``N`` when ``N`` is all
        digits. Short rows are padded with ``""``; extra values are dropped.
        Returns ``{}`` for an empty/blank row or when parsing fails.
    """
    if not data_line or data_line.isspace():
        return {}

    try:
        rows = csv.reader(StringIO(data_line), skipinitialspace=True)
        raw_fields = next(rows, [])

        fields = []
        for raw in raw_fields:
            value = raw.strip().replace('(', '').replace(')', '')
            # "8/10"-style scores keep only the numerator.
            if '/' in value and any(ch.isdigit() for ch in value):
                numerator = value.split('/')[0].strip()
                if numerator.isdigit():
                    value = numerator
            fields.append(value)

        headers = line_def.get('headers', [])
        width = len(headers)
        # Pad up to the header count, then cut anything beyond it.
        fields = (fields + [""] * (width - len(fields)))[:width]

        return dict(zip(headers, fields))
    except Exception as e:
        logger.error(f"Error parsing TOON line '{data_line}': {e}")
        return {}
|
| 41 |
-
|
| 42 |
-
def fuzzy_extract_scores(text: str) -> dict:
    """Regex fallback that scrapes single-number scores out of free text.

    For each label pattern, the first place in ``text`` where the label is
    followed (on the same line) by separator characters and a standalone
    number 0-10 supplies the score. A key is only filled while it still
    holds ``'0'``, so earlier patterns take precedence over later ones that
    target the same key.

    Returns:
        Dict of string scores for the keys visual, audio, source, logic,
        emotion, video_audio, video_caption and audio_caption; ``'0'`` where
        nothing was found.
    """
    score_keys = (
        'visual', 'audio', 'source', 'logic', 'emotion',
        'video_audio', 'video_caption', 'audio_caption',
    )
    found = dict.fromkeys(score_keys, '0')

    # (label regex, destination key); order matters, first hit per key wins.
    label_patterns = (
        ('visual', 'visual'),
        ('visual.*?integrity', 'visual'),
        ('accuracy', 'visual'),
        ('audio', 'audio'),
        ('source', 'source'),
        ('logic', 'logic'),
        ('emotion', 'emotion'),
        (r'video.*?audio', 'video_audio'),
        (r'video.*?caption', 'video_caption'),
        (r'audio.*?caption', 'audio_caption'),
    )

    for label_regex, score_key in label_patterns:
        hit = re.search(fr'(?i){label_regex}.*?[:=\-\s\(]+(\b10\b|\b\d\b)(?:/10)?', text)
        if hit is None or found[score_key] != '0':
            continue
        found[score_key] = hit.group(1)

    return found
|
| 69 |
-
|
| 70 |
-
def parse_veracity_toon(text: str) -> dict:
    """Parse TOON-formatted model output into a flat result dict.

    Code fences are stripped, every ``key: type[count]{headers}:`` block is
    parsed row by row with ``parse_toon_line``, and the sections are mapped
    onto fixed result keys. When the ``vectors`` or ``modalities`` section
    yields no non-zero score, that group is filled from
    ``fuzzy_extract_scores`` instead.

    Returns:
        ``{}`` for empty input. Otherwise a dict with ``veracity_vectors``,
        ``modalities``, ``video_context_summary``, ``factuality_factors``,
        ``disinformation_analysis``, ``final_assessment`` and
        ``raw_parsed_structure`` (the per-section parse before flattening).
    """
    if not text:
        return {}

    # Drop opening fences (with optional language tag), then any bare fences.
    cleaned = re.sub(r'```', '', re.sub(r'```\w*', '', text)).strip()

    header_re = re.compile(
        r'([a-zA-Z0-9_]+)\s*:\s*(?:\w+\s*)?(?:\[\s*(\d+)\s*\])?\s*\{\s*(.*?)\s*\}\s*:\s*',
        re.MULTILINE
    )
    block_headers = list(header_re.finditer(cleaned))
    # A block's body runs until the next header starts (or the text ends).
    body_ends = [nxt.start() for nxt in block_headers[1:]] + [len(cleaned)]

    sections = {}
    for header, body_end in zip(block_headers, body_ends):
        name = header.group(1).lower()
        declared_rows = int(header.group(2)) if header.group(2) else 1
        columns = [col.strip().lower() for col in header.group(3).split(',')]

        body = cleaned[header.end():body_end].strip()
        stripped_lines = [ln.strip() for ln in body.splitlines() if ln.strip()]
        candidate_rows = [ln for ln in stripped_lines if len(ln) > 1]

        rows = [
            parse_toon_line({'key': name, 'headers': columns}, ln)
            for ln in candidate_rows[:declared_rows]
        ]
        # Single-row sections are stored as the row itself, not a list.
        sections[name] = rows[0] if (declared_rows == 1 and rows) else rows

    # (source key / category needle, result key); order is the match priority.
    vector_fields = (
        ('visual', 'visual_integrity_score'),
        ('audio', 'audio_integrity_score'),
        ('source', 'source_credibility_score'),
        ('logic', 'logical_consistency_score'),
        ('emotion', 'emotional_manipulation_score'),
    )
    # (normalised needle, fuzzy-score key, result key)
    modality_fields = (
        ('videoaudio', 'video_audio', 'video_audio_score'),
        ('videocaption', 'video_caption', 'video_caption_score'),
        ('audiocaption', 'audio_caption', 'audio_caption_score'),
    )

    result = {
        'veracity_vectors': {dest: '0' for _, dest in vector_fields},
        'modalities': {dest: '0' for _, _, dest in modality_fields},
        'video_context_summary': '',
        'factuality_factors': {},
        'disinformation_analysis': {},
        'final_assessment': {}
    }
    vector_scores = result['veracity_vectors']
    modality_scores = result['modalities']

    def _squash(label):
        # Case/separator-insensitive form used to match modality labels.
        return label.lower().replace(' ', '').replace('-', '').replace('_', '')

    have_vectors = False
    have_modalities = False

    vectors = sections.get('vectors', [])
    if isinstance(vectors, dict):
        # Single-row layout: one column per vector.
        if any(val and val != '0' for val in vectors.values()):
            for src, dest in vector_fields:
                if src in vectors:
                    vector_scores[dest] = vectors[src]
            have_vectors = True
    elif isinstance(vectors, list):
        # Multi-row layout: one (category, score, ...) row per vector.
        for row in vectors:
            category = row.get('category', '').lower()
            score = row.get('score', '0')
            if not score or score == '0':
                continue
            have_vectors = True
            for needle, dest in vector_fields:
                if needle in category:
                    vector_scores[dest] = score
                    break

    modalities = sections.get('modalities', [])
    if isinstance(modalities, dict):
        for column, value in modalities.items():
            squashed = _squash(column)
            for needle, _, dest in modality_fields:
                if needle in squashed:
                    modality_scores[dest] = value
                    break
            if value and value != '0':
                have_modalities = True
    elif isinstance(modalities, list):
        for row in modalities:
            squashed = _squash(row.get('category', ''))
            score = row.get('score', '0')
            if not score or score == '0':
                continue
            have_modalities = True
            for needle, _, dest in modality_fields:
                if needle in squashed:
                    modality_scores[dest] = score
                    break

    if not (have_vectors and have_modalities):
        guessed = fuzzy_extract_scores(cleaned)

        if not have_vectors:
            for src, dest in vector_fields:
                vector_scores[dest] = guessed[src]

        if not have_modalities:
            for _, fuzzy_key, dest in modality_fields:
                modality_scores[dest] = guessed[fuzzy_key]

    def _first_row(section_name):
        # Sections may be a dict (single row) or a list of rows; use row one.
        section = sections.get(section_name, {})
        if isinstance(section, list):
            return section[0] if section else {}
        return section

    factuality = _first_row('factuality')
    result['factuality_factors'] = {
        'claim_accuracy': factuality.get('accuracy', 'Unverifiable'),
        'evidence_gap': factuality.get('gap', ''),
        'grounding_check': factuality.get('grounding', '')
    }

    disinfo = _first_row('disinfo')
    result['disinformation_analysis'] = {
        'classification': disinfo.get('class', 'None'),
        'intent': disinfo.get('intent', 'None'),
        'threat_vector': disinfo.get('threat', 'None')
    }

    final = _first_row('final')
    result['final_assessment'] = {
        'veracity_score_total': final.get('score', '0'),
        'reasoning': final.get('reasoning', '')
    }

    result['video_context_summary'] = _first_row('summary').get('text', '')

    result['raw_parsed_structure'] = sections

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/transcription.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
import whisper
|
| 2 |
-
import logging
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
import os
|
| 5 |
-
|
| 6 |
-
# Module logger, named after this module.
logger = logging.getLogger(__name__)

# Cache slot for the Whisper model; stays None until load_model() fills it.
transcription_model = None

# True only when the LITE_MODE environment variable equals "true"
# (compared case-insensitively); unset defaults to "false".
LITE_MODE = "true" == os.getenv("LITE_MODE", "false").lower()
|
| 10 |
-
|
| 11 |
-
def load_model():
    """Populate the module-level ``transcription_model`` with Whisper "base.en".

    The call is a no-op when ``LITE_MODE`` is true or when a model is already
    cached. Any exception raised while loading is logged with its traceback
    and the cache is left as ``None``; nothing is re-raised.
    """
    global transcription_model

    if LITE_MODE:
        logger.info("LITE_MODE is enabled. Skipping Whisper model loading.")
        return

    # A previous call already cached a model: nothing to do.
    if transcription_model is not None:
        return

    try:
        logger.info("Loading 'base.en' Whisper model for transcription...")
        transcription_model = whisper.load_model("base.en")
        logger.info("Whisper model loaded successfully.")
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
        # Make sure callers see "no model" rather than a half-initialised value.
        transcription_model = None
|
| 25 |
-
|
| 26 |
-
def generate_transcript(audio_path_str: str) -> str:
    """Transcribe an audio file and return the path of the resulting VTT file.

    Args:
        audio_path_str: Path of the audio file, as a string.

    Returns:
        The audio path with its suffix replaced by ``.vtt``, as a string.
        Despite the ``str`` annotation, ``None`` is returned when no model is
        loaded or when any exception occurs during transcription or writing.
    """
    model = transcription_model
    if model is None:
        logger.warning("Transcription model is not available. Cannot generate transcript.")
        return None

    try:
        source = Path(audio_path_str)
        logger.info(f"Starting transcription for: {source.name}")

        transcription = model.transcribe(audio_path_str, verbose=False)

        output_file = source.with_suffix('.vtt')

        # Imported lazily, inside the try, so an import failure is logged
        # and reported as None like any other transcription error.
        from whisper.utils import get_writer
        write_vtt = get_writer("vtt", str(output_file.parent))
        # NOTE(review): presumably the writer derives the output file name
        # from the audio basename, matching output_file — confirm against
        # the installed whisper version.
        write_vtt(transcription, str(source.name))

        logger.info(f"Transcription complete. VTT file saved to: {output_file}")
        return str(output_file)

    except Exception as e:
        logger.error(f"An error occurred during transcription for {audio_path_str}: {e}", exc_info=True)
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start.sh
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
# Launcher script: starts the Python inference backend in the background,
# waits for it to answer on its port, then hands the process over to the
# Go web server.

# 1. Start Python FastAPI in the background (Internal Port 8001)
echo "Starting Python Inference Engine..."
export PYTHONPATH=$PYTHONPATH:/app/src
# Use --log-level info to see startup issues
python -m uvicorn src.app:app --host 127.0.0.1 --port 8001 --log-level info &

# Poll until the backend answers or the time budget runs out.
# curl is used without --fail, so any HTTP response (even an error status)
# counts as "up"; only a refused/failed connection keeps the loop going.
echo "Waiting for Python backend to initialize..."
timeout=30
while ! curl -s http://127.0.0.1:8001/ > /dev/null; do
    sleep 2
    timeout=$((timeout-2))
    if [ "$timeout" -le 0 ]; then
        # Best effort: report the problem but still start the web server.
        echo "Python backend failed to start on time. Logs might show why."
        break
    fi
done

# 2. Start Golang Web Server (Public Port 7860)
echo "Starting Go Web Server..."
# exec replaces this shell with the server so that stop signals (e.g. the
# SIGTERM sent on container shutdown) reach the server directly instead of
# being absorbed by an idle bash parent.
exec /app/vchat-server
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|