Commit ·
7632cf2
1
Parent(s): 7b1d1ea
wow
Browse files- .gitattributes +35 -0
- Dockerfile +86 -0
- README.md +8 -0
- main.go +60 -0
- requirements.txt +23 -0
- src/app.py +808 -0
- src/factuality_logic.py +143 -0
- src/inference_logic.py +305 -0
- src/labeling_logic.py +145 -0
- src/my_vision_process.py +17 -0
- src/toon_parser.py +220 -0
- src/transcription.py +53 -0
- start.sh +23 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# syntax=docker/dockerfile:1

# ==========================================
# Stage 1: Build Frontend (React/TS/Vite)
# ==========================================
FROM node:20-slim AS frontend-builder
WORKDIR /app/frontend

# Copy dependency manifests first so the install layer is cached until they change.
# The lockfile glob is optional, so `npm ci` cannot be used unconditionally.
COPY frontend/package.json frontend/package-lock.json* ./
RUN npm install

# Copy source and build
COPY frontend/ ./
RUN npm run build

# ==========================================
# Stage 2: Build Backend (Golang)
# ==========================================
FROM golang:1.23 AS backend-builder
WORKDIR /app/backend

# Copy Go source
COPY main.go .

# Build a fully static binary (CGO disabled) so it runs inside the
# python:slim runtime image without any Go toolchain or libc coupling.
# (`-a -installsuffix cgo` are obsolete no-ops on modern Go and were dropped.)
RUN go mod init vchat-server && \
    go mod tidy && \
    CGO_ENABLED=0 GOOS=linux go build -o vchat-server main.go

# ==========================================
# Stage 3: Final Runtime (Hugging Face Space - API Lite)
# ==========================================
FROM python:3.11-slim

# Default to LITE_MODE=true for HF Spaces (API Only).
# NOTE: DEBIAN_FRONTEND is intentionally NOT baked into the runtime env;
# it is applied inline on the apt-get layer below (build-time only concern).
ENV PYTHONUNBUFFERED=1 \
    LITE_MODE=true \
    PATH="/home/user/.local/bin:$PATH" \
    PIP_NO_CACHE_DIR=1

# Create a non-root user (Required for HF Spaces)
RUN useradd -m -u 1000 user

WORKDIR /app

# 1. Install System Dependencies (FFmpeg required for yt-dlp post-processing)
RUN DEBIAN_FRONTEND=noninteractive apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
        git \
        gnupg \
    && rm -rf /var/lib/apt/lists/*

# 2. Install Python Dependencies (uv resolves/installs much faster than pip;
#    PIP_NO_CACHE_DIR=1 above keeps these layers lean)
RUN pip install uv
COPY requirements.txt ./
RUN uv pip install --system -r requirements.txt
# Explicitly force latest yt-dlp to handle Twitter/X API changes
RUN uv pip install --system --upgrade "yt-dlp[default]"

# 3. Copy Python Application Code
COPY --chown=user src/ ./src/

# 4. Install Built Artifacts (COPY creates /app/static itself; no mkdir needed)
COPY --from=backend-builder --chown=user /app/backend/vchat-server /app/vchat-server
COPY --from=frontend-builder --chown=user /app/frontend/dist /app/static

# 5. Setup Directories and Permissions (must be writable by the non-root user)
RUN mkdir -p /app/data/videos /app/data/labels /app/data/prompts /app/data/responses /app/metadata \
    && chown -R user:user /app/data /app/metadata

# 6. Setup Entrypoint (strip CRLF in case the script was committed from Windows)
COPY --chown=user start.sh /app/start.sh
RUN sed -i 's/\r$//' /app/start.sh && \
    chmod +x /app/start.sh

# Switch to non-root user
USER user

# Expose the HF Space port (documentation only; HF Spaces routes to 7860)
EXPOSE 7860

# Run the Orchestrator
CMD ["/app/start.sh"]
|
README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VFacts
|
| 3 |
+
emoji: 😻
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
main.go
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
	"os"
	"path/filepath"
	"strings"
)

// apiPrefixes lists the URL path prefixes that are proxied to the Python
// FastAPI backend instead of being served from the static bundle.
var apiPrefixes = []string{
	"/process",
	"/label_video",
	"/batch_label",
	"/model-architecture",
	"/download-dataset",
	"/extension",
	"/manage",
	"/queue",
}

// isAPIRequest reports whether the request path targets the Python backend.
func isAPIRequest(path string) bool {
	for _, prefix := range apiPrefixes {
		if strings.HasPrefix(path, prefix) {
			return true
		}
	}
	return false
}

func main() {
	// Target Python FastAPI server (running locally in the container)
	pythonTarget := "http://127.0.0.1:8001"
	pythonURL, err := url.Parse(pythonTarget)
	if err != nil {
		log.Fatalf("Invalid Python target URL: %v", err)
	}

	// Create Reverse Proxy
	proxy := httputil.NewSingleHostReverseProxy(pythonURL)

	// HF Spaces: Files are copied to /app/static in Dockerfile
	staticPath := "/app/static"
	fs := http.FileServer(http.Dir(staticPath))
	indexFile := filepath.Join(staticPath, "index.html")

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Proxy API requests to Python
		if isAPIRequest(r.URL.Path) {
			log.Printf("Proxying %s to Python Backend...", r.URL.Path)
			proxy.ServeHTTP(w, r)
			return
		}

		// SPA routing: serve the file if it exists, otherwise index.html.
		// Clean the URL path and verify containment so a crafted path like
		// /../etc/passwd cannot influence the existence check (fix: the
		// original concatenated the raw URL path onto staticPath).
		requested := filepath.Join(staticPath, filepath.Clean("/"+r.URL.Path))
		if !strings.HasPrefix(requested, staticPath) {
			http.ServeFile(w, r, indexFile)
			return
		}
		if _, err := os.Stat(requested); os.IsNotExist(err) {
			http.ServeFile(w, r, indexFile)
			return
		}

		fs.ServeHTTP(w, r)
	})

	// HF Spaces requires listening on port 7860; allow PORT to override for
	// local development (backward compatible: default is unchanged).
	port := os.Getenv("PORT")
	if port == "" {
		port = "7860"
	}
	log.Printf("vChat HF Server listening on port %s", port)
	log.Printf("Serving static files from %s", staticPath)
	if err := http.ListenAndServe(":"+port, nil); err != nil {
		log.Fatal(err)
	}
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --- Core Server ---
|
| 2 |
+
fastapi
|
| 3 |
+
uvicorn[standard]
|
| 4 |
+
python-multipart
|
| 5 |
+
requests
|
| 6 |
+
aiofiles
|
| 7 |
+
jinja2
|
| 8 |
+
python-dotenv
|
| 9 |
+
|
| 10 |
+
# --- Data & Vision Utils ---
|
| 11 |
+
Pillow
|
| 12 |
+
packaging
|
| 13 |
+
numpy
|
| 14 |
+
|
| 15 |
+
# --- Google Cloud & APIs ---
|
| 16 |
+
google-generativeai>=0.4.0
|
| 17 |
+
google-cloud-aiplatform
|
| 18 |
+
google-genai
|
| 19 |
+
mlcroissant
|
| 20 |
+
|
| 21 |
+
# --- Audio & Video Fetching ---
|
| 22 |
+
yt-dlp
|
| 23 |
+
ffmpeg-python
|
src/app.py
ADDED
|
@@ -0,0 +1,808 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import asyncio
|
| 4 |
+
import subprocess
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import logging
|
| 7 |
+
import csv
|
| 8 |
+
import io
|
| 9 |
+
import datetime
|
| 10 |
+
import json
|
| 11 |
+
import hashlib
|
| 12 |
+
import re
|
| 13 |
+
import glob
|
| 14 |
+
import shutil
|
| 15 |
+
import time
|
| 16 |
+
from fastapi import FastAPI, Request, Form, UploadFile, File, Body, HTTPException
|
| 17 |
+
from fastapi.responses import HTMLResponse, StreamingResponse, PlainTextResponse, Response, FileResponse, JSONResponse
|
| 18 |
+
from fastapi.templating import Jinja2Templates
|
| 19 |
+
from fastapi.staticfiles import StaticFiles
|
| 20 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
+
import yt_dlp
|
| 22 |
+
import inference_logic
|
| 23 |
+
import factuality_logic
|
| 24 |
+
import transcription
|
| 25 |
+
from factuality_logic import parse_vtt
|
| 26 |
+
from toon_parser import parse_veracity_toon
|
| 27 |
+
|
| 28 |
+
# Optional dependency: Croissant dataset-metadata tooling. Prefer the current
# `mlcroissant` package, fall back to the legacy `croissant` name; if neither
# is installed, `mlc` is None and CROISSANT_AVAILABLE gates metadata features.
try:
    import mlcroissant as mlc
    CROISSANT_AVAILABLE = True
except ImportError:
    try:
        import croissant as mlc
        CROISSANT_AVAILABLE = True
    except ImportError:
        mlc = None
        CROISSANT_AVAILABLE = False
|
| 38 |
+
|
| 39 |
+
# Configure Logging with High Verbosity
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    # Log to stdout so the container runtime (HF Spaces) captures everything.
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger("vChat")

# LITE mode = API-only deployment; local inference models are never loaded.
LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"

app = FastAPI()

# Wide-open CORS: the API is consumed by a browser extension and the Go proxy.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# HF Spaces specific path
STATIC_DIR = "/app/static"
if not os.path.isdir(STATIC_DIR):
    # Fallback if running locally
    STATIC_DIR = "static"
    os.makedirs(STATIC_DIR, exist_ok=True)

# The same directory backs both static assets and the Jinja2 index template.
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
templates = Jinja2Templates(directory=STATIC_DIR)

# Ensure data directories exist (HF Spaces writable locations)
os.makedirs("data/videos", exist_ok=True)
os.makedirs("data", exist_ok=True)
os.makedirs("data/labels", exist_ok=True)
os.makedirs("data/prompts", exist_ok=True)
os.makedirs("data/responses", exist_ok=True)
os.makedirs("metadata", exist_ok=True)

# Cooperative cancellation flag for batch/queue processing.
# NOTE(review): the set/read sites are outside this view — confirm usage.
STOP_QUEUE_SIGNAL = False
|
| 78 |
+
|
| 79 |
+
@app.on_event("startup")
async def startup_event():
    """Load heavyweight models once at process start.

    Whisper loading is best-effort (a warning is logged on failure); local
    inference models are only loaded outside LITE mode.
    """
    logger.info("Application starting up...")
    try:
        # Optional transcription model — the app still works without it.
        transcription.load_model()
    except Exception as e:
        logger.warning(f"Could not load Whisper model: {e}")

    if not LITE_MODE:
        try:
            inference_logic.load_models()
        except Exception as e:
            # Logged at fatal level, but deliberately does not abort startup.
            logger.fatal(f"Could not load local inference models. Error: {e}", exc_info=True)
    else:
        logger.info("Running in LITE mode (API Only).")
|
| 94 |
+
|
| 95 |
+
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    """Serve the SPA index page, injecting model-availability flags.

    Returns a 404 HTML response if the frontend bundle was not built/copied.
    """
    custom_model_available = False
    if not LITE_MODE:
        # A fine-tuned PEFT adapter may or may not have been loaded at startup.
        custom_model_available = inference_logic.peft_model is not None
    if not (Path(STATIC_DIR) / "index.html").exists():
        return HTMLResponse(content="Frontend not found.", status_code=404)
    return templates.TemplateResponse("index.html", {
        "request": request,
        "custom_model_available": custom_model_available,
        "lite_mode": LITE_MODE
    })
|
| 107 |
+
|
| 108 |
+
@app.get("/model-architecture", response_class=PlainTextResponse)
async def get_model_architecture():
    """Return a plain-text dump of the loaded base model, or a status message."""
    if LITE_MODE: return "Running in LITE mode."
    if inference_logic.base_model: return str(inference_logic.base_model)
    return "Model not loaded."
|
| 113 |
+
|
| 114 |
+
@app.get("/download-dataset")
async def download_dataset():
    """Stream the collected dataset CSV to the client; 404 if not yet created."""
    file_path = Path("data/dataset.csv")
    if file_path.exists():
        return FileResponse(path=file_path, filename="dataset.csv", media_type='text/csv')
    return Response("Dataset not found.", status_code=404)
|
| 120 |
+
|
| 121 |
+
# Module-level buffer holding the most recent yt-dlp progress line; read by
# streaming endpoints to report download status to clients.
progress_message = ""

def progress_hook(d):
    """yt-dlp progress callback: mirrors download status into progress_message.

    `d` is the status dict yt-dlp passes to progress hooks; only the
    'downloading' and 'finished' states are surfaced.
    """
    global progress_message
    if d['status'] == 'downloading':
        progress_message = f"Downloading: {d.get('_percent_str', 'N/A')} at {d.get('_speed_str', 'N/A')}\r"
    elif d['status'] == 'finished':
        progress_message = f"\nDownload finished. Preparing video assets...\n"
|
| 128 |
+
|
| 129 |
+
def get_cookies_path():
    """Look for cookies file in known locations for better yt-dlp support.

    Returns the absolute path of the first candidate that exists, or None.
    """
    search_locations = ("cookies.txt", "data/cookies.txt", "/app/cookies.txt")
    return next(
        (os.path.abspath(location) for location in search_locations
         if os.path.exists(location)),
        None,
    )
|
| 136 |
+
|
| 137 |
+
async def run_subprocess_async(command: list[str]):
    """Run an external command without blocking the event loop.

    Returns decoded stdout on success; raises RuntimeError containing the
    decoded stderr if the process exits non-zero.
    """
    cmd_str = ' '.join(command)
    logger.info(f"[Subprocess] Running: {cmd_str}")
    # Exec form (no shell) — arguments are passed verbatim, no quoting issues.
    process = await asyncio.create_subprocess_exec(*command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = await process.communicate()

    if process.returncode != 0:
        err_msg = stderr.decode()
        logger.error(f"[Subprocess] Failed ({process.returncode}): {err_msg}")
        raise RuntimeError(f"Command failed: {err_msg}")
    logger.info(f"[Subprocess] Success.")
    return stdout.decode()
|
| 149 |
+
|
| 150 |
+
def extract_tweet_id(url: str) -> str | None:
|
| 151 |
+
match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
|
| 152 |
+
if match: return match.group(1)
|
| 153 |
+
return None
|
| 154 |
+
|
| 155 |
+
def check_if_processed(link: str) -> bool:
    """Return True if `link` (or its tweet ID) already appears in a dataset CSV.

    Matching is tolerant: query strings and trailing slashes are stripped from
    links, and the numeric tweet ID is matched independently so different URLs
    of the same status are still detected.
    """
    target_id = extract_tweet_id(link)
    link_clean = link.split('?')[0].strip().rstrip('/')

    for filename in ["data/dataset.csv", "data/manual_dataset.csv"]:
        path = Path(filename)
        if not path.exists(): continue
        try:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                # Sniff a small sample to decide between DictReader (named
                # columns) and a positional reader for headerless files.
                sample = f.read(2048)
                f.seek(0)
                try: has_header = csv.Sniffer().has_header(sample)
                except: has_header = True

                if has_header:
                    reader = csv.DictReader(f)
                    for row in reader:
                        row_link = row.get('link', '').split('?')[0].strip().rstrip('/')
                        if row_link == link_clean: return True
                        row_id = row.get('id', '')
                        if target_id and row_id == target_id: return True
                else:
                    # Headerless: check membership against every cell in the row.
                    reader = csv.reader(f)
                    for row in reader:
                        if not row: continue
                        if link_clean in row: return True
                        if target_id and target_id in row: return True
        except Exception:
            # Unreadable/corrupt CSVs are treated as "not processed".
            continue
    return False
|
| 185 |
+
|
| 186 |
+
async def prepare_video_assets_async(url: str) -> dict:
    """Download (or locate) a video, normalize it to MP4, and extract audio/transcript.

    `url` may be a remote http(s) URL (fetched via yt-dlp) or a local file path.
    Returns {"video": <path to *_fixed.mp4>, "transcript": <path or None>,
    "metadata": {id, link, caption, ...}}.

    Raises FileNotFoundError for missing local files, ValueError when the URL
    has no video content, RuntimeError for download/encode failures.
    """
    global progress_message
    loop = asyncio.get_event_loop()
    is_local = not (url.startswith("http://") or url.startswith("https://"))
    video_id = "unknown"
    transcript_path = None

    logger.info(f"Preparing assets for URL: {url}")

    if is_local:
        original_path = Path(url)
        if not original_path.exists(): raise FileNotFoundError(f"File not found: {url}")
        # Stable synthetic ID derived from the path (not content).
        video_id = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:16]
        metadata = {"id": video_id, "link": url, "caption": original_path.stem}
    else:
        # Prefer the tweet's numeric status ID; fall back to a URL hash.
        tweet_id = extract_tweet_id(url)
        video_id = tweet_id if tweet_id else hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
        sanitized_check = Path(f"data/videos/{video_id}_fixed.mp4")

        cookies_path = get_cookies_path()
        ydl_opts = {
            'format': 'best[ext=mp4]/best',
            'outtmpl': 'data/videos/%(id)s.%(ext)s',
            'progress_hooks': [progress_hook],
            'quiet': False,
            'no_warnings': False,
            'noplaylist': True,
            'no_overwrites': True,
            # Also fetch any available English subtitles (manual or auto).
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['en'],
            # Browser-like UA reduces bot-detection failures on Twitter/X.
            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        }

        if cookies_path:
            ydl_opts['cookiefile'] = cookies_path
            logger.info(f"Using cookies from {cookies_path}")

        if sanitized_check.exists():
            # Already downloaded and sanitized on a previous run — reuse it.
            logger.info(f"Video {video_id} already cached at {sanitized_check}")
            original_path = sanitized_check
            metadata = {"id": video_id, "link": url, "caption": "Cached Video"}
        else:
            try:
                logger.info(f"Starting yt-dlp download for {video_id}...")
                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    # extract_info is blocking; run it off the event loop.
                    info = await loop.run_in_executor(None, lambda: ydl.extract_info(url, download=True))
                    original_path = Path(ydl.prepare_filename(info))
                    metadata = {
                        "id": info.get("id", video_id), "link": info.get("webpage_url", url),
                        # ASCII-only caption, truncated for CSV friendliness.
                        "caption": info.get("description", info.get("title", "N/A")).encode('ascii', 'ignore').decode('ascii').strip()[:500],
                        "postdatetime": info.get("upload_date", "N/A")
                    }
                    video_id = info.get("id", video_id)
                logger.info("yt-dlp download successful.")
            except yt_dlp.utils.DownloadError as e:
                logger.error(f"yt-dlp download error: {e}")
                # "No video" is a skippable condition for callers, not a hard error.
                if "No video could be found" in str(e):
                    raise ValueError(f"No video content found at {url}")
                raise RuntimeError(f"Download failed: {str(e)}")
            except Exception as e:
                logger.error(f"Unexpected yt-dlp error: {e}")
                raise RuntimeError(f"Download failed: {str(e)}")

    # Prefer an explicitly-English subtitle file, then any .vtt for this video.
    transcript_path = next(Path("data/videos").glob(f"{video_id}*.en.vtt"), None)
    if not transcript_path: transcript_path = next(Path("data/videos").glob(f"{video_id}*.vtt"), None)

    sanitized_path = Path(f"data/videos/{video_id}_fixed.mp4")

    # --- FFmpeg Sanitization Logic with Robust Fallback ---
    # Re-encode to H.264/AAC yuv420p for maximum downstream compatibility;
    # fall back to stream copy, then to a plain file copy for .mp4 inputs.
    if not sanitized_path.exists() and original_path.exists():
        logger.info(f"Sanitizing video {video_id} (Original: {original_path})...")
        ffmpeg_bin = shutil.which('ffmpeg')
        if not ffmpeg_bin: raise RuntimeError("FFmpeg binary not found in system path!")

        try:
            await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", "-y", str(sanitized_path)])
            logger.info("Sanitization (re-encode) successful.")
        except Exception as e:
            logger.warning(f"Re-encode failed ({e}). Attempting Stream Copy...")
            try:
                await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c", "copy", "-y", str(sanitized_path)])
                logger.info("Sanitization (copy) successful.")
            except Exception as e2:
                logger.error(f"Sanitization failed completely: {e2}")
                if original_path.suffix == '.mp4':
                    logger.warning("Using original file as sanitized file.")
                    shutil.copy(original_path, sanitized_path)
                else:
                    raise RuntimeError("Could not produce a valid MP4 file.")

    # --- Audio Extraction ---
    # 16 kHz mono PCM WAV: the format Whisper-style models expect.
    audio_path = sanitized_path.with_suffix('.wav')
    if not audio_path.exists() and sanitized_path.exists():
        logger.info(f"Extracting audio to {audio_path}...")
        try:
            await run_subprocess_async(["ffmpeg", "-i", str(sanitized_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", str(audio_path)])
            logger.info("Audio extraction successful.")
        except Exception as e:
            # Non-fatal: the pipeline can proceed without audio/transcript.
            logger.error(f"Audio extraction failed: {e}")

    # --- Transcription ---
    # Only run Whisper when no subtitle file was found and the model is loaded.
    if not transcript_path and audio_path.exists() and transcription.transcription_model is not None:
        logger.info("Generating transcript via Whisper...")
        transcript_path = await loop.run_in_executor(None, transcription.generate_transcript, str(audio_path))
    elif not transcript_path:
        logger.info("Skipping local transcription (Whisper not loaded or audio missing).")

    return {"video": str(sanitized_path), "transcript": str(transcript_path) if transcript_path else None, "metadata": metadata}
|
| 295 |
+
|
| 296 |
+
def safe_int(value):
    """Coerce an arbitrary value to int by keeping only its digit characters.

    Signs, percent marks, commas and text are all stripped; if nothing
    numeric remains (or coercion fails entirely), returns 0.
    """
    try:
        digits_only = re.sub(r'[^\d]', '', str(value))
    except Exception:
        return 0
    return int(digits_only) if digits_only else 0
|
| 302 |
+
|
| 303 |
+
async def generate_and_save_croissant_metadata(row_data: dict) -> str:
    """Write a minimal schema.org/Croissant-style JSON record for one label row.

    Returns the path of the written file, or "N/A (Error)" on any failure —
    this is best-effort metadata and errors never propagate to the caller.
    """
    try:
        # Keep only the fields the record needs, coercing scores to int.
        sanitized_data = {
            "id": str(row_data.get("id", "")),
            "link": str(row_data.get("link", "")),
            "visual_integrity_score": safe_int(row_data.get("visual_integrity_score")),
            "final_veracity_score": safe_int(row_data.get("final_veracity_score"))
        }
        video_id = sanitized_data["id"]
        # Timestamp in the filename keeps multiple labelings of one video distinct.
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        croissant_json = {
            "@context": "https://schema.org/",
            "@type": "Dataset",
            "name": f"vchat-label-{video_id}",
            "description": f"Veracity analysis labels for video {video_id}",
            "url": sanitized_data["link"],
            "variableMeasured": sanitized_data
        }
        path = Path("metadata") / f"{video_id}_{timestamp}.json"
        path.write_text(json.dumps(croissant_json, indent=2))
        return str(path)
    except Exception:
        return "N/A (Error)"
|
| 326 |
+
|
| 327 |
+
async def get_labels_for_link(video_url: str, gemini_config: dict, vertex_config: dict, model_selection: str, include_comments: bool, reasoning_method: str = "cot"):
    """Label a single video URL end to end.

    Async-generator protocol: yields progress strings while working, then one
    final dict — {"csv_row", "full_json", "raw_toon"} on success, or
    {"error": ...} on a fatal failure. On a skip/asset failure it simply
    returns after a message without yielding a final dict.

    Args:
        video_url: link to the video post to analyze.
        gemini_config / vertex_config: provider settings; only the one
            matching model_selection is used.
        model_selection: 'gemini' selects the Gemini pipeline, anything else
            selects the Vertex pipeline.
        include_comments: forwarded to the inference pipeline unchanged.
        reasoning_method: prompt-strategy tag (e.g. "cot"), forwarded as-is.
    """
    try:
        yield f"Downloading assets for {video_url}..."

        # Stage 1: download video/transcript/metadata. ValueError is treated
        # as a deliberate "skip this video" signal from the downloader.
        try:
            paths = await prepare_video_assets_async(video_url)
        except ValueError as ve:
            yield f"Skipped: {str(ve)}"
            logger.warning(f"Skipping {video_url}: {ve}")
            return
        except Exception as e:
            yield f"Error preparing assets: {str(e)}"
            logger.error(f"Asset prep failed for {video_url}: {e}")
            return

        video_path = paths["video"]
        # Transcript is optional; fall back to a sentinel string the prompts understand.
        transcript_text = parse_vtt(paths["transcript"]) if paths["transcript"] else "No transcript (Audio/Video Analysis only)."
        caption = paths["metadata"].get("caption", "")

        yield f"Assets ready. Running inference ({model_selection}, {reasoning_method.upper()})..."
        logger.info(f"Starting inference pipeline for {video_url} (Transcript len: {len(transcript_text)})")

        final_labels = None
        raw_toon = ""
        prompt_used = ""

        # Stage 2: pick the provider pipeline and stream its messages through.
        pipeline = inference_logic.run_gemini_labeling_pipeline if model_selection == 'gemini' else inference_logic.run_vertex_labeling_pipeline
        config = gemini_config if model_selection == 'gemini' else vertex_config

        # Add timeout protection for inference
        try:
            # The pipeline yields progress strings, an {"error": ...} dict, or
            # the final dict carrying "parsed_data" (+ raw_toon/prompt_used).
            async for msg in pipeline(video_path, caption, transcript_text, config, include_comments, reasoning_method):
                if isinstance(msg, dict) and "parsed_data" in msg:
                    final_labels = msg["parsed_data"]
                    raw_toon = msg.get("raw_toon", "")
                    prompt_used = msg.get("prompt_used", "")
                    logger.info("Inference successful. Data parsed.")
                elif isinstance(msg, str):
                    yield msg
                elif isinstance(msg, dict) and "error" in msg:
                    yield f"API Error: {msg['error']}"
        except Exception as pipe_err:
            logger.error(f"Pipeline crashed: {pipe_err}")
            yield f"Critical Pipeline Failure: {pipe_err}"
            return

        if not final_labels:
            logger.error(f"Inference pipeline completed but returned no labels for {video_url}")
            yield "No labels generated. Check logs."
            return

        # Record provenance alongside the labels for later auditing.
        final_labels["meta_info"] = {
            "prompt_used": prompt_used,
            "model_selection": model_selection,
            "reasoning_method": reasoning_method
        }

        # Stage 3: flatten the nested label structure into a CSV-friendly row.
        vec = final_labels.get("veracity_vectors", {})
        mod = final_labels.get("modalities", {})
        fin = final_labels.get("final_assessment", {})

        row = {
            "id": paths["metadata"]["id"],
            "link": paths["metadata"]["link"],
            "caption": caption,
            "postdatetime": paths["metadata"].get("postdatetime", ""),
            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "videotranscriptionpath": paths["transcript"] or "",
            # Scores default to the string "0" when the model omitted them.
            "visual_integrity_score": vec.get("visual_integrity_score", "0"),
            "audio_integrity_score": vec.get("audio_integrity_score", "0"),
            "source_credibility_score": vec.get("source_credibility_score", "0"),
            "logical_consistency_score": vec.get("logical_consistency_score", "0"),
            "emotional_manipulation_score": vec.get("emotional_manipulation_score", "0"),
            "video_audio_score": mod.get("video_audio_score", "0"),
            "video_caption_score": mod.get("video_caption_score", "0"),
            "audio_caption_score": mod.get("audio_caption_score", "0"),
            "final_veracity_score": fin.get("veracity_score_total", "0"),
            "final_reasoning": fin.get("reasoning", "")
        }
        yield {"csv_row": row, "full_json": final_labels, "raw_toon": raw_toon}

    except Exception as e:
        logger.error(f"Fatal error in get_labels_for_link: {e}", exc_info=True)
        yield {"error": str(e)}
|
| 411 |
+
|
| 412 |
+
@app.get("/queue/list")
async def get_queue_list():
    """Return the batch queue as a list of {link, timestamp, status} dicts.

    Status is derived per link via check_if_processed(). Returns [] when the
    queue file does not exist yet.
    """
    queue_path = Path("data/batch_queue.csv")
    if not queue_path.exists():
        return []
    items = []
    with open(queue_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip the header row; next(reader, None) is a no-op on an empty file
        # (the previous bare `except:` also swallowed KeyboardInterrupt etc.).
        next(reader, None)
        for row in reader:
            if not row:
                continue  # ignore blank lines
            link = row[0]
            items.append({
                "link": link,
                "timestamp": row[1] if len(row) > 1 else "",
                "status": "Processed" if check_if_processed(link) else "Pending"
            })
    return items
|
| 431 |
+
|
| 432 |
+
@app.delete("/queue/delete")
async def delete_queue_item(link: str):
    """Remove a single link from data/batch_queue.csv.

    Returns {"status": "success"} when the link was removed,
    {"status": "not_found"} when absent, and {"status": "error"} on I/O
    problems or a missing queue file.
    """
    queue_path = Path("data/batch_queue.csv")
    if not queue_path.exists():
        return {"status": "error", "message": "Queue file not found"}

    rows = []
    deleted = False
    try:
        with open(queue_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            rows = list(reader)

        new_rows = []
        # Preserve the header row if present. Guard against an empty first
        # row (blank line), which previously made rows[0][0] raise IndexError.
        if rows and rows[0] and rows[0][0] == "link":
            new_rows.append(rows[0])
            rows = rows[1:]

        for row in rows:
            if not row:
                continue  # drop blank lines while rewriting
            if row[0] == link:
                deleted = True
            else:
                new_rows.append(row)

        with open(queue_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerows(new_rows)

        if deleted:
            return {"status": "success", "link": link}
        else:
            return {"status": "not_found", "message": "Link not found in queue"}

    except Exception as e:
        return {"status": "error", "message": str(e)}
|
| 468 |
+
|
| 469 |
+
@app.post("/queue/stop")
async def stop_queue_processing():
    # Flip the module-level stop flag; the batch loop in /queue/run checks it
    # between items and exits at the next opportunity (cooperative stop).
    global STOP_QUEUE_SIGNAL
    logger.info("Received Stop Signal from User.")
    STOP_QUEUE_SIGNAL = True
    return {"status": "stopping"}
|
| 475 |
+
|
| 476 |
+
@app.post("/queue/upload_csv")
async def upload_csv_to_queue(file: UploadFile = File(...)):
    """Ingest a CSV of video links into data/batch_queue.csv.

    Accepts a header containing 'link' or 'url' (case-insensitive); if the
    first row already looks like a URL the file is treated as headerless.
    Links already queued — or repeated within the same upload — are skipped.
    Returns {"status": "success", "added": N} or a 400 JSON error payload.
    """
    try:
        content = await file.read()
        try:
            decoded = content.decode('utf-8').splitlines()
        except UnicodeDecodeError:
            # latin-1 can decode any byte sequence, so this cannot fail.
            decoded = content.decode('latin-1').splitlines()

        reader = csv.reader(decoded)
        links_to_add = []
        header = next(reader, None)
        if not header:
            return {"status": "empty file"}

        link_idx = 0
        header_lower = [h.lower() for h in header]

        if "link" in header_lower:
            link_idx = header_lower.index("link")
        elif "url" in header_lower:
            link_idx = header_lower.index("url")
        elif len(header) > 0 and header[0].strip().startswith("http"):
            # Headerless file: the first row is already a link.
            links_to_add.append(header[0])
            link_idx = 0

        for row in reader:
            if len(row) > link_idx and row[link_idx].strip():
                links_to_add.append(row[link_idx].strip())

        queue_path = Path("data/batch_queue.csv")
        existing_links = set()
        if queue_path.exists():
            with open(queue_path, 'r', encoding='utf-8') as f:
                q_reader = csv.reader(f)
                next(q_reader, None)  # skip header
                # Exact first-column match (the old substring check could
                # false-positive when one link is a prefix of another).
                existing_links = {r[0] for r in q_reader if r}

        # Decide on the header BEFORE opening with 'a' — append mode creates
        # the file, so exists() would always be True afterwards.
        needs_header = not queue_path.exists() or queue_path.stat().st_size == 0

        added_count = 0
        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if needs_header:
                writer.writerow(["link", "ingest_timestamp"])

            for link in links_to_add:
                if link in existing_links:
                    continue
                writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
                existing_links.add(link)  # dedupe repeats within this upload too
                added_count += 1

        return {"status": "success", "added": added_count}
    except Exception as e:
        logger.error(f"Upload CSV error: {e}")  # module logger, not root logging
        return JSONResponse(status_code=400, content={"error": str(e), "status": "failed"})
|
| 530 |
+
|
| 531 |
+
@app.post("/queue/run")
async def run_queue_processing(
    model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    # Process every pending link in the batch queue, streaming progress to the
    # client as Server-Sent Events. Honors the global STOP_QUEUE_SIGNAL flag
    # (set by /queue/stop) between items.
    global STOP_QUEUE_SIGNAL
    STOP_QUEUE_SIGNAL = False
    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}

    async def queue_stream():
        # SSE generator: every yielded chunk is a "data: ...\n\n" frame.
        queue_path = Path("data/batch_queue.csv")
        if not queue_path.exists():
            yield "data: Queue empty.\n\n"
            return

        items = []
        with open(queue_path, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            try: next(reader)  # skip header row if any
            except: pass
            for row in reader:
                if row: items.append(row[0])

        processed_count = 0
        total = len(items)

        logger.info(f"Starting batch queue processing for {total} items.")

        for i, link in enumerate(items):
            # Cooperative stop requested via /queue/stop.
            if STOP_QUEUE_SIGNAL:
                yield "data: [SYSTEM] Stopped by user.\n\n"
                logger.info("Stopping queue loop.")
                break

            # Idempotency: skip links already present in the output dataset.
            if check_if_processed(link):
                yield f"data: [SKIP] {link} processed.\n\n"
                continue

            yield f"data: [START] {i+1}/{total}: {link}\n\n"
            final_data = None

            # Streaming results from pipeline
            async for res in get_labels_for_link(link, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
                if isinstance(res, str):
                    # Raw newlines would break SSE framing; flatten them.
                    msg = res.replace('\n', ' ')
                    yield f"data: {msg}\n\n"
                if isinstance(res, dict):
                    if "error" in res:
                        yield f"data: [ERROR DETAIL] {res['error']}\n\n"
                    if "csv_row" in res:
                        # Final payload from the pipeline; persisted below.
                        final_data = res

            if final_data:
                row = final_data["csv_row"]
                vid_id = row["id"]
                ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')

                # Save artifacts
                json_path = f"data/labels/{vid_id}_{ts}_labels.json"
                with open(json_path, 'w') as f: json.dump(final_data["full_json"], f, indent=2)
                with open(f"data/labels/{vid_id}_{ts}.toon", 'w') as f: f.write(final_data["raw_toon"])

                # Persist the exact prompt used, for reproducibility.
                prompt_content = final_data.get("full_json", {}).get("meta_info", {}).get("prompt_used", "")
                if prompt_content:
                    with open(f"data/prompts/{vid_id}_{ts}_prompt.txt", 'w', encoding='utf-8') as f:
                        f.write(prompt_content)

                # Raw model response (no timestamp in the name: last one wins).
                raw_response = final_data.get("raw_toon", "")
                if raw_response:
                    with open(f"data/responses/{vid_id}.txt", 'w', encoding='utf-8') as f:
                        f.write(raw_response)

                # Croissant metadata sidecar; returns "N/A (Error)" on failure.
                row["metadatapath"] = await generate_and_save_croissant_metadata(row)
                row["json_path"] = json_path

                # Append the flattened row to the cumulative dataset CSV.
                dpath = Path("data/dataset.csv")
                exists = dpath.exists()
                with open(dpath, 'a', newline='', encoding='utf-8') as f:
                    writer = csv.DictWriter(f, fieldnames=list(row.keys()), extrasaction='ignore')
                    if not exists: writer.writeheader()
                    writer.writerow(row)

                processed_count += 1
                yield f"data: [SUCCESS] Labeled.\n\n"
            else:
                yield f"data: [FAIL] Failed to label. Check logs.\n\n"

        yield f"data: Batch Complete. +{processed_count} videos labeled.\n\n"
        yield "event: close\ndata: Done\n\n"

    return StreamingResponse(queue_stream(), media_type="text/event-stream")
|
| 626 |
+
|
| 627 |
+
@app.post("/extension/ingest")
async def extension_ingest(request: Request):
    """Queue a single link sent by the browser extension.

    Body: {"link": str}. Duplicate links are acknowledged without re-queuing.
    Raises 400 when no link is supplied, 500 on unexpected I/O failures.
    """
    try:
        data = await request.json()
        link = data.get("link")
        if not link:
            raise HTTPException(status_code=400, detail="No link")
        queue_path = Path("data/batch_queue.csv")
        file_exists = queue_path.exists()

        # Best-effort duplicate check on the raw file contents.
        if file_exists:
            with open(queue_path, 'r', encoding='utf-8') as f:
                if link in f.read():
                    return {"status": "queued", "msg": "Duplicate"}

        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow(["link", "ingest_timestamp"])
            writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])

        return {"status": "queued", "link": link}
    except HTTPException:
        # Re-raise as-is: without this clause the 400 above was caught by the
        # generic handler below and converted into a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 649 |
+
|
| 650 |
+
@app.post("/extension/save_comments")
async def extension_save_comments(request: Request):
    """Append extension-scraped comments for a link to data/comments.csv.

    Body: {"link": str, "comments": [{"author", "text"} | str, ...]}.
    Rows with empty comment text are skipped; the reported count is the
    number of comments received, not written. Raises 400 on missing data.
    """
    try:
        data = await request.json()
        link = data.get("link")
        comments = data.get("comments", [])
        if not link or not comments:
            raise HTTPException(status_code=400, detail="Missing data")

        csv_path = Path("data/comments.csv")
        exists = csv_path.exists()
        fieldnames = ["link", "author", "comment_text", "timestamp"]

        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists:
                writer.writeheader()

            ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for c in comments:
                row = {"link": link, "timestamp": ts}
                # Comments may arrive either as dicts or as bare strings.
                if isinstance(c, dict):
                    row["author"] = c.get("author", "Unknown")
                    row["comment_text"] = c.get("text", "").strip()
                else:
                    row["author"] = "Unknown"
                    row["comment_text"] = str(c).strip()

                if row["comment_text"]:
                    writer.writerow(row)

        return {"status": "saved", "count": len(comments)}
    except HTTPException:
        # Preserve the intended 400 instead of masking it as a 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 682 |
+
|
| 683 |
+
@app.post("/extension/save_manual")
async def extension_save_manual(request: Request):
    """Persist manually-entered labels from the browser extension.

    Body: {"link": str, "labels": {...scores...}, "stats": {...}, "caption": str}.
    Appends one flattened row to data/manual_dataset.csv. Raises 400 when no
    link is supplied, 500 on unexpected failures.
    """
    try:
        data = await request.json()
        link = data.get("link")
        labels = data.get("labels", {})
        stats = data.get("stats", {})
        if not link:
            raise HTTPException(status_code=400, detail="No link")

        # Stable id: tweet id when parsable, else an md5 prefix of the link.
        video_id = extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:16]

        row_data = {
            "id": video_id,
            "link": link,
            "caption": data.get("caption", ""),
            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "source": "manual_extension",
            # Score fields default to 0 when the extension omitted them.
            "visual_integrity_score": labels.get("visual_integrity_score", 0),
            "audio_integrity_score": labels.get("audio_integrity_score", 0),
            "source_credibility_score": labels.get("source_credibility_score", 0),
            "logical_consistency_score": labels.get("logical_consistency_score", 0),
            "emotional_manipulation_score": labels.get("emotional_manipulation_score", 0),
            "video_audio_score": labels.get("video_audio_score", 0),
            "video_caption_score": labels.get("video_caption_score", 0),
            "audio_caption_score": labels.get("audio_caption_score", 0),
            "final_veracity_score": labels.get("final_veracity_score", 0),
            "final_reasoning": labels.get("reasoning", ""),
            "stats_likes": stats.get("likes", 0),
            "stats_shares": stats.get("shares", 0),
            "stats_comments": stats.get("comments", 0),
            "stats_platform": stats.get("platform", "unknown")
        }

        dpath = Path("data/manual_dataset.csv")
        exists = dpath.exists()
        fieldnames = list(row_data.keys())

        with open(dpath, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists:
                writer.writeheader()
            writer.writerow(row_data)

        return {"status": "saved"}
    except HTTPException:
        # Keep the 400 above from being downgraded to a 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 728 |
+
|
| 729 |
+
@app.get("/manage/list")
async def list_data():
    """Return merged rows from the auto and manual datasets, newest first.

    Backfills missing ids from the link, tags each row with its source type,
    and inlines the per-video label JSON when the sidecar file is readable.
    """
    data = []

    def read_csv(path, source_type):
        # Load one dataset CSV into `data`, annotating rows in place.
        if not path.exists():
            return
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if not row.get('id') or row['id'].strip() == "":
                    # Derive a stable id: tweet id when parsable, else md5 prefix.
                    link = row.get('link', '')
                    tid = extract_tweet_id(link)
                    row['id'] = tid if tid else hashlib.md5(link.encode()).hexdigest()[:16]

                json_content = None
                if row.get('json_path') and os.path.exists(row['json_path']):
                    try:
                        with open(row['json_path'], 'r') as jf:
                            json_content = json.load(jf)
                    except (OSError, ValueError):
                        # Unreadable or corrupt sidecar (JSONDecodeError is a
                        # ValueError): serve the row without it. The old bare
                        # `except:` also swallowed KeyboardInterrupt/SystemExit.
                        pass

                row['source_type'] = source_type
                row['json_data'] = json_content
                data.append(row)

    read_csv(Path("data/dataset.csv"), "auto")
    read_csv(Path("data/manual_dataset.csv"), "manual")
    # Newest first; rows without a collecttime sort last.
    data.sort(key=lambda x: x.get('collecttime', ''), reverse=True)
    return data
|
| 756 |
+
|
| 757 |
+
@app.delete("/manage/delete")
async def delete_data(id: str = "", link: str = ""):
    # Delete rows matching the given id or link from both dataset CSVs, then
    # remove the associated label/metadata artifact files from disk.
    if not id and not link: raise HTTPException(status_code=400, detail="Must provide ID or Link")
    deleted_count = 0
    # May be filled in from a matched row when only a link was supplied.
    target_id = id

    def remove_from_csv(path):
        # Rewrite `path` without the matching rows; mutates the outer counters.
        nonlocal deleted_count, target_id
        if not path.exists(): return
        rows = []
        found_in_file = False
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames
            for row in reader:
                is_match = False
                if id and row.get('id') == id: is_match = True
                elif link and row.get('link') == link: is_match = True
                if is_match:
                    found_in_file = True
                    deleted_count += 1
                    # Remember the id so artifact files can be cleaned up below.
                    if not target_id: target_id = row.get('id')
                else: rows.append(row)
        if found_in_file:
            # Only rewrite the file when something actually changed.
            with open(path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(rows)

    remove_from_csv(Path("data/dataset.csv"))
    remove_from_csv(Path("data/manual_dataset.csv"))
    if target_id:
        # Best-effort artifact cleanup; already-missing files are ignored.
        for p in Path("data/labels").glob(f"{target_id}_*"): p.unlink(missing_ok=True)
        for p in Path("metadata").glob(f"{target_id}_*"): p.unlink(missing_ok=True)
    return {"status": "deleted", "count": deleted_count}
|
| 792 |
+
|
| 793 |
+
@app.post("/label_video")
async def label_video_endpoint(
    video_url: str = Form(...), model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    """Label a single video URL, streaming progress back as Server-Sent Events."""
    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}

    async def stream():
        # Relay pipeline progress; a dict carrying "csv_row" marks completion.
        async for event in get_labels_for_link(video_url, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
            if isinstance(event, str):
                yield f"data: {event}\n\n"
            if isinstance(event, dict) and "csv_row" in event:
                yield "data: Done. Labels generated.\n\n"
        yield "event: close\ndata: Done.\n\n"

    return StreamingResponse(stream(), media_type="text/event-stream")
|
src/factuality_logic.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# factuality_logic.py
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import asyncio
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import inference_logic
|
| 9 |
+
from toon_parser import parse_toon_line
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
# Prompt templates for the standalone factuality checks. Each instructs the
# model to reason inside a <thinking> block and emit a one-row TOON table
# ("<name>_analysis: result[2]{score,justification}") that
# run_factuality_pipeline parses with a regex.

PROMPT_VISUAL_ARTIFACTS = (
    "Analyze the video for visual manipulation (Deepfakes, editing anomalies).\n"
    "Steps inside <thinking>: 1. Scan for artifacts. 2. Check cuts.\n"
    "Output TOON format:\n"
    "visual_analysis: result[2]{score,justification}:\n"
    "Score(1-10),\"Justification text\""
)

# The {transcript} placeholder is filled via str.format before sending.
PROMPT_CONTENT_ANALYSIS = (
    "Analyze the content for accuracy and logic.\n"
    "Steps inside <thinking>: 1. Identify claims. 2. Check fallacies. 3. Assess emotion.\n"
    "**Transcript:**\n{transcript}\n"
    "Output TOON format:\n"
    "content_analysis: result[2]{score,justification}:\n"
    "Score(1-10),\"Justification text\""
)

# The {transcript} placeholder is filled via str.format before sending.
PROMPT_AUDIO_ANALYSIS = (
    "Analyze audio for synthesis or manipulation.\n"
    "Steps inside <thinking>: 1. Listen for robotic inflections. 2. Check lip-sync.\n"
    "**Transcript:**\n{transcript}\n"
    "Output TOON format:\n"
    "audio_analysis: result[2]{score,justification}:\n"
    "Score(1-10),\"Justification text\""
)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def parse_vtt(file_path: str) -> str:
    """Extract plain speech text from a WebVTT subtitle file.

    Strips the WEBVTT signature and the 'Kind:'/'Language:' header metadata
    that yt-dlp emits (previously these leaked into the transcript), cue
    numbers, timestamp lines, inline markup tags, and consecutive duplicate
    lines (common in rolling captions).

    Args:
        file_path: path to the .vtt file.

    Returns:
        Newline-joined transcript text, or a human-readable message when the
        file is missing, contains no speech, or cannot be read.
    """
    try:
        if not os.path.exists(file_path):
            return "Transcript file not found."

        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        text_lines = []
        for line in lines:
            line = line.strip()
            # Skip blanks, cue numbers, and timing lines.
            if not line or line.isdigit() or '-->' in line:
                continue
            # Skip the file signature and yt-dlp header metadata lines.
            if line.startswith(('WEBVTT', 'Kind:', 'Language:')):
                continue
            # Drop inline tags like <i>, <c>, or <00:00:01.000> timestamps.
            clean_line = re.sub(r'<[^>]+>', '', line)
            # Collapse consecutive duplicates produced by rolling captions.
            if clean_line and (not text_lines or clean_line != text_lines[-1]):
                text_lines.append(clean_line)

        return "\n".join(text_lines) if text_lines else "No speech found in transcript."
    except Exception as e:
        logger.error(f"Error parsing VTT file {file_path}: {e}")
        return f"Error reading transcript: {e}"
|
| 60 |
+
|
| 61 |
+
async def run_factuality_pipeline(paths: dict, checks: dict, generation_config: dict):
    """Run the selected factuality checks on a downloaded video.

    Async generator yielding human-readable progress/result text chunks.

    Args:
        paths: {"video": path, "transcript": optional VTT path}.
        checks: flags {"visuals", "content", "audio"} selecting which analyses run.
        generation_config: sampling kwargs; "sampling_fps" and "num_perceptions"
            are popped out before the rest is passed to the model.
    """
    video_path = paths.get("video")
    transcript_path = paths.get("transcript")

    if not video_path:
        yield "ERROR: Video path not found. Cannot start analysis.\n\n"
        return

    yield "Step 1: Processing Transcript...\n"
    await asyncio.sleep(0.1)  # brief pause lets the event loop flush streamed output
    transcript = "No transcript was downloaded for this video."
    if transcript_path and os.path.exists(transcript_path):
        transcript = parse_vtt(transcript_path)
        yield f" - Transcript file found and processed.\n"
    else:
        yield f" - No transcript file was found.\n"

    yield f"\n--- Extracted Transcript ---\n{transcript}\n--------------------------\n\n"
    await asyncio.sleep(0.1)

    # Assemble the (title, prompt) analyses the caller enabled.
    analysis_steps = []
    if checks.get("visuals"):
        analysis_steps.append(("Visual Integrity", PROMPT_VISUAL_ARTIFACTS))
    if checks.get("content"):
        analysis_steps.append(("Content Veracity", PROMPT_CONTENT_ANALYSIS.format(transcript=transcript)))
    if checks.get("audio"):
        analysis_steps.append(("Audio Forensics", PROMPT_AUDIO_ANALYSIS.format(transcript=transcript)))

    for i, (title, prompt) in enumerate(analysis_steps):
        yield f"--- Step {i + 2}: Running '{title}' Analysis ---\n"
        yield "(Model is generating TOON analysis with scores...)\n\n"
        await asyncio.sleep(0.1)

        try:
            # Copy so per-step tweaks don't leak into the caller's config.
            current_gen_config = generation_config.copy()
            sampling_fps = current_gen_config.pop("sampling_fps", 2.0)
            current_gen_config.pop("num_perceptions", None)

            # Low temperature for deterministic, parseable scoring output.
            current_gen_config["temperature"] = 0.1
            current_gen_config["do_sample"] = True

            ans = inference_logic.inference_step(
                video_path=video_path,
                prompt=prompt,
                generation_kwargs=current_gen_config,
                sampling_fps=sampling_fps,
                pred_glue=None
            )

            yield f" - Analysis Complete for '{title}'. Parsing TOON...\n\n"

            parsed_result = {}
            # Expect e.g. "visual_analysis: result[2]{score,justification}:\n<row>".
            match = re.search(r'(\w+_analysis): result\[2\]\{score,justification\}:\s*\n(.+)', ans, re.MULTILINE)

            thinking = "No thinking block found."
            think_match = re.search(r'<thinking>(.*?)</thinking>', ans, re.DOTALL)
            if think_match:
                thinking = think_match.group(1).strip()

            if match:
                key, value_line = match.groups()
                parsed_result = parse_toon_line({'key': key, 'headers': ['score', 'justification']}, value_line.strip())
            else:
                # Unparseable output: surface the raw text and move to the next step.
                logger.warning(f"Could not parse TOON for '{title}'. Raw: {ans}")
                yield f"Warning: Model did not return valid TOON. Raw output:\n{ans}\n"
                continue

            score = parsed_result.get('score', 'N/A')
            justification = parsed_result.get('justification', 'No justification provided.')

            yield f"===== ANALYSIS RESULT: {title.upper()} =====\n"
            yield f"SCORE: {score}/10\n"
            yield f"Reasoning (Step-by-Step): {thinking}\n"
            yield f"Final Justification: {justification}\n\n"
            yield f"========================================\n\n"

        except Exception as e:
            # Abort the remaining analyses on the first hard failure.
            error_message = f"An error occurred during the '{title}' analysis step: {e}"
            logger.error(error_message, exc_info=True)
            yield f"ERROR: {error_message}\n\n"
            break

    yield "Factuality Analysis Pipeline Finished.\n"
|
src/inference_logic.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import sys
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import logging
|
| 6 |
+
import asyncio
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
# Safe imports for Lite Mode (API only)
|
| 10 |
+
try:
|
| 11 |
+
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
|
| 12 |
+
from peft import PeftModel
|
| 13 |
+
except ImportError:
|
| 14 |
+
Qwen3VLForConditionalGeneration = None
|
| 15 |
+
AutoProcessor = None
|
| 16 |
+
PeftModel = None
|
| 17 |
+
|
| 18 |
+
from labeling_logic import (
|
| 19 |
+
LABELING_PROMPT_TEMPLATE, SCORE_INSTRUCTIONS_SIMPLE, SCORE_INSTRUCTIONS_REASONING,
|
| 20 |
+
SCHEMA_SIMPLE, SCHEMA_REASONING,
|
| 21 |
+
FCOT_MACRO_PROMPT, FCOT_MESO_PROMPT, FCOT_SYNTHESIS_PROMPT
|
| 22 |
+
)
|
| 23 |
+
from toon_parser import parse_veracity_toon
|
| 24 |
+
|
| 25 |
+
# Optional local imports
|
| 26 |
+
try:
|
| 27 |
+
from my_vision_process import process_vision_info, client
|
| 28 |
+
except ImportError:
|
| 29 |
+
process_vision_info = None
|
| 30 |
+
client = None
|
| 31 |
+
|
| 32 |
+
# Google GenAI Imports
|
| 33 |
+
try:
|
| 34 |
+
import google.generativeai as genai_legacy
|
| 35 |
+
from google.generativeai.types import generation_types, HarmCategory, HarmBlockThreshold
|
| 36 |
+
except ImportError:
|
| 37 |
+
genai_legacy = None
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
# Modern Google GenAI SDK (v1)
|
| 41 |
+
from google import genai
|
| 42 |
+
from google.genai.types import (
|
| 43 |
+
GenerateContentConfig,
|
| 44 |
+
HttpOptions,
|
| 45 |
+
Retrieval,
|
| 46 |
+
Tool,
|
| 47 |
+
VertexAISearch,
|
| 48 |
+
GoogleSearch,
|
| 49 |
+
Part,
|
| 50 |
+
SafetySetting
|
| 51 |
+
)
|
| 52 |
+
import vertexai
|
| 53 |
+
except ImportError:
|
| 54 |
+
genai = None
|
| 55 |
+
vertexai = None
|
| 56 |
+
|
| 57 |
+
LITE_MODE = os.getenv("LITE_MODE", "true").lower() == "true"
|
| 58 |
+
processor = None
|
| 59 |
+
base_model = None
|
| 60 |
+
peft_model = None
|
| 61 |
+
active_model = None
|
| 62 |
+
logger = logging.getLogger(__name__)
|
| 63 |
+
|
| 64 |
+
def load_models():
    # Intentional no-op: in Lite Mode (API-only) no local Qwen/PEFT weights
    # are loaded. Kept so existing callers of the load hook still work.
    pass
|
| 66 |
+
|
| 67 |
+
async def attempt_toon_repair(original_text: str, schema: str, client, model_type: str, config: dict):
    """Ask the backing LLM to reformat malformed output into the strict TOON schema.

    Args:
        original_text: Raw (possibly malformed) model output to repair.
        schema: TOON schema text the output must conform to.
        client: Optional pre-built google-genai client (Vertex path); a new one
            is created from ``config`` when None.
        model_type: 'gemini' (legacy SDK) or 'vertex' (google-genai SDK).
        config: Backend settings; the Vertex path reads 'project_id',
            'location' and 'model_name'.

    Returns:
        The repaired text on success, otherwise ``original_text`` unchanged
        (on any exception or unrecognized ``model_type``).
    """
    logger.info("Attempting TOON Repair...")
    repair_prompt = f"SYSTEM: Reformat the following text into strict TOON schema. Infer missing scores as 0.\n\nSCHEMA:\n{schema}\n\nINPUT:\n{original_text}\n"
    try:
        # get_running_loop() is the non-deprecated form inside a coroutine
        # (get_event_loop() emits a DeprecationWarning since Python 3.10).
        loop = asyncio.get_running_loop()
        if model_type == 'gemini':
            model = genai_legacy.GenerativeModel("models/gemini-2.0-flash-exp")
            response = await loop.run_in_executor(None, lambda: model.generate_content(repair_prompt))
            return response.text
        elif model_type == 'vertex':
            cl = client if client else genai.Client(vertexai=True, project=config['project_id'], location=config['location'])
            response = await loop.run_in_executor(None, lambda: cl.models.generate_content(model=config['model_name'], contents=repair_prompt))
            return response.text
        # Bug fix: an unknown backend previously returned "" which silently
        # discarded the caller's text; fall back to the unmodified input.
        logger.warning(f"Unknown model_type '{model_type}' for TOON repair; skipping.")
        return original_text
    except Exception as e:
        logger.error(f"Repair failed: {e}")
        return original_text
|
| 85 |
+
|
| 86 |
+
async def run_gemini_labeling_pipeline(video_path: str, caption: str, transcript: str, gemini_config: dict, include_comments: bool, reasoning_method: str = "cot"):
    """Label a video for factuality via the legacy Gemini (google-generativeai) SDK.

    Async generator: yields human-readable progress strings while working,
    then one final dict with keys 'raw_toon', 'parsed_data', 'prompt_used'
    (or {'error': ...} on an empty response).

    Args:
        video_path: Local path to an mp4 file; uploaded via the Files API.
        caption: User-supplied caption inserted into the prompt.
        transcript: Audio transcript inserted into the prompt.
        gemini_config: Requires 'api_key'; optional 'model_name'.
        include_comments: True selects the per-score-reasoning schema.
        reasoning_method: 'fcot' runs the 3-stage Fractal Chain of Thought;
            any other value runs a single-shot prompt.
    """
    if genai_legacy is None:
        yield "ERROR: Legacy SDK missing.\n"
        return

    api_key = gemini_config.get("api_key")
    if not api_key:
        yield "ERROR: No Gemini API Key provided."
        return

    logger.info(f"[Gemini] Initializing with model {gemini_config.get('model_name')}")

    # All safety filters disabled: the analysis must be able to inspect
    # potentially harmful content to judge it.
    safety_settings = [
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
    ]

    try:
        genai_legacy.configure(api_key=api_key)
        loop = asyncio.get_event_loop()

        # 1. Upload File
        logger.info(f"[Gemini] Uploading video file: {video_path}...")
        yield f"Uploading video to Gemini..."

        uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.upload_file(path=video_path, mime_type="video/mp4"))
        logger.info(f"[Gemini] Upload complete. URI: {uploaded_file.uri} | State: {uploaded_file.state.name}")

        # 2. Wait for Processing (Fix: Refresh state in loop)
        wait_start = time.time()
        while True:
            # Refresh file status
            uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.get_file(uploaded_file.name))
            state_name = uploaded_file.state.name

            if state_name == "ACTIVE":
                logger.info("[Gemini] Video processing complete. Ready for inference.")
                break
            elif state_name == "FAILED":
                logger.error(f"[Gemini] Video processing failed on server side.")
                yield "ERROR: Google failed to process video."
                return

            if time.time() - wait_start > 300: # 5 minute timeout
                logger.error("[Gemini] Video processing timed out.")
                yield "ERROR: Video processing timed out."
                return

            logger.info(f"[Gemini] Processing video... (State: {state_name})")
            yield "Processing video on Google servers..."
            await asyncio.sleep(5)

        # 3. Prepare Inference
        model_name = gemini_config.get("model_name") or "models/gemini-2.0-flash-exp"
        model = genai_legacy.GenerativeModel(model_name)
        # Richer schema (per-score reasoning) when comments are requested.
        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE

        raw_text = ""
        prompt_used = ""
        gen_config = {"temperature": 0.1}

        logger.info(f"[Gemini] Starting inference with method: {reasoning_method}")

        if reasoning_method == "fcot":
            # Fractal CoT: macro hypothesis -> meso verification -> synthesis,
            # carried across a single chat session so context accumulates.
            yield "Starting FCoT (Gemini)..."
            chat = model.start_chat(history=[])

            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
            logger.info("[Gemini] Sending Macro Prompt...")
            res1 = await loop.run_in_executor(None, lambda: chat.send_message([uploaded_file, macro_prompt], safety_settings=safety_settings))
            macro_hypothesis = res1.text
            yield f"Hypothesis: {macro_hypothesis[:100]}...\n"

            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
            logger.info("[Gemini] Sending Meso Prompt...")
            res2 = await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt, safety_settings=safety_settings))

            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
            logger.info("[Gemini] Sending Synthesis Prompt...")
            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt, safety_settings=safety_settings))

            raw_text = res3.text
            prompt_used = f"FCoT:\n{macro_prompt}\n..."
        else:
            prompt_text = LABELING_PROMPT_TEMPLATE.format(caption=caption, transcript=transcript, toon_schema=toon_schema, score_instructions=score_instructions)
            prompt_used = prompt_text
            yield f"Generating Labels ({model_name})..."
            logger.info("[Gemini] Sending standard generation request...")
            response = await loop.run_in_executor(
                None,
                lambda: model.generate_content([prompt_text, uploaded_file], generation_config=gen_config, safety_settings=safety_settings)
            )
            raw_text = response.text

        # Log response info
        logger.info(f"[Gemini] Response received. Length: {len(raw_text)}")
        if not raw_text:
            yield "Model returned empty response (Check API quota or safety)."
            yield {"error": "Empty Response - likely safety block"}
            return

        parsed_data = parse_veracity_toon(raw_text)
        # A '0' visual score means strict parsing found nothing usable, so
        # run one repair pass through the model before giving up.
        if parsed_data['veracity_vectors']['visual_integrity_score'] == '0':
            yield "Auto-Repairing output..."
            raw_text = await attempt_toon_repair(raw_text, toon_schema, None, 'gemini', gemini_config)
            parsed_data = parse_veracity_toon(raw_text)

        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}

        # Cleanup
        try:
            logger.info(f"[Gemini] Deleting remote file {uploaded_file.name}")
            await loop.run_in_executor(None, lambda: genai_legacy.delete_file(name=uploaded_file.name))
        except Exception as cleanup_err:
            logger.warning(f"Failed to cleanup file: {cleanup_err}")

    except Exception as e:
        logger.error(f"Gemini Pipeline Error: {e}", exc_info=True)
        yield f"ERROR (Gemini): {e}"
|
| 208 |
+
|
| 209 |
+
async def run_vertex_labeling_pipeline(video_path: str, caption: str, transcript: str, vertex_config: dict, include_comments: bool, reasoning_method: str = "cot"):
    """Label a video for factuality via Vertex AI (modern google-genai SDK).

    Async generator: yields progress strings, then one final dict with keys
    'raw_toon', 'parsed_data', 'prompt_used' (or {'error': ...}).

    Differences from the legacy Gemini path: the video is sent inline as
    bytes (no Files API upload/poll), and Google Search grounding is enabled
    as a tool on every request.

    Args:
        video_path: Local path to an mp4 file, read fully into memory.
        caption: User-supplied caption inserted into the prompt.
        transcript: Audio transcript inserted into the prompt.
        vertex_config: Requires 'project_id'; optional 'location' (default
            us-central1) and 'model_name'.
        include_comments: True selects the per-score-reasoning schema.
        reasoning_method: 'fcot' runs the 3-stage Fractal Chain of Thought.
    """
    if genai is None:
        yield "ERROR: 'google-genai' not installed.\n"
        return

    project_id = vertex_config.get("project_id")
    if not project_id:
        yield "ERROR: No Vertex Project ID."
        return

    logger.info(f"[Vertex] Initializing for project {project_id}")

    # Vertex requires typed SafetySetting objects; only high-severity blocked.
    safety_settings = [
        SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="BLOCK_ONLY_HIGH"),
        SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH"),
        SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="BLOCK_ONLY_HIGH"),
        SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="BLOCK_ONLY_HIGH"),
    ]

    try:
        client = genai.Client(vertexai=True, project=project_id, location=vertex_config.get("location", "us-central1"))

        logger.info(f"[Vertex] Reading local video file: {video_path}")
        with open(video_path, 'rb') as f: video_bytes = f.read()
        video_part = Part.from_bytes(data=video_bytes, mime_type="video/mp4")

        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE
        model_name = vertex_config.get("model_name", "gemini-2.5-flash-lite")

        raw_text = ""
        prompt_used = ""
        loop = asyncio.get_event_loop()
        # Shared config: low temperature for determinism, plain text out,
        # Google Search tool for grounding claims.
        config = GenerateContentConfig(
            temperature=0.1,
            response_mime_type="text/plain",
            tools=[Tool(google_search=GoogleSearch())],
            safety_settings=safety_settings
        )

        logger.info(f"[Vertex] Starting inference with {model_name}")

        if reasoning_method == "fcot":
            # Fractal CoT across one chat session: macro -> meso -> synthesis.
            yield "Starting FCoT (Vertex)..."
            chat = client.chats.create(model=model_name, config=config)

            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
            logger.info("[Vertex] Sending Macro Prompt...")
            res1 = await loop.run_in_executor(None, lambda: chat.send_message([video_part, macro_prompt]))
            macro_hypothesis = res1.text
            yield f"Hypothesis: {macro_hypothesis[:80]}...\n"

            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
            logger.info("[Vertex] Sending Meso Prompt...")
            res2 = await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt))

            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
            logger.info("[Vertex] Sending Synthesis Prompt...")
            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt))

            raw_text = res3.text
            prompt_used = f"FCoT (Vertex):\n{macro_prompt}..."

        else:
            prompt_text = LABELING_PROMPT_TEMPLATE.format(caption=caption, transcript=transcript, toon_schema=toon_schema, score_instructions=score_instructions)
            prompt_used = prompt_text
            yield f"Generating Labels ({model_name})..."
            logger.info("[Vertex] Sending standard generation request...")
            response = await loop.run_in_executor(
                None,
                lambda: client.models.generate_content(model=model_name, contents=[video_part, prompt_text], config=config)
            )
            raw_text = response.text

        logger.info(f"[Vertex] Response Length: {len(raw_text)}")
        if not raw_text:
            yield "Model returned empty response."
            yield {"error": "Empty Response"}
            return

        parsed_data = parse_veracity_toon(raw_text)
        # A '0' visual score means strict parsing found nothing -> repair pass.
        if parsed_data['veracity_vectors']['visual_integrity_score'] == '0':
            yield "Auto-Repairing output..."
            raw_text = await attempt_toon_repair(raw_text, toon_schema, client, 'vertex', vertex_config)
            parsed_data = parse_veracity_toon(raw_text)

        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}

    except Exception as e:
        yield f"ERROR (Vertex): {e}"
        logger.error("Vertex Labeling Error", exc_info=True)
|
| 300 |
+
|
| 301 |
+
async def run_gemini_pipeline(video_path, question, checks, gemini_config, generation_config=None):
    """Legacy Q&A pipeline stub kept for interface compatibility; use
    run_gemini_labeling_pipeline instead."""
    yield "Legacy pipeline not fully supported in HF Space."
|
| 303 |
+
|
| 304 |
+
async def run_vertex_pipeline(video_path, question, checks, vertex_config, generation_config=None):
    """Legacy Q&A pipeline stub kept for interface compatibility; use
    run_vertex_labeling_pipeline instead."""
    yield "Legacy pipeline not fully supported in HF Space."
|
src/labeling_logic.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# labeling_logic.py
# Prompt templates and TOON schemas for the video factuality labeling pipelines.

# Single-shot (CoT) labeling prompt. Placeholders filled via .format():
# {caption}, {transcript}, {toon_schema}, {score_instructions}.
LABELING_PROMPT_TEMPLATE = """
You are an AI Factuality Assessment Agent operating under the "Ali Arsanjani Factuality Factors" framework.
Your goal is to mass-label video content, quantifying "Veracity Vectors" and "Modality Alignment".

**INPUT DATA:**
- **User Caption:** "{caption}"
- **Audio Transcript:** "{transcript}"
- **Visuals:** (Provided in video context)

**INSTRUCTIONS:**
1. **Grounding:** Cross-reference claims in the transcript with your internal knowledge base (and tools if active).
2. **Chain of Thought (<thinking>):** You MUST think step-by-step inside a `<thinking>` block before generating output.
    * Analyze *Visual Integrity* (Artifacts, edits).
    * Analyze *Audio Integrity* (Voice cloning, sync).
    * Analyze *Modality Alignment* (Does video match audio? Does caption match content? Does audio match caption?).
    * Analyze *Logic* (Fallacies, gaps).
    * Determine *Disinformation* classification.
3. **Output Format:** Output strictly in **TOON** format (Token-Oriented Object Notation) as defined below.

**CRITICAL CONSTRAINTS:**
- Do NOT repeat the input data.
- START your response IMMEDIATELY with the `<thinking>` tag.
- **DO NOT use Markdown code blocks.** (Output plain text only).
- Use strict `Key : Type [ Count ] {{ Headers }} :` format followed by data lines.
- Strings containing commas MUST be quoted.
- ALL scores must be filled (use 0 if unsure, do not leave blank).
- **MODALITY SCORING:** You must provide 3 distinct alignment scores: Video-Audio, Video-Caption, and Audio-Caption.

**TOON SCHEMA:**
{toon_schema}

{score_instructions}

**RESPONSE:**
<thinking>
"""
|
| 39 |
+
|
| 40 |
+
SCORE_INSTRUCTIONS_REASONING = """
|
| 41 |
+
**Constraints:**
|
| 42 |
+
1. Provide specific reasoning for EACH score in the `vectors` and `modalities` tables.
|
| 43 |
+
2. Ensure strings are properly quoted.
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
SCORE_INSTRUCTIONS_SIMPLE = """
|
| 47 |
+
**Constraint:** Focus on objective measurements. Keep text concise.
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
SCHEMA_SIMPLE = """summary: text[1]{text}:
|
| 51 |
+
"Brief neutral summary of the video events"
|
| 52 |
+
|
| 53 |
+
vectors: scores[1]{visual,audio,source,logic,emotion}:
|
| 54 |
+
(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10)
|
| 55 |
+
*Scale: 1=Fake/Malicious, 10=Authentic/Neutral*
|
| 56 |
+
|
| 57 |
+
modalities: scores[1]{video_audio_score,video_caption_score,audio_caption_score}:
|
| 58 |
+
(Int 1-10),(Int 1-10),(Int 1-10)
|
| 59 |
+
*Scale: 1=Mismatch, 10=Perfect Match*
|
| 60 |
+
|
| 61 |
+
factuality: factors[1]{accuracy,gap,grounding}:
|
| 62 |
+
(Verified/Misleading/False),"Missing evidence description","Grounding check results"
|
| 63 |
+
|
| 64 |
+
disinfo: analysis[1]{class,intent,threat}:
|
| 65 |
+
(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)
|
| 66 |
+
|
| 67 |
+
final: assessment[1]{score,reasoning}:
|
| 68 |
+
(Int 1-100),"Final synthesis of why this score was given"
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
SCHEMA_REASONING = """
|
| 72 |
+
summary: text[1]{text}:
|
| 73 |
+
"Brief neutral summary of the video events"
|
| 74 |
+
|
| 75 |
+
vectors: details[5]{category,score,reasoning}:
|
| 76 |
+
Visual,(Int 1-10),"Reasoning for visual score"
|
| 77 |
+
Audio,(Int 1-10),"Reasoning for audio score"
|
| 78 |
+
Source,(Int 1-10),"Reasoning for source credibility"
|
| 79 |
+
Logic,(Int 1-10),"Reasoning for logical consistency"
|
| 80 |
+
Emotion,(Int 1-10),"Reasoning for emotional manipulation"
|
| 81 |
+
|
| 82 |
+
modalities: details[3]{category,score,reasoning}:
|
| 83 |
+
VideoAudio,(Int 1-10),"Reasoning for video-to-audio alignment"
|
| 84 |
+
VideoCaption,(Int 1-10),"Reasoning for video-to-caption alignment"
|
| 85 |
+
AudioCaption,(Int 1-10),"Reasoning for audio-to-caption alignment"
|
| 86 |
+
|
| 87 |
+
factuality: factors[1]{accuracy,gap,grounding}:
|
| 88 |
+
(Verified/Misleading/False),"Missing evidence description","Grounding check results"
|
| 89 |
+
|
| 90 |
+
disinfo: analysis[1]{class,intent,threat}:
|
| 91 |
+
(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)
|
| 92 |
+
|
| 93 |
+
final: assessment[1]{score,reasoning}:
|
| 94 |
+
(Int 1-100),"Final synthesis of why this score was given"
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
FCOT_MACRO_PROMPT = """
|
| 98 |
+
**Fractal Chain of Thought - Stage 1: Macro-Scale Hypothesis (Wide Aperture)**
|
| 99 |
+
|
| 100 |
+
You are analyzing a video for factuality.
|
| 101 |
+
**Context:** Caption: "{caption}" | Transcript: "{transcript}"
|
| 102 |
+
|
| 103 |
+
1. **Global Scan**: Observe the video, audio, and caption as a whole entity.
|
| 104 |
+
2. **Context Aperture**: Wide. Assess the overall intent (Humor, Information, Political, Social) and the setting.
|
| 105 |
+
3. **Macro Hypothesis**: Formulate a high-level hypothesis about the veracity. (e.g., "The video is likely authentic but the caption misrepresents the location" or "The audio quality suggests synthetic generation").
|
| 106 |
+
|
| 107 |
+
**Objective**: Maximize **Coverage** (broadly explore potential angles of manipulation).
|
| 108 |
+
|
| 109 |
+
**Output**: A concise paragraph summarizing the "Macro Hypothesis".
|
| 110 |
+
"""
|
| 111 |
+
|
| 112 |
+
FCOT_MESO_PROMPT = """
|
| 113 |
+
**Fractal Chain of Thought - Stage 2: Meso-Scale Expansion (Recursive Verification)**
|
| 114 |
+
|
| 115 |
+
**Current Macro Hypothesis**: "{macro_hypothesis}"
|
| 116 |
+
|
| 117 |
+
**Action**: Zoom In. Decompose the hypothesis into specific verification branches.
|
| 118 |
+
Perform the following checks recursively:
|
| 119 |
+
|
| 120 |
+
1. **Visual Branch**: Look for specific artifacts, lighting inconsistencies, cuts, or deepfake signs.
|
| 121 |
+
2. **Audio Branch**: Analyze lip-sync, background noise consistency, and voice tonality.
|
| 122 |
+
3. **Logical Branch**: Does the visual evidence strictly support the caption's claim? Are there logical fallacies?
|
| 123 |
+
|
| 124 |
+
**Dual-Objective Self-Correction**:
|
| 125 |
+
- **Faithfulness**: Do not hallucinate details not present in the video.
|
| 126 |
+
- **Coverage**: Did you miss any subtle cues?
|
| 127 |
+
|
| 128 |
+
**Output**: Detailed "Micro-Observations" for each branch. If you find contradictions to the Macro Hypothesis, note them explicitly as **"Self-Correction"**.
|
| 129 |
+
"""
|
| 130 |
+
|
| 131 |
+
FCOT_SYNTHESIS_PROMPT = """
|
| 132 |
+
**Fractal Chain of Thought - Stage 3: Inter-Scale Consensus & Synthesis**
|
| 133 |
+
|
| 134 |
+
**Action**: Integrate your Macro Hypothesis and Micro-Observations.
|
| 135 |
+
- **Consensus Check**: If Micro-Observations contradict the Macro Hypothesis, prioritize the Micro evidence (Self-Correction).
|
| 136 |
+
- **Compression**: Synthesize the findings into the final structured format.
|
| 137 |
+
|
| 138 |
+
**Output Format**:
|
| 139 |
+
Strictly fill out the following TOON schema based on the consensus. Do not include markdown code blocks.
|
| 140 |
+
|
| 141 |
+
**TOON SCHEMA**:
|
| 142 |
+
{toon_schema}
|
| 143 |
+
|
| 144 |
+
{score_instructions}
|
| 145 |
+
"""
|
src/my_vision_process.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# my_vision_process.py (Stub for HF Spaces / Lite Mode)
import logging

logger = logging.getLogger(__name__)

# Dummy client placeholder so callers importing `client` still succeed.
client = None


def process_vision_info(messages, return_video_kwargs=False, client=None):
    """No-op stand-in for the real vision preprocessor.

    Exists only so imports resolve in API-only (Lite Mode) deployments;
    reaching this code at runtime means local inference was requested
    where it is unavailable. Always returns empty placeholders.
    """
    logger.warning("process_vision_info called in LITE/API environment. Returning empty placeholders.")
    placeholders = (None, None)
    if not return_video_kwargs:
        return placeholders
    return placeholders + ({"fps": [0]},)
|
src/toon_parser.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# toon_parser.py
|
| 2 |
+
import re
|
| 3 |
+
import logging
|
| 4 |
+
import csv
|
| 5 |
+
from io import StringIO
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
def parse_toon_line(line_def, data_line):
    """Parse one comma-separated TOON data row into a dict keyed by headers.

    Each value is stripped of parentheses, and fraction-style scores such as
    "8/10" are reduced to their integer numerator. Rows shorter than the
    header list are padded with empty strings; longer rows are truncated.
    Returns {} for blank input or on any parsing error.
    """
    if not data_line or data_line.isspace():
        return {}

    try:
        try:
            row = next(csv.reader(StringIO(data_line), skipinitialspace=True))
        except StopIteration:
            row = []

        normalized = []
        for raw in row:
            token = raw.strip().replace('(', '').replace(')', '')
            # Collapse "8/10"-style fractions to the leading integer.
            if '/' in token and any(ch.isdigit() for ch in token):
                numerator = token.split('/')[0].strip()
                if numerator.isdigit():
                    token = numerator
            normalized.append(token)

        headers = line_def.get('headers', [])
        # Pad with empty strings, then truncate, so the row aligns 1:1
        # with the header count.
        aligned = (normalized + [""] * len(headers))[:len(headers)]
        return dict(zip(headers, aligned))
    except Exception as e:
        logger.error(f"Error parsing TOON line '{data_line}': {e}")
        return {}
|
| 41 |
+
|
| 42 |
+
def fuzzy_extract_scores(text: str) -> dict:
    """Best-effort regex extraction of 1-10 scores from free-form model text.

    Fallback used when strict TOON parsing yields no scores. For each target
    key, the first label pattern that matches wins; keys never matched stay
    at the default '0'. Returns a dict of string scores.
    """
    score_keys = ['visual', 'audio', 'source', 'logic', 'emotion',
                  'video_audio', 'video_caption', 'audio_caption']
    scores = {name: '0' for name in score_keys}

    # (label-pattern, target-key); earlier entries take precedence per key.
    mappings = [
        ('visual', 'visual'),
        ('visual.*?integrity', 'visual'),
        ('accuracy', 'visual'),
        ('audio', 'audio'),
        ('source', 'source'),
        ('logic', 'logic'),
        ('emotion', 'emotion'),
        (r'video.*?audio', 'video_audio'),
        (r'video.*?caption', 'video_caption'),
        (r'audio.*?caption', 'audio_caption')
    ]

    for label, key in mappings:
        if scores[key] != '0':
            continue  # already filled by an earlier, higher-priority pattern
        # Label, lazily skip filler, then a separator run, then a 1-10 score
        # (optionally written as "N/10").
        found = re.search(fr'(?i){label}.*?[:=\-\s\(]+(\b10\b|\b\d\b)(?:/10)?', text)
        if found:
            scores[key] = found.group(1)

    return scores
|
| 69 |
+
|
| 70 |
+
def parse_veracity_toon(text: str) -> dict:
    """Parse a model's TOON-formatted response into a normalized flat dict.

    Strategy: regex-match section headers of the form `key: type[N]{h1,h2}:`,
    parse the data rows beneath each via parse_toon_line, then map the known
    sections (vectors, modalities, factuality, disinfo, final, summary) into
    a fixed result structure. When strict parsing yields no non-zero scores,
    falls back to fuzzy_extract_scores over the whole text. Returns {} for
    empty input; all score values are strings defaulting to '0'.
    """
    if not text:
        return {}

    # Strip markdown code fences the model may emit despite instructions.
    text = re.sub(r'```\w*', '', text)
    text = re.sub(r'```', '', text)
    text = text.strip()

    parsed_sections = {}

    # Matches TOON block headers: name : [type] [count] { headers } :
    # group(1)=section key, group(2)=optional row count, group(3)=header list.
    block_pattern = re.compile(
        r'([a-zA-Z0-9_]+)\s*:\s*(?:\w+\s*)?(?:\[\s*(\d+)\s*\])?\s*\{\s*(.*?)\s*\}\s*:\s*',
        re.MULTILINE
    )

    matches = list(block_pattern.finditer(text))

    for i, match in enumerate(matches):
        key = match.group(1).lower()
        count = int(match.group(2)) if match.group(2) else 1
        headers_str = match.group(3)
        headers = [h.strip().lower() for h in headers_str.split(',')]

        # A section's data runs until the next header (or end of text).
        start_idx = match.end()
        end_idx = matches[i+1].start() if i + 1 < len(matches) else len(text)
        block_content = text[start_idx:end_idx].strip()

        lines = [line.strip() for line in block_content.splitlines() if line.strip()]

        data_items = []
        # Skip 1-char junk lines (stray punctuation) before row parsing.
        valid_lines = [l for l in lines if len(l) > 1]

        for line in valid_lines[:count]:
            item = parse_toon_line({'key': key, 'headers': headers}, line)
            data_items.append(item)

        # Single-row sections collapse to a dict; multi-row stay a list.
        if count == 1 and data_items:
            parsed_sections[key] = data_items[0]
        else:
            parsed_sections[key] = data_items

    # Normalized output skeleton; every score defaults to '0'.
    flat_result = {
        'veracity_vectors': {
            'visual_integrity_score': '0',
            'audio_integrity_score': '0',
            'source_credibility_score': '0',
            'logical_consistency_score': '0',
            'emotional_manipulation_score': '0'
        },
        'modalities': {
            'video_audio_score': '0',
            'video_caption_score': '0',
            'audio_caption_score': '0'
        },
        'video_context_summary': '',
        'factuality_factors': {},
        'disinformation_analysis': {},
        'final_assessment': {}
    }

    got_vectors = False
    got_modalities = False

    # 'vectors' is a dict under SCHEMA_SIMPLE (one row) or a list of
    # {category,score,reasoning} rows under SCHEMA_REASONING.
    vectors_data = parsed_sections.get('vectors', [])
    if isinstance(vectors_data, dict):
        v = vectors_data
        if any(val and val != '0' for val in v.values()):
            if 'visual' in v: flat_result['veracity_vectors']['visual_integrity_score'] = v['visual']
            if 'audio' in v: flat_result['veracity_vectors']['audio_integrity_score'] = v['audio']
            if 'source' in v: flat_result['veracity_vectors']['source_credibility_score'] = v['source']
            if 'logic' in v: flat_result['veracity_vectors']['logical_consistency_score'] = v['logic']
            if 'emotion' in v: flat_result['veracity_vectors']['emotional_manipulation_score'] = v['emotion']
            got_vectors = True

    elif isinstance(vectors_data, list):
        for item in vectors_data:
            cat = item.get('category', '').lower()
            score = item.get('score', '0')
            if score and score != '0':
                got_vectors = True
                if 'visual' in cat: flat_result['veracity_vectors']['visual_integrity_score'] = score
                elif 'audio' in cat: flat_result['veracity_vectors']['audio_integrity_score'] = score
                elif 'source' in cat: flat_result['veracity_vectors']['source_credibility_score'] = score
                elif 'logic' in cat: flat_result['veracity_vectors']['logical_consistency_score'] = score
                elif 'emotion' in cat: flat_result['veracity_vectors']['emotional_manipulation_score'] = score

    # 'modalities' likewise dict or list; keys/categories are normalized
    # (spaces, hyphens, underscores removed) before substring matching.
    modalities_data = parsed_sections.get('modalities', [])
    if isinstance(modalities_data, dict):
        m = modalities_data
        for k, v in m.items():
            k_clean = k.lower().replace(' ', '').replace('-', '').replace('_', '')
            if 'videoaudio' in k_clean: flat_result['modalities']['video_audio_score'] = v
            elif 'videocaption' in k_clean: flat_result['modalities']['video_caption_score'] = v
            elif 'audiocaption' in k_clean: flat_result['modalities']['audio_caption_score'] = v
            if v and v != '0': got_modalities = True

    elif isinstance(modalities_data, list):
        for item in modalities_data:
            cat = item.get('category', '').lower().replace(' ', '').replace('-', '').replace('_', '')
            score = item.get('score', '0')
            if score and score != '0':
                got_modalities = True
                if 'videoaudio' in cat: flat_result['modalities']['video_audio_score'] = score
                elif 'videocaption' in cat: flat_result['modalities']['video_caption_score'] = score
                elif 'audiocaption' in cat: flat_result['modalities']['audio_caption_score'] = score

    # Fallback: scrape scores from the raw text when strict parsing found none.
    if not got_vectors or not got_modalities:
        fuzzy_scores = fuzzy_extract_scores(text)

        if not got_vectors:
            flat_result['veracity_vectors']['visual_integrity_score'] = fuzzy_scores['visual']
            flat_result['veracity_vectors']['audio_integrity_score'] = fuzzy_scores['audio']
            flat_result['veracity_vectors']['source_credibility_score'] = fuzzy_scores['source']
            flat_result['veracity_vectors']['logical_consistency_score'] = fuzzy_scores['logic']
            flat_result['veracity_vectors']['emotional_manipulation_score'] = fuzzy_scores['emotion']

        if not got_modalities:
            flat_result['modalities']['video_audio_score'] = fuzzy_scores['video_audio']
            flat_result['modalities']['video_caption_score'] = fuzzy_scores['video_caption']
            flat_result['modalities']['audio_caption_score'] = fuzzy_scores['audio_caption']

    # Remaining sections may parse as list-of-one; unwrap before mapping.
    f = parsed_sections.get('factuality', {})
    if isinstance(f, list): f = f[0] if f else {}
    flat_result['factuality_factors'] = {
        'claim_accuracy': f.get('accuracy', 'Unverifiable'),
        'evidence_gap': f.get('gap', ''),
        'grounding_check': f.get('grounding', '')
    }

    d = parsed_sections.get('disinfo', {})
    if isinstance(d, list): d = d[0] if d else {}
    flat_result['disinformation_analysis'] = {
        'classification': d.get('class', 'None'),
        'intent': d.get('intent', 'None'),
        'threat_vector': d.get('threat', 'None')
    }

    fn = parsed_sections.get('final', {})
    if isinstance(fn, list): fn = fn[0] if fn else {}
    flat_result['final_assessment'] = {
        'veracity_score_total': fn.get('score', '0'),
        'reasoning': fn.get('reasoning', '')
    }

    s = parsed_sections.get('summary', {})
    if isinstance(s, list): s = s[0] if s else {}
    flat_result['video_context_summary'] = s.get('text', '')

    # Expose the raw section map for debugging/inspection by callers.
    flat_result['raw_parsed_structure'] = parsed_sections

    return flat_result
|
src/transcription.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import os
from pathlib import Path
from typing import Optional
|
| 4 |
+
|
| 5 |
+
LITE_MODE = os.getenv("LITE_MODE", "true").lower() == "true"
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
import whisper
|
| 10 |
+
WHISPER_AVAILABLE = True
|
| 11 |
+
except ImportError:
|
| 12 |
+
WHISPER_AVAILABLE = False
|
| 13 |
+
|
| 14 |
+
transcription_model = None
|
| 15 |
+
|
| 16 |
+
def load_model():
|
| 17 |
+
if LITE_MODE or not WHISPER_AVAILABLE:
|
| 18 |
+
logger.info("LITE_MODE is enabled or Whisper is uninstalled. Skipping Whisper model loading.")
|
| 19 |
+
return
|
| 20 |
+
|
| 21 |
+
global transcription_model
|
| 22 |
+
if transcription_model is None:
|
| 23 |
+
try:
|
| 24 |
+
logger.info("Loading 'base.en' Whisper model for transcription...")
|
| 25 |
+
transcription_model = whisper.load_model("base.en")
|
| 26 |
+
logger.info("Whisper model loaded successfully.")
|
| 27 |
+
except Exception as e:
|
| 28 |
+
logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
|
| 29 |
+
transcription_model = None
|
| 30 |
+
|
| 31 |
+
def generate_transcript(audio_path_str: str) -> str:
|
| 32 |
+
if transcription_model is None:
|
| 33 |
+
logger.warning("Transcription model is not available (API-Lite Mode). Bypassing local transcription.")
|
| 34 |
+
return None
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
audio_path = Path(audio_path_str)
|
| 38 |
+
logger.info(f"Starting transcription for: {audio_path.name}")
|
| 39 |
+
|
| 40 |
+
result = transcription_model.transcribe(audio_path_str, verbose=False)
|
| 41 |
+
|
| 42 |
+
vtt_path = audio_path.with_suffix('.vtt')
|
| 43 |
+
|
| 44 |
+
from whisper.utils import get_writer
|
| 45 |
+
writer = get_writer("vtt", str(vtt_path.parent))
|
| 46 |
+
writer(result, str(audio_path.name))
|
| 47 |
+
|
| 48 |
+
logger.info(f"Transcription complete. VTT file saved to: {vtt_path}")
|
| 49 |
+
return str(vtt_path)
|
| 50 |
+
|
| 51 |
+
except Exception as e:
|
| 52 |
+
logger.error(f"An error occurred during transcription for {audio_path_str}: {e}", exc_info=True)
|
| 53 |
+
return None
|
start.sh
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# 1. Start Python FastAPI in the background (Internal Port 8001)
|
| 4 |
+
echo "Starting Python Inference Engine..."
|
| 5 |
+
export PYTHONPATH=$PYTHONPATH:/app/src
|
| 6 |
+
# Use --log-level info to see startup issues
|
| 7 |
+
python -m uvicorn src.app:app --host 127.0.0.1 --port 8001 --log-level info &
|
| 8 |
+
|
| 9 |
+
# Wait longer for Python to initialize, or until port is open
|
| 10 |
+
echo "Waiting for Python backend to initialize..."
|
| 11 |
+
timeout=30
|
| 12 |
+
while ! curl -s http://127.0.0.1:8001/ > /dev/null; do
|
| 13 |
+
sleep 2
|
| 14 |
+
timeout=$((timeout-2))
|
| 15 |
+
if [ $timeout -le 0 ]; then
|
| 16 |
+
echo "Python backend failed to start on time. Logs might show why."
|
| 17 |
+
break
|
| 18 |
+
fi
|
| 19 |
+
done
|
| 20 |
+
|
| 21 |
+
# 2. Start Golang Web Server (Public Port 7860)
|
| 22 |
+
echo "Starting Go Web Server..."
|
| 23 |
+
/app/vchat-server
|