liarMP4

Sleeping

App Files Files Community

GlazedDon0t committed 17 days ago

Commit

7b1d1ea

1 Parent(s): bb1cad5

wow

Browse files

Files changed (14) hide show

.gitattributes +0 -35
Dockerfile +0 -89
README.md +0 -10
frontend/src/App.tsx +2 -2
main.go +0 -60
requirements.txt +0 -42
src/app.py +0 -808
src/factuality_logic.py +0 -143
src/inference_logic.py +0 -303
src/labeling_logic.py +0 -145
src/my_vision_process.py +0 -17
src/toon_parser.py +0 -220
src/transcription.py +0 -48
start.sh +0 -23

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile DELETED Viewed

@@ -1,89 +0,0 @@
-# ==========================================
-# Stage 1: Build Frontend (React/TS/Vite)
-# ==========================================
-FROM node:20-slim AS frontend-builder
-WORKDIR /app/frontend
-# Copy frontend definitions
-COPY frontend/package.json frontend/package-lock.json* ./
-RUN npm install
-# Copy source and build
-COPY frontend/ ./
-RUN npm run build
-# ==========================================
-# Stage 2: Build Backend (Golang)
-# ==========================================
-FROM golang:1.23 AS backend-builder
-WORKDIR /app/backend
-# Copy Go source
-COPY main.go .
-# Build static binary
-RUN go mod init vchat-server && \
-    go mod tidy && \
-    CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o vchat-server main.go
-# ==========================================
-# Stage 3: Final Runtime (Hugging Face Space)
-# ==========================================
-FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-runtime
-# Default to LITE_MODE=true for HF Spaces (API Only)
-ENV PYTHONUNBUFFERED=1 \
-    DEBIAN_FRONTEND=noninteractive \
-    LITE_MODE=true \
-    PATH="/home/user/.local/bin:$PATH" \
-    PIP_NO_CACHE_DIR=1
-# Create a non-root user (Required for HF Spaces)
-RUN useradd -m -u 1000 user
-WORKDIR /app
-# 1. Fix FFmpeg Conflict (Critical Step)
-RUN conda uninstall -y ffmpeg || true
-# 2. Install System Dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ffmpeg \
-    git \
-    curl \
-    gnupg \
-    ca-certificates \
-    && rm -rf /var/lib/apt/lists/*
-# 3. Install Python Dependencies
-RUN pip install uv
-COPY requirements.txt ./
-RUN uv pip install --system -r requirements.txt
-# Explicitly force latest yt-dlp to handle Twitter/X API changes
-RUN uv pip install --system --upgrade "yt-dlp[default]"
-# 4. Copy Python Application Code
-COPY --chown=user src/ ./src/
-# 5. Install Built Artifacts
-COPY --from=backend-builder --chown=user /app/backend/vchat-server /app/vchat-server
-RUN mkdir -p /app/static
-COPY --from=frontend-builder --chown=user /app/frontend/dist /app/static
-# 6. Setup Directories and Permissions
-RUN mkdir -p /app/data /app/data/videos /app/data/labels /app/data/prompts /app/data/responses /app/metadata \
-    && chown -R user:user /app/data /app/metadata
-# 7. Setup Entrypoint
-COPY --chown=user start.sh /app/start.sh
-RUN sed -i 's/\r$//' /app/start.sh && \
-    chmod +x /app/start.sh
-# Switch to non-root user
-USER user
-# Expose the HF Space port
-EXPOSE 7860
-# Run the Orchestrator
-CMD ["/app/start.sh"]

README.md DELETED Viewed

@@ -1,10 +0,0 @@
----
-title: VFacts
-emoji: 😻
-colorFrom: gray
-colorTo: gray
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

frontend/src/App.tsx CHANGED Viewed

@@ -143,7 +143,7 @@ function App() {
             <div className="w-8 h-8 rounded-lg bg-indigo-600 flex items-center justify-center">
               <Bot className="w-5 h-5 text-white" />
             </div>
-            <h1 className="text-sm font-bold text-white">vChat <span className="text-slate-500">Manager</span></h1>
           </div>
         </div>
@@ -322,7 +322,7 @@ function App() {
                         </h2>
                         <p className="text-slate-400 text-sm">
                             Switch to the <strong>Queue</strong> tab, upload your CSV file, and click <strong>Start Batch</strong>.
-                            The system will download videos, transcribe audio, and run the selected AI model to generate factuality labels.
                         </p>
                     </section>
                 </div>

             <div className="w-8 h-8 rounded-lg bg-indigo-600 flex items-center justify-center">
               <Bot className="w-5 h-5 text-white" />
             </div>
+            <h1 className="text-sm font-bold text-white">vChat <span className="text-slate-500">API Lite</span></h1>
           </div>
         </div>
                         </h2>
                         <p className="text-slate-400 text-sm">
                             Switch to the <strong>Queue</strong> tab, upload your CSV file, and click <strong>Start Batch</strong>.
+                            The system will download videos and run the selected AI model to generate factuality labels natively.
                         </p>
                     </section>
                 </div>

main.go DELETED Viewed

@@ -1,60 +0,0 @@
-package main
-import (
-	"log"
-	"net/http"
-	"net/http/httputil"
-	"net/url"
-	"os"
-	"strings"
-)
-func main() {
-	// Target Python FastAPI server (running locally in the container)
-	pythonTarget := "http://127.0.0.1:8001"
-	pythonURL, err := url.Parse(pythonTarget)
-	if err != nil {
-		log.Fatalf("Invalid Python target URL: %v", err)
-	}
-	// Create Reverse Proxy
-	proxy := httputil.NewSingleHostReverseProxy(pythonURL)
-	// HF Spaces: Files are copied to /app/static in Dockerfile
-	staticPath := "/app/static"
-	fs := http.FileServer(http.Dir(staticPath))
-	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
-		// Proxy API requests to Python
-		if strings.HasPrefix(r.URL.Path, "/process") ||
-			strings.HasPrefix(r.URL.Path, "/label_video") ||
-			strings.HasPrefix(r.URL.Path, "/batch_label") ||
-			strings.HasPrefix(r.URL.Path, "/model-architecture") ||
-			strings.HasPrefix(r.URL.Path, "/download-dataset") ||
-			strings.HasPrefix(r.URL.Path, "/extension") ||
-			strings.HasPrefix(r.URL.Path, "/manage") ||
-			strings.HasPrefix(r.URL.Path, "/queue") {
-			log.Printf("Proxying %s to Python Backend...", r.URL.Path)
-			proxy.ServeHTTP(w, r)
-			return
-		}
-		// Check if file exists in static dir, otherwise serve index.html (SPA Routing)
-		path := staticPath + r.URL.Path
-		if _, err := os.Stat(path); os.IsNotExist(err) {
-			http.ServeFile(w, r, staticPath+"/index.html")
-			return
-		}
-		fs.ServeHTTP(w, r)
-	})
-	// HF Spaces requires listening on port 7860
-	port := "7860"
-	log.Printf("vChat HF Server listening on port %s", port)
-	log.Printf("Serving static files from %s", staticPath)
-	if err := http.ListenAndServe(":"+port, nil); err != nil {
-		log.Fatal(err)
-	}
-}

requirements.txt DELETED Viewed

@@ -1,42 +0,0 @@
-torch
-torchvision
-torchaudio
-# --- Core Server ---
-fastapi
-uvicorn[standard]
-python-multipart
-requests
-aiofiles
-jinja2
-python-dotenv
-# --- AI & Vision Processing ---
-transformers
-accelerate
-Pillow
-packaging
-av
-# Use headless to avoid installing X11/GL libraries in Docker
-opencv-python-headless
-decord
-imageio
-numpy
-einops
-# --- Google Cloud & APIs ---
-google-generativeai>=0.4.0
-google-cloud-aiplatform
-google-genai
-mlcroissant
-# --- Fine-Tuning (LoRA/QLoRA) ---
-peft
-bitsandbytes
-trl
-datasets
-# --- Audio ---
-openai-whisper
-# FORCE LATEST YT-DLP (Often required for X/Twitter)
-yt-dlp>=2024.11.18
-ffmpeg-python

src/app.py DELETED Viewed

@@ -1,808 +0,0 @@
-import os
-import sys
-import asyncio
-import subprocess
-from pathlib import Path
-import logging
-import csv
-import io
-import datetime
-import json
-import hashlib
-import re
-import glob
-import shutil
-import time
-from fastapi import FastAPI, Request, Form, UploadFile, File, Body, HTTPException
-from fastapi.responses import HTMLResponse, StreamingResponse, PlainTextResponse, Response, FileResponse, JSONResponse
-from fastapi.templating import Jinja2Templates
-from fastapi.staticfiles import StaticFiles
-from fastapi.middleware.cors import CORSMiddleware
-import yt_dlp
-import inference_logic
-import factuality_logic
-import transcription
-from factuality_logic import parse_vtt
-from toon_parser import parse_veracity_toon
-try:
-    import mlcroissant as mlc
-    CROISSANT_AVAILABLE = True
-except ImportError:
-    try:
-        import croissant as mlc
-        CROISSANT_AVAILABLE = True
-    except ImportError:
-        mlc = None
-        CROISSANT_AVAILABLE = False
-# Configure Logging with High Verbosity
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-    handlers=[logging.StreamHandler(sys.stdout)]
-)
-logger = logging.getLogger("vChat")
-LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"
-app = FastAPI()
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# HF Spaces specific path
-STATIC_DIR = "/app/static"
-if not os.path.isdir(STATIC_DIR):
-    # Fallback if running locally
-    STATIC_DIR = "static"
-    os.makedirs(STATIC_DIR, exist_ok=True)
-app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
-templates = Jinja2Templates(directory=STATIC_DIR)
-# Ensure data directories exist (HF Spaces writable locations)
-os.makedirs("data/videos", exist_ok=True)
-os.makedirs("data", exist_ok=True)
-os.makedirs("data/labels", exist_ok=True)
-os.makedirs("data/prompts", exist_ok=True)
-os.makedirs("data/responses", exist_ok=True)
-os.makedirs("metadata", exist_ok=True)
-STOP_QUEUE_SIGNAL = False
-@app.on_event("startup")
-async def startup_event():
-    logger.info("Application starting up...")
-    try:
-        transcription.load_model()
-    except Exception as e:
-        logger.warning(f"Could not load Whisper model: {e}")
-    if not LITE_MODE:
-        try:
-            inference_logic.load_models()
-        except Exception as e:
-            logger.fatal(f"Could not load local inference models. Error: {e}", exc_info=True)
-    else:
-        logger.info("Running in LITE mode (API Only).")
-@app.get("/", response_class=HTMLResponse)
-async def read_root(request: Request):
-    custom_model_available = False
-    if not LITE_MODE:
-        custom_model_available = inference_logic.peft_model is not None
-    if not (Path(STATIC_DIR) / "index.html").exists():
-        return HTMLResponse(content="Frontend not found.", status_code=404)
-    return templates.TemplateResponse("index.html", {
-        "request": request,
-        "custom_model_available": custom_model_available,
-        "lite_mode": LITE_MODE
-    })
-@app.get("/model-architecture", response_class=PlainTextResponse)
-async def get_model_architecture():
-    if LITE_MODE: return "Running in LITE mode."
-    if inference_logic.base_model: return str(inference_logic.base_model)
-    return "Model not loaded."
-@app.get("/download-dataset")
-async def download_dataset():
-    file_path = Path("data/dataset.csv")
-    if file_path.exists():
-        return FileResponse(path=file_path, filename="dataset.csv", media_type='text/csv')
-    return Response("Dataset not found.", status_code=404)
-progress_message = ""
-def progress_hook(d):
-    global progress_message
-    if d['status'] == 'downloading':
-        progress_message = f"Downloading: {d.get('_percent_str', 'N/A')} at {d.get('_speed_str', 'N/A')}\r"
-    elif d['status'] == 'finished':
-        progress_message = f"\nDownload finished. Preparing video assets...\n"
-def get_cookies_path():
-    """Look for cookies file in known locations for better yt-dlp support."""
-    candidates = ["cookies.txt", "data/cookies.txt", "/app/cookies.txt"]
-    for c in candidates:
-        if os.path.exists(c):
-            return os.path.abspath(c)
-    return None
-async def run_subprocess_async(command: list[str]):
-    cmd_str = ' '.join(command)
-    logger.info(f"[Subprocess] Running: {cmd_str}")
-    process = await asyncio.create_subprocess_exec(*command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    stdout, stderr = await process.communicate()
-    if process.returncode != 0:
-        err_msg = stderr.decode()
-        logger.error(f"[Subprocess] Failed ({process.returncode}): {err_msg}")
-        raise RuntimeError(f"Command failed: {err_msg}")
-    logger.info(f"[Subprocess] Success.")
-    return stdout.decode()
-def extract_tweet_id(url: str) -> str | None:
-    match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
-    if match: return match.group(1)
-    return None
-def check_if_processed(link: str) -> bool:
-    target_id = extract_tweet_id(link)
-    link_clean = link.split('?')[0].strip().rstrip('/')
-    for filename in ["data/dataset.csv", "data/manual_dataset.csv"]:
-        path = Path(filename)
-        if not path.exists(): continue
-        try:
-            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-                sample = f.read(2048)
-                f.seek(0)
-                try: has_header = csv.Sniffer().has_header(sample)
-                except: has_header = True
-                if has_header:
-                    reader = csv.DictReader(f)
-                    for row in reader:
-                        row_link = row.get('link', '').split('?')[0].strip().rstrip('/')
-                        if row_link == link_clean: return True
-                        row_id = row.get('id', '')
-                        if target_id and row_id == target_id: return True
-                else:
-                    reader = csv.reader(f)
-                    for row in reader:
-                        if not row: continue
-                        if link_clean in row: return True
-                        if target_id and target_id in row: return True
-        except Exception:
-            continue
-    return False
-async def prepare_video_assets_async(url: str) -> dict:
-    global progress_message
-    loop = asyncio.get_event_loop()
-    is_local = not (url.startswith("http://") or url.startswith("https://"))
-    video_id = "unknown"
-    transcript_path = None
-    logger.info(f"Preparing assets for URL: {url}")
-    if is_local:
-        original_path = Path(url)
-        if not original_path.exists(): raise FileNotFoundError(f"File not found: {url}")
-        video_id = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:16]
-        metadata = {"id": video_id, "link": url, "caption": original_path.stem}
-    else:
-        tweet_id = extract_tweet_id(url)
-        video_id = tweet_id if tweet_id else hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
-        sanitized_check = Path(f"data/videos/{video_id}_fixed.mp4")
-        cookies_path = get_cookies_path()
-        ydl_opts = {
-            'format': 'best[ext=mp4]/best',
-            'outtmpl': 'data/videos/%(id)s.%(ext)s',
-            'progress_hooks': [progress_hook],
-            'quiet': False,
-            'no_warnings': False,
-            'noplaylist': True,
-            'no_overwrites': True,
-            'writesubtitles': True,
-            'writeautomaticsub': True,
-            'subtitleslangs': ['en'],
-            'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-        }
-        if cookies_path:
-            ydl_opts['cookiefile'] = cookies_path
-            logger.info(f"Using cookies from {cookies_path}")
-        if sanitized_check.exists():
-            logger.info(f"Video {video_id} already cached at {sanitized_check}")
-            original_path = sanitized_check
-            metadata = {"id": video_id, "link": url, "caption": "Cached Video"}
-        else:
-            try:
-                logger.info(f"Starting yt-dlp download for {video_id}...")
-                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-                    info = await loop.run_in_executor(None, lambda: ydl.extract_info(url, download=True))
-                    original_path = Path(ydl.prepare_filename(info))
-                    metadata = {
-                        "id": info.get("id", video_id), "link": info.get("webpage_url", url),
-                        "caption": info.get("description", info.get("title", "N/A")).encode('ascii', 'ignore').decode('ascii').strip()[:500],
-                        "postdatetime": info.get("upload_date", "N/A")
-                    }
-                    video_id = info.get("id", video_id)
-                logger.info("yt-dlp download successful.")
-            except yt_dlp.utils.DownloadError as e:
-                logger.error(f"yt-dlp download error: {e}")
-                if "No video could be found" in str(e):
-                    raise ValueError(f"No video content found at {url}")
-                raise RuntimeError(f"Download failed: {str(e)}")
-            except Exception as e:
-                logger.error(f"Unexpected yt-dlp error: {e}")
-                raise RuntimeError(f"Download failed: {str(e)}")
-        transcript_path = next(Path("data/videos").glob(f"{video_id}*.en.vtt"), None)
-        if not transcript_path: transcript_path = next(Path("data/videos").glob(f"{video_id}*.vtt"), None)
-    sanitized_path = Path(f"data/videos/{video_id}_fixed.mp4")
-    # --- FFmpeg Sanitization Logic with Robust Fallback ---
-    if not sanitized_path.exists() and original_path.exists():
-        logger.info(f"Sanitizing video {video_id} (Original: {original_path})...")
-        ffmpeg_bin = shutil.which('ffmpeg')
-        if not ffmpeg_bin: raise RuntimeError("FFmpeg binary not found in system path!")
-        try:
-            await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", "-y", str(sanitized_path)])
-            logger.info("Sanitization (re-encode) successful.")
-        except Exception as e:
-            logger.warning(f"Re-encode failed ({e}). Attempting Stream Copy...")
-            try:
-                await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c", "copy", "-y", str(sanitized_path)])
-                logger.info("Sanitization (copy) successful.")
-            except Exception as e2:
-                logger.error(f"Sanitization failed completely: {e2}")
-                if original_path.suffix == '.mp4':
-                    logger.warning("Using original file as sanitized file.")
-                    shutil.copy(original_path, sanitized_path)
-                else:
-                    raise RuntimeError("Could not produce a valid MP4 file.")
-    # --- Audio Extraction ---
-    audio_path = sanitized_path.with_suffix('.wav')
-    if not audio_path.exists() and sanitized_path.exists():
-        logger.info(f"Extracting audio to {audio_path}...")
-        try:
-            await run_subprocess_async(["ffmpeg", "-i", str(sanitized_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", str(audio_path)])
-            logger.info("Audio extraction successful.")
-        except Exception as e:
-            logger.error(f"Audio extraction failed: {e}")
-    # --- Transcription ---
-    if not transcript_path and audio_path.exists() and transcription.transcription_model is not None:
-        logger.info("Generating transcript via Whisper...")
-        transcript_path = await loop.run_in_executor(None, transcription.generate_transcript, str(audio_path))
-    elif not transcript_path:
-        logger.info("Skipping local transcription (Whisper not loaded or audio missing).")
-    return {"video": str(sanitized_path), "transcript": str(transcript_path) if transcript_path else None, "metadata": metadata}
-def safe_int(value):
-    try:
-        clean = re.sub(r'[^\d]', '', str(value))
-        return int(clean) if clean else 0
-    except Exception:
-        return 0
-async def generate_and_save_croissant_metadata(row_data: dict) -> str:
-    try:
-        sanitized_data = {
-            "id": str(row_data.get("id", "")),
-            "link": str(row_data.get("link", "")),
-            "visual_integrity_score": safe_int(row_data.get("visual_integrity_score")),
-            "final_veracity_score": safe_int(row_data.get("final_veracity_score"))
-        }
-        video_id = sanitized_data["id"]
-        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
-        croissant_json = {
-          "@context": "https://schema.org/",
-          "@type": "Dataset",
-          "name": f"vchat-label-{video_id}",
-          "description": f"Veracity analysis labels for video {video_id}",
-          "url": sanitized_data["link"],
-          "variableMeasured": sanitized_data
-        }
-        path = Path("metadata") / f"{video_id}_{timestamp}.json"
-        path.write_text(json.dumps(croissant_json, indent=2))
-        return str(path)
-    except Exception:
-        return "N/A (Error)"
-async def get_labels_for_link(video_url: str, gemini_config: dict, vertex_config: dict, model_selection: str, include_comments: bool, reasoning_method: str = "cot"):
-    try:
-        yield f"Downloading assets for {video_url}..."
-        try:
-            paths = await prepare_video_assets_async(video_url)
-        except ValueError as ve:
-            yield f"Skipped: {str(ve)}"
-            logger.warning(f"Skipping {video_url}: {ve}")
-            return
-        except Exception as e:
-            yield f"Error preparing assets: {str(e)}"
-            logger.error(f"Asset prep failed for {video_url}: {e}")
-            return
-        video_path = paths["video"]
-        transcript_text = parse_vtt(paths["transcript"]) if paths["transcript"] else "No transcript (Audio/Video Analysis only)."
-        caption = paths["metadata"].get("caption", "")
-        yield f"Assets ready. Running inference ({model_selection}, {reasoning_method.upper()})..."
-        logger.info(f"Starting inference pipeline for {video_url} (Transcript len: {len(transcript_text)})")
-        final_labels = None
-        raw_toon = ""
-        prompt_used = ""
-        pipeline = inference_logic.run_gemini_labeling_pipeline if model_selection == 'gemini' else inference_logic.run_vertex_labeling_pipeline
-        config = gemini_config if model_selection == 'gemini' else vertex_config
-        # Add timeout protection for inference
-        try:
-            async for msg in pipeline(video_path, caption, transcript_text, config, include_comments, reasoning_method):
-                if isinstance(msg, dict) and "parsed_data" in msg:
-                    final_labels = msg["parsed_data"]
-                    raw_toon = msg.get("raw_toon", "")
-                    prompt_used = msg.get("prompt_used", "")
-                    logger.info("Inference successful. Data parsed.")
-                elif isinstance(msg, str):
-                    yield msg
-                elif isinstance(msg, dict) and "error" in msg:
-                    yield f"API Error: {msg['error']}"
-        except Exception as pipe_err:
-            logger.error(f"Pipeline crashed: {pipe_err}")
-            yield f"Critical Pipeline Failure: {pipe_err}"
-            return
-        if not final_labels:
-            logger.error(f"Inference pipeline completed but returned no labels for {video_url}")
-            yield "No labels generated. Check logs."
-            return
-        final_labels["meta_info"] = {
-            "prompt_used": prompt_used,
-            "model_selection": model_selection,
-            "reasoning_method": reasoning_method
-        }
-        vec = final_labels.get("veracity_vectors", {})
-        mod = final_labels.get("modalities", {})
-        fin = final_labels.get("final_assessment", {})
-        row = {
-            "id": paths["metadata"]["id"],
-            "link": paths["metadata"]["link"],
-            "caption": caption,
-            "postdatetime": paths["metadata"].get("postdatetime", ""),
-            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-            "videotranscriptionpath": paths["transcript"] or "",
-            "visual_integrity_score": vec.get("visual_integrity_score", "0"),
-            "audio_integrity_score": vec.get("audio_integrity_score", "0"),
-            "source_credibility_score": vec.get("source_credibility_score", "0"),
-            "logical_consistency_score": vec.get("logical_consistency_score", "0"),
-            "emotional_manipulation_score": vec.get("emotional_manipulation_score", "0"),
-            "video_audio_score": mod.get("video_audio_score", "0"),
-            "video_caption_score": mod.get("video_caption_score", "0"),
-            "audio_caption_score": mod.get("audio_caption_score", "0"),
-            "final_veracity_score": fin.get("veracity_score_total", "0"),
-            "final_reasoning": fin.get("reasoning", "")
-        }
-        yield {"csv_row": row, "full_json": final_labels, "raw_toon": raw_toon}
-    except Exception as e:
-        logger.error(f"Fatal error in get_labels_for_link: {e}", exc_info=True)
-        yield {"error": str(e)}
-@app.get("/queue/list")
-async def get_queue_list():
-    queue_path = Path("data/batch_queue.csv")
-    if not queue_path.exists(): return []
-    items = []
-    with open(queue_path, 'r', encoding='utf-8') as f:
-        reader = csv.reader(f)
-        try: next(reader)
-        except: pass
-        for row in reader:
-            if len(row) > 0:
-                link = row[0]
-                status = "Processed" if check_if_processed(link) else "Pending"
-                items.append({
-                    "link": link,
-                    "timestamp": row[1] if len(row) > 1 else "",
-                    "status": status
-                })
-    return items
-@app.delete("/queue/delete")
-async def delete_queue_item(link: str):
-    queue_path = Path("data/batch_queue.csv")
-    if not queue_path.exists():
-        return {"status": "error", "message": "Queue file not found"}
-    rows = []
-    deleted = False
-    try:
-        with open(queue_path, 'r', encoding='utf-8') as f:
-            reader = csv.reader(f)
-            rows = list(reader)
-        new_rows = []
-        if rows and len(rows) > 0 and rows[0][0] == "link":
-             new_rows.append(rows[0])
-             rows = rows[1:]
-        for row in rows:
-            if not row: continue
-            if row[0] == link:
-                deleted = True
-            else:
-                new_rows.append(row)
-        with open(queue_path, 'w', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            writer.writerows(new_rows)
-        if deleted:
-            return {"status": "success", "link": link}
-        else:
-            return {"status": "not_found", "message": "Link not found in queue"}
-    except Exception as e:
-        return {"status": "error", "message": str(e)}
-@app.post("/queue/stop")
-async def stop_queue_processing():
-    global STOP_QUEUE_SIGNAL
-    logger.info("Received Stop Signal from User.")
-    STOP_QUEUE_SIGNAL = True
-    return {"status": "stopping"}
-@app.post("/queue/upload_csv")
-async def upload_csv_to_queue(file: UploadFile = File(...)):
-    try:
-        content = await file.read()
-        try:
-            decoded = content.decode('utf-8').splitlines()
-        except UnicodeDecodeError:
-             decoded = content.decode('latin-1').splitlines()
-        reader = csv.reader(decoded)
-        links_to_add = []
-        header = next(reader, None)
-        if not header: return {"status": "empty file"}
-        link_idx = 0
-        header_lower = [h.lower() for h in header]
-        if "link" in header_lower: link_idx = header_lower.index("link")
-        elif "url" in header_lower: link_idx = header_lower.index("url")
-        elif len(header) > 0 and header[0].strip().startswith("http"):
-            links_to_add.append(header[0])
-            link_idx = 0
-        for row in reader:
-            if len(row) > link_idx and row[link_idx].strip():
-                links_to_add.append(row[link_idx].strip())
-        queue_path = Path("data/batch_queue.csv")
-        existing_links = set()
-        if queue_path.exists():
-            with open(queue_path, 'r', encoding='utf-8') as f:
-                existing_links = set(f.read().splitlines())
-        added_count = 0
-        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            if not queue_path.exists() or queue_path.stat().st_size == 0:
-                writer.writerow(["link", "ingest_timestamp"])
-            for link in links_to_add:
-                duplicate = False
-                for line in existing_links:
-                    if link in line:
-                        duplicate = True
-                        break
-                if duplicate: continue
-                writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
-                added_count += 1
-        return {"status": "success", "added": added_count}
-    except Exception as e:
-        logging.error(f"Upload CSV error: {e}")
-        return JSONResponse(status_code=400, content={"error": str(e), "status": "failed"})
-@app.post("/queue/run")
-async def run_queue_processing(
-    model_selection: str = Form(...),
-    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
-    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
-    include_comments: bool = Form(False),
-    reasoning_method: str = Form("cot")
-):
-    """Label every link in data/batch_queue.csv, streaming progress as Server-Sent Events.
-
-    Resets the global STOP_QUEUE_SIGNAL to False, then processes the queue in
-    file order. A link is skipped when check_if_processed(link) is truthy;
-    otherwise it is handed to get_labels_for_link. Strings produced by that
-    pipeline are forwarded as SSE data lines; the last dict containing
-    "csv_row" is treated as the result, its artifacts are written under data/
-    and its row is appended to data/dataset.csv.
-
-    Returns:
-        StreamingResponse with media type text/event-stream. The stream ends
-        with a "Batch Complete" line followed by an "event: close" frame.
-    """
-    global STOP_QUEUE_SIGNAL
-    STOP_QUEUE_SIGNAL = False
-    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
-    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}
-    async def queue_stream():
-        queue_path = Path("data/batch_queue.csv")
-        if not queue_path.exists():
-            yield "data: Queue empty.\n\n"
-            return
-        items = []
-        with open(queue_path, 'r', encoding='utf-8') as f:
-            reader = csv.reader(f)
-            # Skip the header row; the default keeps an empty file from raising
-            # StopIteration without hiding unrelated errors behind a bare except.
-            next(reader, None)
-            for row in reader:
-                if row: items.append(row[0])
-        processed_count = 0
-        total = len(items)
-        logger.info(f"Starting batch queue processing for {total} items.")
-        for i, link in enumerate(items):
-            # The flag is re-read before each link so a stop request takes effect between items.
-            if STOP_QUEUE_SIGNAL:
-                yield "data: [SYSTEM] Stopped by user.\n\n"
-                logger.info("Stopping queue loop.")
-                break
-            if check_if_processed(link):
-                yield f"data: [SKIP] {link} processed.\n\n"
-                continue
-            yield f"data: [START] {i+1}/{total}: {link}\n\n"
-            final_data = None
-            # Streaming results from pipeline
-            async for res in get_labels_for_link(link, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
-                if isinstance(res, str):
-                    # An SSE frame ends at a blank line, so embedded newlines must be flattened.
-                    msg = res.replace('\n', ' ')
-                    yield f"data: {msg}\n\n"
-                if isinstance(res, dict):
-                    if "error" in res:
-                        # Same flattening as for plain messages: error text may span lines.
-                        detail = str(res['error']).replace('\n', ' ')
-                        yield f"data: [ERROR DETAIL] {detail}\n\n"
-                    if "csv_row" in res:
-                        final_data = res
-            if final_data:
-                row = final_data["csv_row"]
-                vid_id = row["id"]
-                ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
-                # Save artifacts (explicit utf-8, matching the prompt/response writes below)
-                json_path = f"data/labels/{vid_id}_{ts}_labels.json"
-                with open(json_path, 'w', encoding='utf-8') as f: json.dump(final_data["full_json"], f, indent=2)
-                with open(f"data/labels/{vid_id}_{ts}.toon", 'w', encoding='utf-8') as f: f.write(final_data["raw_toon"])
-                prompt_content = final_data.get("full_json", {}).get("meta_info", {}).get("prompt_used", "")
-                if prompt_content:
-                    with open(f"data/prompts/{vid_id}_{ts}_prompt.txt", 'w', encoding='utf-8') as f:
-                        f.write(prompt_content)
-                raw_response = final_data.get("raw_toon", "")
-                if raw_response:
-                    # No timestamp in this name, so a later run for the same id overwrites the file.
-                    with open(f"data/responses/{vid_id}.txt", 'w', encoding='utf-8') as f:
-                        f.write(raw_response)
-                row["metadatapath"] = await generate_and_save_croissant_metadata(row)
-                row["json_path"] = json_path
-                dpath = Path("data/dataset.csv")
-                exists = dpath.exists()
-                with open(dpath, 'a', newline='', encoding='utf-8') as f:
-                    writer = csv.DictWriter(f, fieldnames=list(row.keys()), extrasaction='ignore')
-                    if not exists: writer.writeheader()
-                    writer.writerow(row)
-                processed_count += 1
-                yield f"data: [SUCCESS] Labeled.\n\n"
-            else:
-                yield f"data: [FAIL] Failed to label. Check logs.\n\n"
-        yield f"data: Batch Complete. +{processed_count} videos labeled.\n\n"
-        yield "event: close\ndata: Done\n\n"
-    return StreamingResponse(queue_stream(), media_type="text/event-stream")
-@app.post("/extension/ingest")
-async def extension_ingest(request: Request):
-    """Append a link sent by the browser extension to data/batch_queue.csv.
-
-    The JSON body must contain a non-empty "link". A link already present in
-    the first column of the queue is reported as a duplicate and not re-added.
-
-    Raises:
-        HTTPException(400): the body has no "link".
-        HTTPException(500): any other failure (unreadable body, I/O error).
-    """
-    try:
-        data = await request.json()
-        link = data.get("link")
-        if not link: raise HTTPException(status_code=400, detail="No link")
-        queue_path = Path("data/batch_queue.csv")
-        file_exists = queue_path.exists()
-        if file_exists:
-            with open(queue_path, 'r', encoding='utf-8', newline='') as f:
-                # Compare against the link column exactly; a substring test on the whole
-                # file would flag ".../status/1" as a duplicate of ".../status/12".
-                if any(row and row[0] == link for row in csv.reader(f)):
-                    return {"status": "queued", "msg": "Duplicate"}
-        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.writer(f)
-            if not file_exists: writer.writerow(["link", "ingest_timestamp"])
-            writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")])
-        return {"status": "queued", "link": link}
-    except HTTPException:
-        # Let the deliberate 400 above through instead of rewrapping it as a 500.
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-@app.post("/extension/save_comments")
-async def extension_save_comments(request: Request):
-    """Append comments scraped by the browser extension to data/comments.csv.
-
-    The JSON body must contain "link" and a non-empty "comments" list. Each
-    comment is either a dict (keys "author" and "text" are read) or any other
-    value, which is stringified. Comments whose text is empty after stripping
-    are not written.
-
-    Returns:
-        {"status": "saved", "count": n} where n is the number of comments received.
-
-    Raises:
-        HTTPException(400): "link" or "comments" is missing or empty.
-        HTTPException(500): any other failure.
-    """
-    try:
-        data = await request.json()
-        link = data.get("link")
-        comments = data.get("comments", [])
-        if not link or not comments: raise HTTPException(status_code=400, detail="Missing data")
-        csv_path = Path("data/comments.csv")
-        exists = csv_path.exists()
-        fieldnames = ["link", "author", "comment_text", "timestamp"]
-        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
-            if not exists: writer.writeheader()
-            ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-            for c in comments:
-                row = {"link": link, "timestamp": ts}
-                if isinstance(c, dict):
-                    row["author"] = c.get("author", "Unknown")
-                    # "text" may be null or a non-string in the payload; calling
-                    # .strip() on it directly would fail the whole request.
-                    text = c.get("text")
-                    row["comment_text"] = "" if text is None else str(text).strip()
-                else:
-                    row["author"] = "Unknown"
-                    row["comment_text"] = str(c).strip()
-                if row["comment_text"]:
-                    writer.writerow(row)
-        return {"status": "saved", "count": len(comments)}
-    except HTTPException:
-        # Let the deliberate 400 above through instead of rewrapping it as a 500.
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-@app.post("/extension/save_manual")
-async def extension_save_manual(request: Request):
-    """Append a manually labelled video from the browser extension to data/manual_dataset.csv.
-
-    The JSON body must contain "link"; "labels", "stats" and "caption" are
-    optional and missing scores default to 0. The row id is the tweet id
-    returned by extract_tweet_id, or the first 16 hex characters of the MD5 of
-    the link when that returns nothing.
-
-    Raises:
-        HTTPException(400): the body has no "link".
-        HTTPException(500): any other failure.
-    """
-    try:
-        data = await request.json()
-        link = data.get("link")
-        # "or {}" also covers an explicit JSON null, which .get's default does not.
-        labels = data.get("labels") or {}
-        stats = data.get("stats") or {}
-        if not link: raise HTTPException(status_code=400, detail="No link")
-        video_id = extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:16]
-        row_data = {
-            "id": video_id,
-            "link": link,
-            "caption": data.get("caption", ""),
-            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-            "source": "manual_extension",
-            "visual_integrity_score": labels.get("visual_integrity_score", 0),
-            "audio_integrity_score": labels.get("audio_integrity_score", 0),
-            "source_credibility_score": labels.get("source_credibility_score", 0),
-            "logical_consistency_score": labels.get("logical_consistency_score", 0),
-            "emotional_manipulation_score": labels.get("emotional_manipulation_score", 0),
-            "video_audio_score": labels.get("video_audio_score", 0),
-            "video_caption_score": labels.get("video_caption_score", 0),
-            "audio_caption_score": labels.get("audio_caption_score", 0),
-            "final_veracity_score": labels.get("final_veracity_score", 0),
-            "final_reasoning": labels.get("reasoning", ""),
-            "stats_likes": stats.get("likes", 0),
-            "stats_shares": stats.get("shares", 0),
-            "stats_comments": stats.get("comments", 0),
-            "stats_platform": stats.get("platform", "unknown")
-        }
-        dpath = Path("data/manual_dataset.csv")
-        exists = dpath.exists()
-        fieldnames = list(row_data.keys())
-        with open(dpath, 'a', newline='', encoding='utf-8') as f:
-            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
-            if not exists: writer.writeheader()
-            writer.writerow(row_data)
-        return {"status": "saved"}
-    except HTTPException:
-        # Let the deliberate 400 above through instead of rewrapping it as a 500.
-        raise
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-@app.get("/manage/list")
-async def list_data():
-    """Return all rows of data/dataset.csv and data/manual_dataset.csv, newest first.
-
-    Each row is the CSV record as a dict plus two added keys: "source_type"
-    ("auto" or "manual") and "json_data" (the parsed file at the row's
-    "json_path", or None when there is no such file or it cannot be read).
-    Rows without an id get one derived from the link: the tweet id from
-    extract_tweet_id, else the first 16 hex characters of the link's MD5.
-    Rows are sorted by their "collecttime" string, descending.
-    """
-    data = []
-    def read_csv(path, source_type):
-        if not path.exists(): return
-        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-            reader = csv.DictReader(f)
-            for row in reader:
-                # DictReader fills missing trailing columns with None, so treat
-                # None like an empty string for both the id and the link.
-                if not (row.get('id') or '').strip():
-                    link = row.get('link') or ''
-                    tid = extract_tweet_id(link)
-                    row['id'] = tid if tid else hashlib.md5(link.encode()).hexdigest()[:16]
-                json_content = None
-                if row.get('json_path') and os.path.exists(row['json_path']):
-                    try:
-                        with open(row['json_path'], 'r', encoding='utf-8') as jf: json_content = json.load(jf)
-                    except (OSError, ValueError) as e:
-                        # Best effort: an unreadable or corrupt label file must not hide the row.
-                        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
-                        logger.warning(f"Could not load label JSON {row['json_path']}: {e}")
-                row['source_type'] = source_type
-                row['json_data'] = json_content
-                data.append(row)
-    read_csv(Path("data/dataset.csv"), "auto")
-    read_csv(Path("data/manual_dataset.csv"), "manual")
-    # "or ''" keeps the sort from comparing None with str when a row lacks collecttime.
-    data.sort(key=lambda x: x.get('collecttime') or '', reverse=True)
-    return data
-@app.delete("/manage/delete")
-async def delete_data(id: str = "", link: str = ""):
-    """Delete dataset rows matching an id or link, plus their label/metadata files.
-
-    Rows are removed from data/dataset.csv and data/manual_dataset.csv. A row
-    matches when its "id" equals ``id``; when ``id`` is not given or does not
-    equal the row's id, a row matches when its "link" equals ``link``. If only
-    a link was supplied, the id of the first matching row is used to find the
-    artifact files named "<id>_*" under data/labels and metadata.
-
-    Returns:
-        {"status": "deleted", "count": n} where n is the number of rows removed.
-
-    Raises:
-        HTTPException(400): neither id nor link was provided.
-    """
-    import glob  # local import: only needed here, for glob.escape
-    if not id and not link: raise HTTPException(status_code=400, detail="Must provide ID or Link")
-    deleted_count = 0
-    target_id = id
-    def remove_from_csv(path):
-        nonlocal deleted_count, target_id
-        if not path.exists(): return
-        rows = []
-        found_in_file = False
-        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
-            reader = csv.DictReader(f)
-            fieldnames = reader.fieldnames
-            for row in reader:
-                is_match = False
-                if id and row.get('id') == id: is_match = True
-                elif link and row.get('link') == link: is_match = True
-                if is_match:
-                    found_in_file = True
-                    deleted_count += 1
-                    if not target_id: target_id = row.get('id')
-                else: rows.append(row)
-        # Rewrite only when something was removed, so untouched files keep their bytes.
-        if found_in_file:
-            with open(path, 'w', newline='', encoding='utf-8') as f:
-                writer = csv.DictWriter(f, fieldnames=fieldnames)
-                writer.writeheader()
-                writer.writerows(rows)
-    remove_from_csv(Path("data/dataset.csv"))
-    remove_from_csv(Path("data/manual_dataset.csv"))
-    # target_id comes from the query string (or from CSV content), so it is untrusted:
-    # refuse path separators and escape glob metacharacters, otherwise id="*" would
-    # delete every artifact and "../" could reach outside the artifact folders.
-    if target_id and "/" not in target_id and "\\" not in target_id:
-        pattern = f"{glob.escape(target_id)}_*"
-        for p in Path("data/labels").glob(pattern): p.unlink(missing_ok=True)
-        for p in Path("metadata").glob(pattern): p.unlink(missing_ok=True)
-    return {"status": "deleted", "count": deleted_count}
-@app.post("/label_video")
-async def label_video_endpoint(
-    video_url: str = Form(...), model_selection: str = Form(...),
-    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
-    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
-    include_comments: bool = Form(False),
-    reasoning_method: str = Form("cot")
-):
-    """Label a single video URL and stream pipeline progress as Server-Sent Events.
-
-    String items from get_labels_for_link are forwarded as SSE data lines; a
-    dict item containing "csv_row" produces a "Done. Labels generated." line.
-    Nothing is written to disk by this endpoint itself.
-
-    Returns:
-        StreamingResponse with media type text/event-stream, terminated by an
-        "event: close" frame.
-    """
-    gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name}
-    vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}
-    async def stream():
-        async for msg in get_labels_for_link(video_url, gemini_config, vertex_config, model_selection, include_comments, reasoning_method):
-            if isinstance(msg, str):
-                # A blank line ends an SSE frame, so flatten embedded newlines
-                # (same handling as the /queue/run stream).
-                flat = msg.replace('\n', ' ')
-                yield f"data: {flat}\n\n"
-            if isinstance(msg, dict) and "csv_row" in msg: yield "data: Done. Labels generated.\n\n"
-        yield "event: close\ndata: Done.\n\n"
-    return StreamingResponse(stream(), media_type="text/event-stream")

src/factuality_logic.py DELETED Viewed

@@ -1,143 +0,0 @@
-# factuality_logic.py
-import os
-import re
-import json
-import logging
-import asyncio
-from pathlib import Path
-import inference_logic
-from toon_parser import parse_toon_line
-logger = logging.getLogger(__name__)
-# Prompt for the visual check. It is sent verbatim (never passed through
-# str.format), so the braces of the TOON header stay single.
-PROMPT_VISUAL_ARTIFACTS = (
-    "Analyze the video for visual manipulation (Deepfakes, editing anomalies).\n"
-    "Steps inside <thinking>: 1. Scan for artifacts. 2. Check cuts.\n"
-    "Output TOON format:\n"
-    "visual_analysis: result[2]{score,justification}:\n"
-    "Score(1-10),\"Justification text\""
-)
-# The two templates below are filled with .format(transcript=...). The literal
-# braces of the TOON header are therefore doubled: a single "{score,justification}"
-# would be parsed as a replacement field and raise KeyError at format time.
-PROMPT_CONTENT_ANALYSIS = (
-    "Analyze the content for accuracy and logic.\n"
-    "Steps inside <thinking>: 1. Identify claims. 2. Check fallacies. 3. Assess emotion.\n"
-    "**Transcript:**\n{transcript}\n"
-    "Output TOON format:\n"
-    "content_analysis: result[2]{{score,justification}}:\n"
-    "Score(1-10),\"Justification text\""
-)
-PROMPT_AUDIO_ANALYSIS = (
-    "Analyze audio for synthesis or manipulation.\n"
-    "Steps inside <thinking>: 1. Listen for robotic inflections. 2. Check lip-sync.\n"
-    "**Transcript:**\n{transcript}\n"
-    "Output TOON format:\n"
-    "audio_analysis: result[2]{{score,justification}}:\n"
-    "Score(1-10),\"Justification text\""
-)
-def parse_vtt(file_path: str) -> str:
-    """Extract the spoken text from a WebVTT subtitle file.
-
-    Blank lines, lines starting with "WEBVTT", cue timing lines (containing
-    "-->") and all-digit lines are dropped. Inline tags such as <c> are removed
-    from the remaining lines, and a line identical to the one kept just before
-    it is skipped.
-
-    Returns:
-        The kept lines joined with newlines, or a fixed message when the file
-        is missing, contains no text, or cannot be read.
-    """
-    try:
-        if not os.path.exists(file_path):
-            return "Transcript file not found."
-        with open(file_path, 'r', encoding='utf-8') as handle:
-            raw_lines = handle.readlines()
-        spoken = []
-        for raw in raw_lines:
-            candidate = raw.strip()
-            # Structural lines: empty, file header, cue timings, cue numbers.
-            if not candidate or candidate.startswith('WEBVTT'):
-                continue
-            if '-->' in candidate or candidate.isdigit():
-                continue
-            untagged = re.sub(r'<[^>]+>', '', candidate)
-            if not untagged:
-                continue
-            # Collapse immediate repeats of the same caption line.
-            if spoken and spoken[-1] == untagged:
-                continue
-            spoken.append(untagged)
-        if not spoken:
-            return "No speech found in transcript."
-        return "\n".join(spoken)
-    except Exception as e:
-        logger.error(f"Error parsing VTT file {file_path}: {e}")
-        return f"Error reading transcript: {e}"
-async def run_factuality_pipeline(paths: dict, checks: dict, generation_config: dict):
-    """Run the selected factuality checks on a video, yielding progress text.
-
-    Args:
-        paths: reads "video" (required) and "transcript" (optional VTT path).
-        checks: truthy "visuals", "content" and "audio" entries enable the
-            corresponding analysis, in that order.
-        generation_config: copied for each step; "sampling_fps" (default 2.0)
-            and "num_perceptions" are removed from the copy, and temperature
-            0.1 with do_sample True are set on it.
-
-    Yields:
-        Human-readable status and result strings. A step whose output has no
-        parsable TOON line yields a warning and the next step runs; an
-        exception inside a step yields an error and ends the remaining steps.
-    """
-    video_path = paths.get("video")
-    transcript_path = paths.get("transcript")
-    if not video_path:
-        yield "ERROR: Video path not found. Cannot start analysis.\n\n"
-        return
-    yield "Step 1: Processing Transcript...\n"
-    await asyncio.sleep(0.1)
-    have_transcript = bool(transcript_path) and os.path.exists(transcript_path)
-    if have_transcript:
-        transcript = parse_vtt(transcript_path)
-        yield "  - Transcript file found and processed.\n"
-    else:
-        transcript = "No transcript was downloaded for this video."
-        yield "  - No transcript file was found.\n"
-    yield f"\n--- Extracted Transcript ---\n{transcript}\n--------------------------\n\n"
-    await asyncio.sleep(0.1)
-    # Build the ordered list of (title, prompt) pairs for the enabled checks.
-    analysis_steps = []
-    if checks.get("visuals"):
-        analysis_steps.append(("Visual Integrity", PROMPT_VISUAL_ARTIFACTS))
-    if checks.get("content"):
-        analysis_steps.append(("Content Veracity", PROMPT_CONTENT_ANALYSIS.format(transcript=transcript)))
-    if checks.get("audio"):
-        analysis_steps.append(("Audio Forensics", PROMPT_AUDIO_ANALYSIS.format(transcript=transcript)))
-    # Step numbering starts at 2 because transcript processing was step 1.
-    for step_no, (title, prompt) in enumerate(analysis_steps, start=2):
-        yield f"--- Step {step_no}: Running '{title}' Analysis ---\n"
-        yield "(Model is generating TOON analysis with scores...)\n\n"
-        await asyncio.sleep(0.1)
-        try:
-            step_config = generation_config.copy()
-            sampling_fps = step_config.pop("sampling_fps", 2.0)
-            step_config.pop("num_perceptions", None)
-            step_config.update({"temperature": 0.1, "do_sample": True})
-            ans = inference_logic.inference_step(
-                video_path=video_path,
-                prompt=prompt,
-                generation_kwargs=step_config,
-                sampling_fps=sampling_fps,
-                pred_glue=None
-            )
-            yield f"  - Analysis Complete for '{title}'. Parsing TOON...\n\n"
-            toon_match = re.search(r'(\w+_analysis): result\[2\]\{score,justification\}:\s*\n(.+)', ans, re.MULTILINE)
-            if toon_match is None:
-                logger.warning(f"Could not parse TOON for '{title}'. Raw: {ans}")
-                yield f"Warning: Model did not return valid TOON. Raw output:\n{ans}\n"
-                continue
-            think_match = re.search(r'<thinking>(.*?)</thinking>', ans, re.DOTALL)
-            thinking = think_match.group(1).strip() if think_match else "No thinking block found."
-            key, value_line = toon_match.groups()
-            parsed_result = parse_toon_line({'key': key, 'headers': ['score', 'justification']}, value_line.strip())
-            score = parsed_result.get('score', 'N/A')
-            justification = parsed_result.get('justification', 'No justification provided.')
-            yield f"===== ANALYSIS RESULT: {title.upper()} =====\n"
-            yield f"SCORE: {score}/10\n"
-            yield f"Reasoning (Step-by-Step): {thinking}\n"
-            yield f"Final Justification: {justification}\n\n"
-            yield "========================================\n\n"
-        except Exception as e:
-            error_message = f"An error occurred during the '{title}' analysis step: {e}"
-            logger.error(error_message, exc_info=True)
-            yield f"ERROR: {error_message}\n\n"
-            break
-    yield "Factuality Analysis Pipeline Finished.\n"

src/inference_logic.py DELETED Viewed

@@ -1,303 +0,0 @@
-import torch
-import re
-import ast
-import sys
-import os
-import time
-import logging
-import asyncio
-import json
-from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
-from peft import PeftModel
-from labeling_logic import (
-    LABELING_PROMPT_TEMPLATE, SCORE_INSTRUCTIONS_SIMPLE, SCORE_INSTRUCTIONS_REASONING,
-    SCHEMA_SIMPLE, SCHEMA_REASONING,
-    FCOT_MACRO_PROMPT, FCOT_MESO_PROMPT, FCOT_SYNTHESIS_PROMPT
-)
-from toon_parser import parse_veracity_toon
-# Optional local imports
-try:
-    from my_vision_process import process_vision_info, client
-except ImportError:
-    process_vision_info = None
-    client = None
-# Google GenAI Imports
-try:
-    import google.generativeai as genai_legacy
-    from google.generativeai.types import generation_types, HarmCategory, HarmBlockThreshold
-except ImportError:
-    genai_legacy = None
-try:
-    # Modern Google GenAI SDK (v1)
-    from google import genai
-    from google.genai.types import (
-        GenerateContentConfig,
-        HttpOptions,
-        Retrieval,
-        Tool,
-        VertexAISearch,
-        GoogleSearch,
-        Part,
-        SafetySetting
-    )
-    import vertexai
-except ImportError:
-    genai = None
-    vertexai = None
-# True only when the LITE_MODE environment variable is "true" (case-insensitive); unset means False.
-LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"
-# Module-level model handles, all initialised to None here.
-processor = None
-base_model = None
-peft_model = None
-active_model = None
-logger = logging.getLogger(__name__)
-def load_models():
-    """No-op placeholder: the body does nothing and leaves the module-level model handles untouched."""
-    pass
-async def attempt_toon_repair(original_text: str, schema: str, client, model_type: str, config: dict):
-    """Ask a model to reformat malformed output into the given TOON schema.
-
-    Args:
-        original_text: the model output that failed to parse.
-        schema: the TOON schema text embedded in the repair prompt.
-        client: an existing google-genai client to reuse for 'vertex'; when
-            falsy a new Vertex client is built from config['project_id'] and
-            config['location']. Not used for 'gemini'.
-        model_type: 'gemini' (legacy SDK, fixed model
-            "models/gemini-2.0-flash-exp") or 'vertex' (config['model_name']).
-        config: provider settings; only read for 'vertex'.
-
-    Returns:
-        The repaired text, or ``original_text`` unchanged when the call fails,
-        the model type is not recognised, or the model returns nothing.
-    """
-    logger.info("Attempting TOON Repair...")
-    repair_prompt = f"SYSTEM: Reformat the following text into strict TOON schema. Infer missing scores as 0.\n\nSCHEMA:\n{schema}\n\nINPUT:\n{original_text}\n"
-    try:
-        # We are inside a coroutine, so the running loop is the one to use.
-        loop = asyncio.get_running_loop()
-        repaired_text = ""
-        if model_type == 'gemini':
-            model = genai_legacy.GenerativeModel("models/gemini-2.0-flash-exp")
-            # The SDK call is blocking; run it in the default executor.
-            response = await loop.run_in_executor(None, lambda: model.generate_content(repair_prompt))
-            repaired_text = response.text
-        elif model_type == 'vertex':
-            cl = client if client else genai.Client(vertexai=True, project=config['project_id'], location=config['location'])
-            response = await loop.run_in_executor(None, lambda: cl.models.generate_content(model=config['model_name'], contents=repair_prompt))
-            repaired_text = response.text
-        # An unknown model_type or an empty/None reply must not replace the
-        # caller's text with nothing: fall back to what we were given.
-        return repaired_text or original_text
-    except Exception as e:
-        logger.error(f"Repair failed: {e}")
-        return original_text
-async def run_gemini_labeling_pipeline(video_path: str, caption: str, transcript: str, gemini_config: dict, include_comments: bool, reasoning_method: str = "cot"):
-    """Label a local video with the legacy Gemini SDK, yielding progress and the result.
-
-    The video is uploaded, polled every 5 seconds until its state is ACTIVE
-    (giving up after 5 minutes or on FAILED), then prompted either in one shot
-    or, when ``reasoning_method`` is "fcot", as a three-turn chat (macro, meso,
-    synthesis). If the parsed visual_integrity_score is the string '0', one
-    repair pass is attempted via attempt_toon_repair.
-
-    Args:
-        video_path: local path of the video file (uploaded as video/mp4).
-        caption, transcript: text inserted into the prompt templates.
-        gemini_config: reads "api_key" (required) and "model_name" (falls
-            back to "models/gemini-2.0-flash-exp" when empty).
-        include_comments: selects the reasoning schema/instructions instead of
-            the simple ones.
-        reasoning_method: "fcot" for the chat flow; anything else is single-shot.
-
-    Yields:
-        str progress or error messages; {"error": ...} on an empty response;
-        finally {"raw_toon", "parsed_data", "prompt_used"} on success.
-    """
-    if genai_legacy is None:
-        yield "ERROR: Legacy SDK missing.\n"
-        return
-    api_key = gemini_config.get("api_key")
-    if not api_key:
-        yield "ERROR: No Gemini API Key provided."
-        return
-    logger.info(f"[Gemini] Initializing with model {gemini_config.get('model_name')}")
-    safety_settings = [
-        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
-        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
-        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
-        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
-    ]
-    # Tracked outside the try so the finally clause can delete the remote copy
-    # on every exit path (failure, timeout, empty response, exception), not
-    # only after a successful result.
-    uploaded_file = None
-    loop = asyncio.get_running_loop()
-    try:
-        genai_legacy.configure(api_key=api_key)
-        # 1. Upload File
-        logger.info(f"[Gemini] Uploading video file: {video_path}...")
-        yield f"Uploading video to Gemini..."
-        uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.upload_file(path=video_path, mime_type="video/mp4"))
-        logger.info(f"[Gemini] Upload complete. URI: {uploaded_file.uri} | State: {uploaded_file.state.name}")
-        # 2. Wait for Processing (Fix: Refresh state in loop)
-        wait_start = time.time()
-        while True:
-            # Refresh file status
-            uploaded_file = await loop.run_in_executor(None, lambda: genai_legacy.get_file(uploaded_file.name))
-            state_name = uploaded_file.state.name
-            if state_name == "ACTIVE":
-                logger.info("[Gemini] Video processing complete. Ready for inference.")
-                break
-            elif state_name == "FAILED":
-                logger.error(f"[Gemini] Video processing failed on server side.")
-                yield "ERROR: Google failed to process video."
-                return
-            if time.time() - wait_start > 300: # 5 minute timeout
-                logger.error("[Gemini] Video processing timed out.")
-                yield "ERROR: Video processing timed out."
-                return
-            logger.info(f"[Gemini] Processing video... (State: {state_name})")
-            yield "Processing video on Google servers..."
-            await asyncio.sleep(5)
-        # 3. Prepare Inference
-        model_name = gemini_config.get("model_name") or "models/gemini-2.0-flash-exp"
-        model = genai_legacy.GenerativeModel(model_name)
-        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
-        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE
-        raw_text = ""
-        prompt_used = ""
-        gen_config = {"temperature": 0.1}
-        logger.info(f"[Gemini] Starting inference with method: {reasoning_method}")
-        if reasoning_method == "fcot":
-            yield "Starting FCoT (Gemini)..."
-            chat = model.start_chat(history=[])
-            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
-            logger.info("[Gemini] Sending Macro Prompt...")
-            res1 = await loop.run_in_executor(None, lambda: chat.send_message([uploaded_file, macro_prompt], safety_settings=safety_settings))
-            macro_hypothesis = res1.text
-            yield f"Hypothesis: {macro_hypothesis[:100]}...\n"
-            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
-            logger.info("[Gemini] Sending Meso Prompt...")
-            # The meso reply is not read directly; it stays in the chat history for the synthesis turn.
-            res2 = await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt, safety_settings=safety_settings))
-            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
-            logger.info("[Gemini] Sending Synthesis Prompt...")
-            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt, safety_settings=safety_settings))
-            raw_text = res3.text
-            prompt_used = f"FCoT:\n{macro_prompt}\n..."
-        else:
-            prompt_text = LABELING_PROMPT_TEMPLATE.format(caption=caption, transcript=transcript, toon_schema=toon_schema, score_instructions=score_instructions)
-            prompt_used = prompt_text
-            yield f"Generating Labels ({model_name})..."
-            logger.info("[Gemini] Sending standard generation request...")
-            response = await loop.run_in_executor(
-                None,
-                lambda: model.generate_content([prompt_text, uploaded_file], generation_config=gen_config, safety_settings=safety_settings)
-            )
-            raw_text = response.text
-        # Log response info
-        logger.info(f"[Gemini] Response received. Length: {len(raw_text)}")
-        if not raw_text:
-             yield "Model returned empty response (Check API quota or safety)."
-             yield {"error": "Empty Response - likely safety block"}
-             return
-        parsed_data = parse_veracity_toon(raw_text)
-        # A '0' visual score is used as the signal that parsing did not find real values.
-        if parsed_data['veracity_vectors']['visual_integrity_score'] == '0':
-             yield "Auto-Repairing output..."
-             raw_text = await attempt_toon_repair(raw_text, toon_schema, None, 'gemini', gemini_config)
-             parsed_data = parse_veracity_toon(raw_text)
-        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}
-    except Exception as e:
-        logger.error(f"Gemini Pipeline Error: {e}", exc_info=True)
-        yield f"ERROR (Gemini): {e}"
-    finally:
-        # Cleanup: best effort, a failed delete is logged and never raised.
-        if uploaded_file is not None:
-            try:
-                logger.info(f"[Gemini] Deleting remote file {uploaded_file.name}")
-                await loop.run_in_executor(None, lambda: genai_legacy.delete_file(name=uploaded_file.name))
-            except Exception as cleanup_err:
-                logger.warning(f"Failed to cleanup file: {cleanup_err}")
-async def run_vertex_labeling_pipeline(video_path: str, caption: str, transcript: str, vertex_config: dict, include_comments: bool, reasoning_method: str = "cot"):
-    """Label a local video through Vertex AI (google-genai SDK), yielding progress and the result.
-
-    The video file is read fully into memory and sent inline as a video/mp4
-    part. The model is prompted either in one shot or, when
-    ``reasoning_method`` is "fcot", as a three-turn chat (macro, meso,
-    synthesis), with the Google Search tool enabled. If the parsed
-    visual_integrity_score is the string '0', one repair pass is attempted via
-    attempt_toon_repair.
-
-    Args:
-        video_path: local path of the video file.
-        caption, transcript: text inserted into the prompt templates.
-        vertex_config: reads "project_id" (required), "location" (default
-            "us-central1") and "model_name" (default "gemini-2.5-flash-lite");
-            empty strings count as missing.
-        include_comments: selects the reasoning schema/instructions instead of
-            the simple ones.
-        reasoning_method: "fcot" for the chat flow; anything else is single-shot.
-
-    Yields:
-        str progress or error messages; {"error": "Empty Response"} on an empty
-        reply; finally {"raw_toon", "parsed_data", "prompt_used"} on success.
-    """
-    if genai is None:
-        yield "ERROR: 'google-genai' not installed.\n"
-        return
-    project_id = vertex_config.get("project_id")
-    if not project_id:
-        yield "ERROR: No Vertex Project ID."
-        return
-    logger.info(f"[Vertex] Initializing for project {project_id}")
-    safety_settings = [
-        SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="BLOCK_ONLY_HIGH"),
-        SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="BLOCK_ONLY_HIGH"),
-        SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="BLOCK_ONLY_HIGH"),
-        SafetySetting(category="HARM_CATEGORY_HARASSMENT", threshold="BLOCK_ONLY_HIGH"),
-    ]
-    try:
-        # The HTTP endpoints build this dict with every key present and "" as the
-        # value for blank form fields, so .get(key, default) would never fall back;
-        # "or" applies the default for empty strings too (as the Gemini pipeline does).
-        location = vertex_config.get("location") or "us-central1"
-        client = genai.Client(vertexai=True, project=project_id, location=location)
-        # For Vertex, we send bytes directly (up to a limit) or use Cloud Storage.
-        # v1 SDK Part.from_bytes is easiest for small/medium videos (< 20MB approx, but allows more in some versions).
-        # For larger videos in HF Spaces, this might time out if not using GCS.
-        # Assuming direct upload for now.
-        logger.info(f"[Vertex] Reading local video file: {video_path}")
-        with open(video_path, 'rb') as f: video_bytes = f.read()
-        video_part = Part.from_bytes(data=video_bytes, mime_type="video/mp4")
-        toon_schema = SCHEMA_REASONING if include_comments else SCHEMA_SIMPLE
-        score_instructions = SCORE_INSTRUCTIONS_REASONING if include_comments else SCORE_INSTRUCTIONS_SIMPLE
-        model_name = vertex_config.get("model_name") or "gemini-2.5-flash-lite"
-        raw_text = ""
-        prompt_used = ""
-        loop = asyncio.get_running_loop()
-        config = GenerateContentConfig(
-            temperature=0.1,
-            response_mime_type="text/plain",
-            tools=[Tool(google_search=GoogleSearch())],
-            safety_settings=safety_settings
-        )
-        logger.info(f"[Vertex] Starting inference with {model_name}")
-        if reasoning_method == "fcot":
-            yield "Starting FCoT (Vertex)..."
-            chat = client.chats.create(model=model_name, config=config)
-            macro_prompt = FCOT_MACRO_PROMPT.format(caption=caption, transcript=transcript)
-            logger.info("[Vertex] Sending Macro Prompt...")
-            res1 = await loop.run_in_executor(None, lambda: chat.send_message([video_part, macro_prompt]))
-            # .text can be None (e.g. a blocked or tool-only reply); normalise so
-            # slicing and len() below cannot raise TypeError.
-            macro_hypothesis = res1.text or ""
-            yield f"Hypothesis: {macro_hypothesis[:80]}...\n"
-            meso_prompt = FCOT_MESO_PROMPT.format(macro_hypothesis=macro_hypothesis)
-            logger.info("[Vertex] Sending Meso Prompt...")
-            # The meso reply is not read directly; it stays in the chat history for the synthesis turn.
-            res2 = await loop.run_in_executor(None, lambda: chat.send_message(meso_prompt))
-            synthesis_prompt = FCOT_SYNTHESIS_PROMPT.format(toon_schema=toon_schema, score_instructions=score_instructions)
-            logger.info("[Vertex] Sending Synthesis Prompt...")
-            res3 = await loop.run_in_executor(None, lambda: chat.send_message(synthesis_prompt))
-            raw_text = res3.text or ""
-            prompt_used = f"FCoT (Vertex):\n{macro_prompt}..."
-        else:
-            prompt_text = LABELING_PROMPT_TEMPLATE.format(caption=caption, transcript=transcript, toon_schema=toon_schema, score_instructions=score_instructions)
-            prompt_used = prompt_text
-            yield f"Generating Labels ({model_name})..."
-            logger.info("[Vertex] Sending standard generation request...")
-            response = await loop.run_in_executor(
-                None,
-                lambda: client.models.generate_content(model=model_name, contents=[video_part, prompt_text], config=config)
-            )
-            raw_text = response.text or ""
-        logger.info(f"[Vertex] Response Length: {len(raw_text)}")
-        if not raw_text:
-             yield "Model returned empty response."
-             yield {"error": "Empty Response"}
-             return
-        parsed_data = parse_veracity_toon(raw_text)
-        # A '0' visual score is used as the signal that parsing did not find real values.
-        if parsed_data['veracity_vectors']['visual_integrity_score'] == '0':
-            yield "Auto-Repairing output..."
-            raw_text = await attempt_toon_repair(raw_text, toon_schema, client, 'vertex', vertex_config)
-            parsed_data = parse_veracity_toon(raw_text)
-        yield {"raw_toon": raw_text, "parsed_data": parsed_data, "prompt_used": prompt_used}
-    except Exception as e:
-        yield f"ERROR (Vertex): {e}"
-        logger.error("Vertex Labeling Error", exc_info=True)
-async def run_gemini_pipeline(video_path, question, checks, gemini_config, generation_config=None):
-    """Stub for the legacy Gemini pipeline: ignores all arguments and yields one notice string."""
-    notice = "Legacy pipeline not fully supported in HF Space."
-    yield notice
-async def run_vertex_pipeline(video_path, question, checks, vertex_config, generation_config=None):
-    """Stub for the legacy Vertex pipeline: ignores all arguments and yields one notice string."""
-    notice = "Legacy pipeline not fully supported in HF Space."
-    yield notice

src/labeling_logic.py DELETED Viewed

@@ -1,145 +0,0 @@
-# labeling_logic.py
-# Single-shot labeling prompt. It is a str.format template with four
-# placeholders: {caption}, {transcript}, {toon_schema} and {score_instructions}.
-# The doubled braces around "Headers" are format escapes that render as
-# literal "{ Headers }" in the final prompt.
-# The text deliberately ends with an opening <thinking> tag.
-LABELING_PROMPT_TEMPLATE = """
-You are an AI Factuality Assessment Agent operating under the "Ali Arsanjani Factuality Factors" framework.
-Your goal is to mass-label video content, quantifying "Veracity Vectors" and "Modality Alignment".
-**INPUT DATA:**
-- **User Caption:** "{caption}"
-- **Audio Transcript:** "{transcript}"
-- **Visuals:** (Provided in video context)
-**INSTRUCTIONS:**
-1.  **Grounding:** Cross-reference claims in the transcript with your internal knowledge base (and tools if active).
-2.  **Chain of Thought (<thinking>):** You MUST think step-by-step inside a `<thinking>` block before generating output.
-    *   Analyze *Visual Integrity* (Artifacts, edits).
-    *   Analyze *Audio Integrity* (Voice cloning, sync).
-    *   Analyze *Modality Alignment* (Does video match audio? Does caption match content? Does audio match caption?).
-    *   Analyze *Logic* (Fallacies, gaps).
-    *   Determine *Disinformation* classification.
-3.  **Output Format:** Output strictly in **TOON** format (Token-Oriented Object Notation) as defined below.
-**CRITICAL CONSTRAINTS:**
-- Do NOT repeat the input data.
-- START your response IMMEDIATELY with the `<thinking>` tag.
-- **DO NOT use Markdown code blocks.** (Output plain text only).
-- Use strict `Key : Type [ Count ] {{ Headers }} :` format followed by data lines.
-- Strings containing commas MUST be quoted.
-- ALL scores must be filled (use 0 if unsure, do not leave blank).
-- **MODALITY SCORING:** You must provide 3 distinct alignment scores: Video-Audio, Video-Caption, and Audio-Caption.
-**TOON SCHEMA:**
-{toon_schema}
-{score_instructions}
-**RESPONSE:**
-<thinking>
-"""
-# Extra constraint text for the prompt templates.
-# NOTE(review): presumably substituted for the {score_instructions}
-# placeholder, with the REASONING variant paired with SCHEMA_REASONING and the
-# SIMPLE variant with SCHEMA_SIMPLE - confirm against the caller.
-SCORE_INSTRUCTIONS_REASONING = """
-**Constraints:**
-1. Provide specific reasoning for EACH score in the `vectors` and `modalities` tables.
-2. Ensure strings are properly quoted.
-"""
-SCORE_INSTRUCTIONS_SIMPLE = """
-**Constraint:** Focus on objective measurements. Keep text concise.
-"""
-# Compact TOON schema: one data row per block, with all vector scores on a
-# single row and all modality scores on a single row.
-# The braces here are literal TOON header delimiters, not format fields, so
-# this string must only be passed as a *value* to str.format (calling
-# .format() on it directly would fail on "{text}").
-SCHEMA_SIMPLE = """summary: text[1]{text}:
-"Brief neutral summary of the video events"
-vectors: scores[1]{visual,audio,source,logic,emotion}:
-(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10),(Int 1-10)
-*Scale: 1=Fake/Malicious, 10=Authentic/Neutral*
-modalities: scores[1]{video_audio_score,video_caption_score,audio_caption_score}:
-(Int 1-10),(Int 1-10),(Int 1-10)
-*Scale: 1=Mismatch, 10=Perfect Match*
-factuality: factors[1]{accuracy,gap,grounding}:
-(Verified/Misleading/False),"Missing evidence description","Grounding check results"
-disinfo: analysis[1]{class,intent,threat}:
-(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)
-final: assessment[1]{score,reasoning}:
-(Int 1-100),"Final synthesis of why this score was given"
-"""
-# Verbose TOON schema: the `vectors` block has five rows and the `modalities`
-# block three rows, each row being category,score,reasoning.
-# The braces here are literal TOON header delimiters, not format fields, so
-# this string must only be passed as a *value* to str.format.
-SCHEMA_REASONING = """
-summary: text[1]{text}:
-"Brief neutral summary of the video events"
-vectors: details[5]{category,score,reasoning}:
-Visual,(Int 1-10),"Reasoning for visual score"
-Audio,(Int 1-10),"Reasoning for audio score"
-Source,(Int 1-10),"Reasoning for source credibility"
-Logic,(Int 1-10),"Reasoning for logical consistency"
-Emotion,(Int 1-10),"Reasoning for emotional manipulation"
-modalities: details[3]{category,score,reasoning}:
-VideoAudio,(Int 1-10),"Reasoning for video-to-audio alignment"
-VideoCaption,(Int 1-10),"Reasoning for video-to-caption alignment"
-AudioCaption,(Int 1-10),"Reasoning for audio-to-caption alignment"
-factuality: factors[1]{accuracy,gap,grounding}:
-(Verified/Misleading/False),"Missing evidence description","Grounding check results"
-disinfo: analysis[1]{class,intent,threat}:
-(None/Misinfo/Disinfo/Satire),(Political/Commercial/None),(Deepfake/Recontextualization/None)
-final: assessment[1]{score,reasoning}:
-(Int 1-100),"Final synthesis of why this score was given"
-"""
-# Fractal Chain of Thought, stage 1 of 3: asks for a high-level hypothesis.
-# str.format template with placeholders {caption} and {transcript}.
-FCOT_MACRO_PROMPT = """
-**Fractal Chain of Thought - Stage 1: Macro-Scale Hypothesis (Wide Aperture)**
-You are analyzing a video for factuality.
-**Context:** Caption: "{caption}" | Transcript: "{transcript}"
-1. **Global Scan**: Observe the video, audio, and caption as a whole entity.
-2. **Context Aperture**: Wide. Assess the overall intent (Humor, Information, Political, Social) and the setting.
-3. **Macro Hypothesis**: Formulate a high-level hypothesis about the veracity. (e.g., "The video is likely authentic but the caption misrepresents the location" or "The audio quality suggests synthetic generation").
-**Objective**: Maximize **Coverage** (broadly explore potential angles of manipulation).
-**Output**: A concise paragraph summarizing the "Macro Hypothesis".
-"""
-# Fractal Chain of Thought, stage 2 of 3: asks for per-branch verification of
-# the stage-1 hypothesis.
-# str.format template with the single placeholder {macro_hypothesis}.
-FCOT_MESO_PROMPT = """
-**Fractal Chain of Thought - Stage 2: Meso-Scale Expansion (Recursive Verification)**
-**Current Macro Hypothesis**: "{macro_hypothesis}"
-**Action**: Zoom In. Decompose the hypothesis into specific verification branches.
-Perform the following checks recursively:
-1. **Visual Branch**: Look for specific artifacts, lighting inconsistencies, cuts, or deepfake signs.
-2. **Audio Branch**: Analyze lip-sync, background noise consistency, and voice tonality.
-3. **Logical Branch**: Does the visual evidence strictly support the caption's claim? Are there logical fallacies?
-**Dual-Objective Self-Correction**:
-- **Faithfulness**: Do not hallucinate details not present in the video.
-- **Coverage**: Did you miss any subtle cues?
-**Output**: Detailed "Micro-Observations" for each branch. If you find contradictions to the Macro Hypothesis, note them explicitly as **"Self-Correction"**.
-"""
-# Fractal Chain of Thought, stage 3 of 3: asks the model to reconcile the
-# earlier stages and emit the final TOON output.
-# str.format template with placeholders {toon_schema} and {score_instructions}.
-FCOT_SYNTHESIS_PROMPT = """
-**Fractal Chain of Thought - Stage 3: Inter-Scale Consensus & Synthesis**
-**Action**: Integrate your Macro Hypothesis and Micro-Observations.
-- **Consensus Check**: If Micro-Observations contradict the Macro Hypothesis, prioritize the Micro evidence (Self-Correction).
-- **Compression**: Synthesize the findings into the final structured format.
-**Output Format**:
-Strictly fill out the following TOON schema based on the consensus. Do not include markdown code blocks.
-**TOON SCHEMA**:
-{toon_schema}
-{score_instructions}
-"""

src/my_vision_process.py DELETED Viewed

@@ -1,17 +0,0 @@
-# my_vision_process.py (Stub for HF Spaces / Lite Mode)
-import logging
-# Module-level logger, named after this module.
-logger = logging.getLogger(__name__)
-# Dummy client: always None in this stub module.
-client = None
-def process_vision_info(messages, return_video_kwargs=False, client=None):
-    """
-    No-op stand-in that exists so imports succeed in API-only mode.
-    Logs a warning on every call and ignores `messages` and `client`.
-    Returns (None, None), or (None, None, {"fps": [0]}) when
-    `return_video_kwargs` is truthy.
-    """
-    logger.warning("process_vision_info called in LITE/API environment. Returning empty placeholders.")
-    placeholders = (None, None)
-    if not return_video_kwargs:
-        return placeholders
-    # Callers asking for video kwargs get a third element with a zero fps list.
-    return placeholders + ({"fps": [0]},)

src/toon_parser.py DELETED Viewed

@@ -1,220 +0,0 @@
-# toon_parser.py
-import re
-import logging
-import csv
-from io import StringIO
-logger = logging.getLogger(__name__)
-def parse_toon_line(line_def, data_line):
-    """Parse one CSV-style TOON data row into a dict keyed by the block headers.
-    Args:
-        line_def: Mapping describing the block; only ``line_def['headers']``
-            (a list of column names) is read.
-        data_line: Raw text of a single data row.
-    Returns:
-        A dict mapping each header to its cleaned string value. Rows with
-        fewer values than headers are padded with "", rows with more are
-        truncated. Returns {} for blank input or when parsing raises.
-    """
-    if not data_line or data_line.isspace():
-        return {}
-    try:
-        reader = csv.reader(StringIO(data_line), skipinitialspace=True)
-        values = next(reader, [])
-        cleaned_values = []
-        for v in values:
-            v_str = v.strip()
-            # Placeholder-style parentheses such as "(8)" or "(Verified)" are
-            # dropped, but parentheses inside multi-word free text (reasoning,
-            # descriptions) are kept; stripping them blindly mangled prose.
-            bare = v_str.replace('(', '').replace(')', '')
-            inner = v_str[1:-1]
-            is_wrapped = (
-                len(v_str) >= 2
-                and v_str[0] == '(' and v_str[-1] == ')'
-                and '(' not in inner and ')' not in inner
-            )
-            if is_wrapped or len(bare.split()) <= 1:
-                v_str = bare
-            # Reduce "8/10"-style scores to the numerator.
-            if '/' in v_str and any(c.isdigit() for c in v_str):
-                parts = v_str.split('/')
-                if parts[0].strip().isdigit():
-                    v_str = parts[0].strip()
-            cleaned_values.append(v_str)
-        headers = line_def.get('headers', [])
-        # Force the row to exactly len(headers) columns before zipping.
-        if len(cleaned_values) < len(headers):
-            cleaned_values += [""] * (len(headers) - len(cleaned_values))
-        elif len(cleaned_values) > len(headers):
-            cleaned_values = cleaned_values[:len(headers)]
-        return dict(zip(headers, cleaned_values))
-    except Exception as e:
-        logger.error(f"Error parsing TOON line '{data_line}': {e}")
-        return {}
-def fuzzy_extract_scores(text: str) -> dict:
-    """Best-effort regex scan for 0-10 scores that follow known labels.
-    Each label pattern is tried in order; the first pattern that yields a
-    non-'0' digit for a key fixes that key's value. Keys with no hit stay '0'.
-    Returns a dict of string scores for the five vector keys and the three
-    modality keys.
-    """
-    score_keys = (
-        'visual', 'audio', 'source', 'logic', 'emotion',
-        'video_audio', 'video_caption', 'audio_caption',
-    )
-    scores = dict.fromkeys(score_keys, '0')
-    label_patterns = [
-        ('visual', 'visual'),
-        ('visual.*?integrity', 'visual'),
-        ('accuracy', 'visual'),
-        ('audio', 'audio'),
-        ('source', 'source'),
-        ('logic', 'logic'),
-        ('emotion', 'emotion'),
-        (r'video.*?audio', 'video_audio'),
-        (r'video.*?caption', 'video_caption'),
-        (r'audio.*?caption', 'audio_caption')
-    ]
-    for pattern_str, key in label_patterns:
-        # A key that already holds a non-zero score is never overwritten.
-        if scores[key] != '0':
-            continue
-        hit = re.search(fr'(?i){pattern_str}.*?[:=\-\s\(]+(\b10\b|\b\d\b)(?:/10)?', text)
-        if hit is not None:
-            scores[key] = hit.group(1)
-    return scores
-def parse_veracity_toon(text: str) -> dict:
-    """Parse a TOON-formatted model response into a flat veracity result dict.
-    The text is scanned for block headers of the form
-    ``key: type[count]{header1,header2,...}:``; up to ``count`` data rows after
-    each header are parsed with parse_toon_line. The parsed blocks are then
-    mapped onto a fixed result layout. If no non-zero vector score (or no
-    non-zero modality score) was parsed, that group is replaced wholesale by
-    the output of fuzzy_extract_scores.
-    Returns:
-        {} when ``text`` is empty (note: none of the keys below are present in
-        that case). Otherwise a dict with keys 'veracity_vectors',
-        'modalities', 'video_context_summary', 'factuality_factors',
-        'disinformation_analysis', 'final_assessment' and
-        'raw_parsed_structure'. Scores are kept as strings; '0' is the
-        default used when a score was not found.
-    """
-    if not text:
-        return {}
-    # Drop Markdown code fences (with an optional language tag), then any
-    # bare triple-backtick fences that remain.
-    text = re.sub(r'```\w*', '', text)
-    text = re.sub(r'```', '', text)
-    text = text.strip()
-    parsed_sections = {}
-    # Groups: 1 = block key, 2 = optional row count, 3 = comma-separated
-    # headers. The type word and the [count] part are both optional.
-    block_pattern = re.compile(
-        r'([a-zA-Z0-9_]+)\s*:\s*(?:\w+\s*)?(?:\[\s*(\d+)\s*\])?\s*\{\s*(.*?)\s*\}\s*:\s*',
-        re.MULTILINE
-    )
-    matches = list(block_pattern.finditer(text))
-    for i, match in enumerate(matches):
-        key = match.group(1).lower()
-        # A missing [count] is treated as a single-row block.
-        count = int(match.group(2)) if match.group(2) else 1
-        headers_str = match.group(3)
-        headers = [h.strip().lower() for h in headers_str.split(',')]
-        # The block body runs from the end of this header to the start of the
-        # next header (or to the end of the text for the last block).
-        start_idx = match.end()
-        end_idx = matches[i+1].start() if i + 1 < len(matches) else len(text)
-        block_content = text[start_idx:end_idx].strip()
-        lines = [line.strip() for line in block_content.splitlines() if line.strip()]
-        data_items = []
-        # Single-character lines are ignored.
-        valid_lines = [l for l in lines if len(l) > 1]
-        # Only the first `count` usable lines are treated as data rows.
-        for line in valid_lines[:count]:
-            item = parse_toon_line({'key': key, 'headers': headers}, line)
-            data_items.append(item)
-        # Single-row blocks are stored as a dict; everything else (including
-        # a single-row block with no data lines) is stored as a list.
-        if count == 1 and data_items:
-            parsed_sections[key] = data_items[0]
-        else:
-            parsed_sections[key] = data_items
-    # Result skeleton with '0' / empty defaults.
-    flat_result = {
-        'veracity_vectors': {
-            'visual_integrity_score': '0',
-            'audio_integrity_score': '0',
-            'source_credibility_score': '0',
-            'logical_consistency_score': '0',
-            'emotional_manipulation_score': '0'
-        },
-        'modalities': {
-            'video_audio_score': '0',
-            'video_caption_score': '0',
-            'audio_caption_score': '0'
-        },
-        'video_context_summary': '',
-        'factuality_factors': {},
-        'disinformation_analysis': {},
-        'final_assessment': {}
-    }
-    # Set to True once at least one non-empty, non-'0' score has been seen.
-    got_vectors = False
-    got_modalities = False
-    vectors_data = parsed_sections.get('vectors', [])
-    if isinstance(vectors_data, dict):
-        # Single-row layout: one column per vector.
-        v = vectors_data
-        # Values are copied only if at least one of them is non-empty and
-        # not '0'; otherwise the defaults stay and the fallback below runs.
-        if any(val and val != '0' for val in v.values()):
-            if 'visual' in v: flat_result['veracity_vectors']['visual_integrity_score'] = v['visual']
-            if 'audio' in v: flat_result['veracity_vectors']['audio_integrity_score'] = v['audio']
-            if 'source' in v: flat_result['veracity_vectors']['source_credibility_score'] = v['source']
-            if 'logic' in v: flat_result['veracity_vectors']['logical_consistency_score'] = v['logic']
-            if 'emotion' in v: flat_result['veracity_vectors']['emotional_manipulation_score'] = v['emotion']
-            got_vectors = True
-    elif isinstance(vectors_data, list):
-        # Multi-row layout: one row per category, matched by substring.
-        for item in vectors_data:
-            cat = item.get('category', '').lower()
-            score = item.get('score', '0')
-            if score and score != '0':
-                got_vectors = True
-            # The score is assigned even when it is '0' or empty.
-            if 'visual' in cat: flat_result['veracity_vectors']['visual_integrity_score'] = score
-            elif 'audio' in cat: flat_result['veracity_vectors']['audio_integrity_score'] = score
-            elif 'source' in cat: flat_result['veracity_vectors']['source_credibility_score'] = score
-            elif 'logic' in cat: flat_result['veracity_vectors']['logical_consistency_score'] = score
-            elif 'emotion' in cat: flat_result['veracity_vectors']['emotional_manipulation_score'] = score
-    modalities_data = parsed_sections.get('modalities', [])
-    if isinstance(modalities_data, dict):
-        # Single-row layout: header names are normalised (lower-case, with
-        # spaces, hyphens and underscores removed) before matching.
-        m = modalities_data
-        for k, v in m.items():
-            k_clean = k.lower().replace(' ', '').replace('-', '').replace('_', '')
-            if 'videoaudio' in k_clean: flat_result['modalities']['video_audio_score'] = v
-            elif 'videocaption' in k_clean: flat_result['modalities']['video_caption_score'] = v
-            elif 'audiocaption' in k_clean: flat_result['modalities']['audio_caption_score'] = v
-            # Any non-empty, non-'0' value counts, even under an unrecognised header.
-            if v and v != '0': got_modalities = True
-    elif isinstance(modalities_data, list):
-        # Multi-row layout: one row per modality pair, category normalised
-        # the same way as the header names above.
-        for item in modalities_data:
-            cat = item.get('category', '').lower().replace(' ', '').replace('-', '').replace('_', '')
-            score = item.get('score', '0')
-            if score and score != '0':
-                got_modalities = True
-            if 'videoaudio' in cat: flat_result['modalities']['video_audio_score'] = score
-            elif 'videocaption' in cat: flat_result['modalities']['video_caption_score'] = score
-            elif 'audiocaption' in cat: flat_result['modalities']['audio_caption_score'] = score
-    # Fallback: when structured parsing produced no usable scores for a
-    # group, overwrite that whole group with the regex-extracted scores.
-    if not got_vectors or not got_modalities:
-        fuzzy_scores = fuzzy_extract_scores(text)
-        if not got_vectors:
-            flat_result['veracity_vectors']['visual_integrity_score'] = fuzzy_scores['visual']
-            flat_result['veracity_vectors']['audio_integrity_score'] = fuzzy_scores['audio']
-            flat_result['veracity_vectors']['source_credibility_score'] = fuzzy_scores['source']
-            flat_result['veracity_vectors']['logical_consistency_score'] = fuzzy_scores['logic']
-            flat_result['veracity_vectors']['emotional_manipulation_score'] = fuzzy_scores['emotion']
-        if not got_modalities:
-            flat_result['modalities']['video_audio_score'] = fuzzy_scores['video_audio']
-            flat_result['modalities']['video_caption_score'] = fuzzy_scores['video_caption']
-            flat_result['modalities']['audio_caption_score'] = fuzzy_scores['audio_caption']
-    # The remaining blocks may be stored as a dict or a list (see above);
-    # a list is reduced to its first row, or {} when empty.
-    f = parsed_sections.get('factuality', {})
-    if isinstance(f, list): f = f[0] if f else {}
-    flat_result['factuality_factors'] = {
-        'claim_accuracy': f.get('accuracy', 'Unverifiable'),
-        'evidence_gap': f.get('gap', ''),
-        'grounding_check': f.get('grounding', '')
-    }
-    d = parsed_sections.get('disinfo', {})
-    if isinstance(d, list): d = d[0] if d else {}
-    flat_result['disinformation_analysis'] = {
-        'classification': d.get('class', 'None'),
-        'intent': d.get('intent', 'None'),
-        'threat_vector': d.get('threat', 'None')
-    }
-    fn = parsed_sections.get('final', {})
-    if isinstance(fn, list): fn = fn[0] if fn else {}
-    flat_result['final_assessment'] = {
-        'veracity_score_total': fn.get('score', '0'),
-        'reasoning': fn.get('reasoning', '')
-    }
-    s = parsed_sections.get('summary', {})
-    if isinstance(s, list): s = s[0] if s else {}
-    flat_result['video_context_summary'] = s.get('text', '')
-    # Keep the unflattened per-block parse alongside the flat fields.
-    flat_result['raw_parsed_structure'] = parsed_sections
-    return flat_result

src/transcription.py DELETED Viewed

@@ -1,48 +0,0 @@
-import whisper
-import logging
-from pathlib import Path
-import os
-# True only when the LITE_MODE environment variable equals "true"
-# (case-insensitive); unset or any other value means False.
-LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"
-logger = logging.getLogger(__name__)
-# Lazily populated by load_model(); stays None until a model is loaded.
-transcription_model = None
-def load_model():
-    """Load the 'base.en' Whisper model into the module-level cache.
-    Does nothing beyond logging when LITE_MODE is on, and does nothing at all
-    when a model is already cached. A failed load is logged with its
-    traceback and leaves the cache set to None.
-    """
-    global transcription_model
-    if LITE_MODE:
-        logger.info("LITE_MODE is enabled. Skipping Whisper model loading.")
-        return
-    if transcription_model is not None:
-        # Already loaded; nothing to do.
-        return
-    try:
-        logger.info("Loading 'base.en' Whisper model for transcription...")
-        transcription_model = whisper.load_model("base.en")
-        logger.info("Whisper model loaded successfully.")
-    except Exception as e:
-        logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
-        transcription_model = None
-def generate_transcript(audio_path_str: str) -> str:
-    """Transcribe an audio file and write a .vtt file next to it.
-    Args:
-        audio_path_str: Path to the audio file, as a string.
-    Returns:
-        The path of the written .vtt file as a string, or None when no model
-        is loaded or when any step raises (the error is logged).
-    """
-    model = transcription_model
-    if model is None:
-        logger.warning("Transcription model is not available. Cannot generate transcript.")
-        return None
-    try:
-        source = Path(audio_path_str)
-        logger.info(f"Starting transcription for: {source.name}")
-        transcription = model.transcribe(audio_path_str, verbose=False)
-        # Same directory and stem as the audio file, with a .vtt extension.
-        output_file = source.with_suffix('.vtt')
-        from whisper.utils import get_writer
-        write_vtt = get_writer("vtt", str(output_file.parent))
-        write_vtt(transcription, str(source.name))
-        logger.info(f"Transcription complete. VTT file saved to: {output_file}")
-        return str(output_file)
-    except Exception as e:
-        logger.error(f"An error occurred during transcription for {audio_path_str}: {e}", exc_info=True)
-        return None

start.sh DELETED Viewed

@@ -1,23 +0,0 @@
-#!/bin/bash
-# Startup script: launch the Python FastAPI backend in the background, wait a
-# bounded time for it to answer HTTP, then hand this process over to the Go
-# web server.
-# 1. Start Python FastAPI in the background (Internal Port 8001)
-echo "Starting Python Inference Engine..."
-export PYTHONPATH=$PYTHONPATH:/app/src
-# Use --log-level info to see startup issues
-python -m uvicorn src.app:app --host 127.0.0.1 --port 8001 --log-level info &
-# Poll the backend every 2 seconds, giving up after roughly 30 seconds.
-# The counter is called `remaining` rather than `timeout` so it does not
-# shadow the name of the coreutils `timeout` command.
-echo "Waiting for Python backend to initialize..."
-remaining=30
-while ! curl -s http://127.0.0.1:8001/ > /dev/null; do
-    sleep 2
-    remaining=$((remaining-2))
-    if [ "$remaining" -le 0 ]; then
-        # Best effort: report the problem but still start the web server.
-        echo "Python backend failed to start on time. Logs might show why."
-        break
-    fi
-done
-# 2. Start Golang Web Server (Public Port 7860)
-# `exec` replaces this shell with the server process, so signals sent to the
-# script's PID (e.g. SIGTERM on shutdown) reach the server directly instead of
-# stopping at an intermediate bash process that would not forward them.
-echo "Starting Go Web Server..."
-exec /app/vchat-server