GlazedDon0t committed on
Commit
664772d
·
1 Parent(s): 7632cf2
README.md CHANGED
@@ -6,3 +6,5 @@ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
  ---
 
 
 
6
  sdk: docker
7
  pinned: false
8
  ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
frontend/src/App.tsx CHANGED
The diff for this file is too large to render. See raw diff
 
src/agent_logic.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import asyncio
4
+ import nest_asyncio
5
+ import hashlib
6
+ import datetime
7
+ import json
8
+ import re
9
+ import csv
10
+ from pathlib import Path
11
+ from typing import Any, Dict
12
+
13
# Module-level logger for the agent pipeline.
logger = logging.getLogger(__name__)

# Apply nested asyncio if possible: nest_asyncio lets asyncio.run() be called
# from inside an already-running event loop (e.g. Jupyter / uvicorn). Skip
# silently when the current loop implementation cannot be patched.
try:
    nest_asyncio.apply()
except (ValueError, ImportError):
    pass
20
+
21
+ import common_utils
22
+ import inference_logic
23
+
24
+ # --- Tool Definition & Agent Logic ---
25
+
26
def analyze_video_veracity(video_url: str, specific_question: str = "", agent_config: dict = None) -> dict:
    """Tool to analyze video veracity.

    Synchronous entry point for the async analysis pipeline. When no event
    loop is running in this thread, the coroutine is driven directly with
    ``asyncio.run``; when a loop is already running (e.g. inside a server),
    the pipeline is executed on a fresh loop in a worker thread so the
    running loop is neither blocked nor re-entered.

    Args:
        video_url: Link to the video to analyze.
        specific_question: Optional extra context/question forwarded to the agent.
        agent_config: Provider/model configuration; defaults to an empty dict.

    Returns:
        The result dict produced by ``_analyze_video_async`` — either a
        ``{"text", "data"}`` payload or an ``{"error": ...}`` dict.
    """
    if agent_config is None:
        agent_config = {}
    try:
        # asyncio.get_event_loop() is deprecated when no loop is running
        # (3.10+) and no longer creates one in 3.12; probe explicitly instead.
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: safe to drive the coroutine directly.
        return asyncio.run(_analyze_video_async(video_url, specific_question, agent_config))
    # A loop is already running: run the pipeline on its own loop in a
    # worker thread and block until it completes.
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor() as pool:
        return pool.submit(asyncio.run, _analyze_video_async(video_url, specific_question, agent_config)).result()
36
+
37
async def _analyze_video_async(video_url: str, context: str, agent_config: dict) -> dict:
    """Run the complete veracity-analysis pipeline for one video URL.

    Prepares the video assets (download/audio/transcript), streams the
    provider-specific labeling pipeline ("gemini" -> Gemini API, anything
    else -> Vertex AI), then builds a human-readable report, compares the
    result against the manual ground-truth CSV when available, appends a row
    to data/dataset.csv and writes a raw JSON label sidecar to data/labels/.

    Args:
        video_url: Link of the video to analyze.
        context: Free-form user text folded into the system persona.
        agent_config: Provider/model/prompt settings; missing keys fall back
            to environment variables or hard-coded defaults.

    Returns:
        ``{"text": report, "data": parsed_result}`` on success, otherwise
        ``{"error": message}``.
    """
    try:
        # Resolve configuration (explicit agent_config wins over env vars).
        use_search = agent_config.get("use_search", False)
        use_code = agent_config.get("use_code", False)
        provider = agent_config.get("provider", "vertex")
        api_key = agent_config.get("api_key", os.getenv("GEMINI_API_KEY", ""))
        project_id = agent_config.get("project_id", os.getenv("VERTEX_PROJECT_ID", ""))
        location = agent_config.get("location", os.getenv("VERTEX_LOCATION", "us-central1"))
        model_name = agent_config.get("model_name", os.getenv("VERTEX_MODEL_NAME", "gemini-1.5-pro-preview-0409"))
        reasoning_method = agent_config.get("reasoning_method", "cot")
        prompt_template = agent_config.get("prompt_template", "standard")

        # Short time-salted id used for asset filenames and dataset rows.
        request_id = hashlib.md5(f"{video_url}_{datetime.datetime.now()}".encode()).hexdigest()[:10]
        assets = await common_utils.prepare_video_assets(video_url, request_id)

        # We need the prompt instructions; fall back to a generic persona when
        # labeling_logic (or the selected variant) is unavailable.
        try:
            from labeling_logic import PROMPT_VARIANTS
            sel_p = PROMPT_VARIANTS.get(prompt_template, PROMPT_VARIANTS['standard'])
            system_persona_txt = sel_p['instruction']
        except Exception:
            system_persona_txt = "You are a Factuality Agent."

        system_persona = f"You are the LiarMP4 Verifier. Context: {context}\n\nPersona: {system_persona_txt}"

        trans = common_utils.parse_vtt(assets['transcript']) if assets.get('transcript') else "No transcript."

        final_result = None   # parsed_data dict once the pipeline yields it
        raw_toon_text = ""    # raw model output accompanying parsed_data
        pipeline_logs = []    # string progress chunks streamed by the pipeline

        if provider == "gemini":
            if not api_key:
                return {"error": "Gemini API Key missing. Please provide it in the Inference Config."}
            gemini_config = {"api_key": api_key, "model_name": model_name, "max_retries": 3, "use_search": use_search, "use_code": use_code}
            # The pipeline is an async generator: str chunks are progress logs,
            # the dict chunk carrying "parsed_data" is the final result.
            async for chunk in inference_logic.run_gemini_labeling_pipeline(
                video_path=assets.get('video'),
                caption=assets.get('caption', ''),
                transcript=trans,
                gemini_config=gemini_config,
                include_comments=False,
                reasoning_method=reasoning_method,
                system_persona=system_persona,
                request_id=request_id
            ):
                if isinstance(chunk, str):
                    pipeline_logs.append(chunk.strip())
                elif isinstance(chunk, dict) and "parsed_data" in chunk:
                    final_result = chunk["parsed_data"]
                    raw_toon_text = chunk.get("raw_toon", "")
        else:
            # Default branch: Vertex AI.
            if not project_id:
                return {"error": "Vertex Project ID missing. Please provide it in the Inference Config."}
            vertex_config = {
                "project_id": project_id,
                "location": location,
                "model_name": model_name,
                "max_retries": 3,
                "use_search": use_search,
                "use_code": use_code,
                "api_key": api_key
            }
            async for chunk in inference_logic.run_vertex_labeling_pipeline(
                video_path=assets.get('video'),
                caption=assets.get('caption', ''),
                transcript=trans,
                vertex_config=vertex_config,
                include_comments=False,
                reasoning_method=reasoning_method,
                system_persona=system_persona,
                request_id=request_id
            ):
                if isinstance(chunk, str):
                    pipeline_logs.append(chunk.strip())
                elif isinstance(chunk, dict) and "parsed_data" in chunk:
                    final_result = chunk["parsed_data"]
                    raw_toon_text = chunk.get("raw_toon", "")

        if final_result:
            # 1. Compare to GT Database (rows matched on normalized link).
            gt_score = None
            manual_path = Path("data/manual_dataset.csv")
            if manual_path.exists():
                for row in common_utils.robust_read_csv(manual_path):
                    if common_utils.normalize_link(row.get('link', '')) == common_utils.normalize_link(video_url):
                        # NOTE(review): bare except silently ignores a malformed
                        # score; consider narrowing to (TypeError, ValueError).
                        try: gt_score = float(row.get('final_veracity_score', 0))
                        except: pass
                        break

            # 2. Extract Data from the parsed result (defensive .get chains).
            ai_score_val = final_result.get('final_assessment', {}).get('veracity_score_total', 0)
            try: ai_score = float(ai_score_val)
            except: ai_score = 0

            reasoning = final_result.get('final_assessment', {}).get('reasoning', 'No reasoning provided.')

            vec = final_result.get('veracity_vectors', {})
            mod = final_result.get('modalities', {})
            fact = final_result.get('factuality_factors', {})

            # Assemble the human-readable report returned to the caller.
            reply_text = f"[ANALYSIS COMPLETE]\nVideo: {video_url}\n\n"
            reply_text += "--- AGENT PIPELINE LOGS ---\n"
            reply_text += "\n".join([log for log in pipeline_logs if log]) + "\n\n"

            reply_text += f"Final Veracity Score: {ai_score}/100\n"
            reply_text += f"Reasoning: {reasoning}\n\n"

            reply_text += "--- VERACITY VECTORS ---\n"
            reply_text += f"Visual Integrity : {vec.get('visual_integrity_score', 'N/A')}\n"
            reply_text += f"Audio Integrity : {vec.get('audio_integrity_score', 'N/A')}\n"
            reply_text += f"Source Credibility : {vec.get('source_credibility_score', 'N/A')}\n"
            reply_text += f"Logical Consistency : {vec.get('logical_consistency_score', 'N/A')}\n"
            reply_text += f"Emotional Manipulation : {vec.get('emotional_manipulation_score', 'N/A')}\n\n"

            reply_text += "--- MODALITIES ---\n"
            reply_text += f"Video-Audio : {mod.get('video_audio_score', 'N/A')}\n"
            reply_text += f"Video-Caption : {mod.get('video_caption_score', 'N/A')}\n"
            reply_text += f"Audio-Caption : {mod.get('audio_caption_score', 'N/A')}\n"

            reply_text += "\n--- FACTUALITY FACTORS ---\n"
            reply_text += f"Claim Accuracy : {fact.get('claim_accuracy', 'N/A')}\n"
            reply_text += f"Evidence Gap : {fact.get('evidence_gap', 'N/A')}\n"
            reply_text += f"Grounding Check : {fact.get('grounding_check', 'N/A')}\n"

            if gt_score is not None:
                delta = abs(ai_score - gt_score)
                reply_text += f"\n--- GROUND TRUTH COMPARISON ---\n"
                reply_text += f"Verified GT Score : {gt_score}/100\n"
                reply_text += f"AI Generated Score : {ai_score}/100\n"
                reply_text += f"Accuracy Delta : {delta} points\n"

            reply_text += "\n--- RAW TOON OUTPUT ---\n"
            reply_text += f"{raw_toon_text}\n\n"

            config_params_str = json.dumps({"agent_active": True, "use_search": use_search, "use_code": use_code})

            # 3. Save to Dataset properly to track agent config accuracy
            d_path = Path("data/dataset.csv")
            try:
                with open(d_path, 'a', newline='', encoding='utf-8') as f:
                    row = {
                        "id": request_id, "link": video_url, "timestamp": datetime.datetime.now().isoformat(),
                        "caption": assets.get('caption', ''),
                        "final_veracity_score": ai_score,
                        "visual_score": final_result.get('veracity_vectors', {}).get('visual_integrity_score', 0),
                        "audio_score": final_result.get('veracity_vectors', {}).get('audio_integrity_score', 0),
                        "source_score": final_result.get('veracity_vectors', {}).get('source_credibility_score', 0),
                        "logic_score": final_result.get('veracity_vectors', {}).get('logical_consistency_score', 0),
                        "emotion_score": final_result.get('veracity_vectors', {}).get('emotional_manipulation_score', 0),
                        "align_video_audio": final_result.get('modalities', {}).get('video_audio_score', 0),
                        "align_video_caption": final_result.get('modalities', {}).get('video_caption_score', 0),
                        "align_audio_caption": final_result.get('modalities', {}).get('audio_caption_score', 0),
                        "classification": final_result.get('disinformation_analysis', {}).get('classification', 'None'),
                        "reasoning": reasoning,
                        "tags": ",".join(final_result.get('tags', [])),
                        "raw_toon": raw_toon_text,
                        "config_type": "A2A Agent",
                        "config_model": model_name,
                        "config_prompt": prompt_template,
                        "config_reasoning": reasoning_method,
                        "config_params": config_params_str
                    }
                    writer = csv.DictWriter(f, fieldnames=[
                        "id", "link", "timestamp", "caption",
                        "final_veracity_score", "visual_score", "audio_score", "source_score", "logic_score", "emotion_score",
                        "align_video_audio", "align_video_caption", "align_audio_caption",
                        "classification", "reasoning", "tags", "raw_toon",
                        "config_type", "config_model", "config_prompt", "config_reasoning", "config_params"
                    ], extrasaction='ignore')
                    # Only a brand-new/empty file needs the header row.
                    if not d_path.exists() or d_path.stat().st_size == 0: writer.writeheader()
                    writer.writerow(row)
            except Exception as e:
                logger.error(f"Failed writing A2A to dataset: {e}")

            # 4. Save Raw JSON AI-generated file exactly like the ingest queue
            try:
                ts_clean = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                flat_parsed = final_result.copy()
                flat_parsed["raw_toon"] = raw_toon_text
                flat_parsed["meta_info"] = {
                    "id": request_id, "timestamp": datetime.datetime.now().isoformat(), "link": video_url,
                    "prompt_used": "A2A Agent Prompt",
                    "model_selection": provider,
                    "config_type": "GenAI A2A",
                    "config_model": model_name,
                    "config_prompt": prompt_template,
                    "config_reasoning": reasoning_method,
                    "config_params": {"agent_active": True, "use_search": use_search, "use_code": use_code}
                }
                with open(Path(f"data/labels/{request_id}_{ts_clean}.json"), 'w', encoding='utf-8') as f:
                    json.dump(flat_parsed, f, indent=2, ensure_ascii=False)
            except Exception as e:
                logger.error(f"Failed saving A2A raw JSON sidecar: {e}")

            reply_text += f"\n[Pipeline] Successfully parsed context, analyzed factuality, and saved raw AI Label File to Data Manager (Provider: {provider}, Model: {model_name}, Search: {use_search})."

            return {"text": reply_text, "data": final_result}

        return {"error": "Inference yielded no data or credentials missing."}

    except Exception as e:
        logger.error(f"[Tool Error] {e}")
        return {"error": str(e)}
240
+
241
+ # --- Custom A2A App ---
242
def create_a2a_app():
    """Creates a robust Starlette/FastAPI app that implements core A2A JSON-RPC behavior.

    The app exposes one JSON-RPC handler on "/" and "/jsonrpc" that: accepts
    the common agent invocation method names, lets the caller update provider
    credentials conversationally, runs the video pipeline when the input
    contains a URL, and otherwise replies with setup guidance.

    Returns:
        The configured FastAPI application instance.
    """
    from fastapi import FastAPI, Request

    a2a_app = FastAPI(title="LiarMP4 A2A Agent")

    @a2a_app.post("/")
    @a2a_app.post("/jsonrpc")
    async def jsonrpc_handler(request: Request):
        try:
            data = await request.json()
            method = data.get("method", "agent.process")
            params = data.get("params", {})

            # Extract the user text and per-call agent config from the many
            # shapes JSON-RPC clients send params in (dict, list, bare string).
            input_text = ""
            agent_config = {}
            if isinstance(params, dict):
                input_text = params.get("input", params.get("text", params.get("query", params.get("prompt", ""))))
                agent_config = params.get("agent_config", {})
                if not input_text and "url" in params:
                    input_text = params["url"]
            elif isinstance(params, list) and len(params) > 0:
                if isinstance(params[0], dict):
                    input_text = params[0].get("text", params[0].get("input", ""))
                else:
                    input_text = str(params[0])
            elif isinstance(params, str):
                input_text = params

            # Accept an array of standard agentic invocation methods
            accepted_methods = ["agent.process", "agent.generate", "model.generate", "a2a.generate", "a2a.interact", "agent.interact"]

            if method in accepted_methods or not method:

                # Dynamic Setup & Config Management via Agent Conversation
                # NOTE(review): the api-key/project-id branches split the
                # original-cased text on a lowercase marker, so a mixed-case
                # command (e.g. "Set API Key to X") matches low_input but the
                # split extracts the whole message instead of the value —
                # confirm whether that is intended.
                update_config = {}
                low_input = str(input_text).lower()
                if "set provider to " in low_input:
                    val = low_input.split("set provider to ")[-1].strip().split()[0]
                    if val in ["gemini", "vertex"]: update_config["provider"] = val
                if "set api key to " in low_input:
                    val = input_text.split("set api key to ")[-1].strip().split()[0]
                    update_config["api_key"] = val
                if "set project id to " in low_input:
                    val = input_text.split("set project id to ")[-1].strip().split()[0]
                    update_config["project_id"] = val

                if update_config:
                    # Config-only turn: acknowledge and echo the new settings.
                    return {
                        "jsonrpc": "2.0", "id": data.get("id", 1),
                        "result": {
                            "text": f"✅ Agent configuration updated automatically ({', '.join(update_config.keys())}). You can now provide a video link or further instructions.",
                            "update_config": update_config
                        }
                    }

                urls = re.findall(r'(https?://[^\s]+)', str(input_text))

                if urls:
                    # Analyze the first URL found in the message.
                    url = urls[0]
                    logger.info(f"Agent Processing Video URL: {url} | Config: {agent_config}")
                    res = await _analyze_video_async(url, str(input_text), agent_config)

                    if "error" in res:
                        reply = f"Error analyzing video: {res['error']}"
                    else:
                        reply = res.get("text", "Processing finished but no reply generated.")
                else:
                    # Agent Setup Guidance Logic
                    provider = agent_config.get("provider", "vertex")
                    api_key = agent_config.get("api_key", "")
                    project_id = agent_config.get("project_id", "")

                    base_capabilities = (
                        "**Agent Capabilities:**\n"
                        "- Process raw video & audio modalities via A2A\n"
                        "- Fetch & analyze comment sentiment and community context\n"
                        "- Run full Factuality pipeline (FCoT) & Generate Veracity Vectors\n"
                        "- Automatically save raw AI Labeled JSON files & sync to Data Manager\n"
                        "- Verify and compare AI outputs against Ground Truth\n"
                        "- Reprompt dynamically for missing scores or incomplete data\n\n"
                        "**Easy Command:**\n"
                        "Use `Run full pipeline on[URL]` to analyze a video, extract all vectors (source, logic, emotion, etc.), and save aligned files."
                    )

                    # Nudge the user toward whichever credential is missing
                    # for the currently selected provider.
                    if provider == 'vertex' and not project_id:
                        reply = f"Welcome to the LiarMP4 Agent Nexus!\n\nIt looks like you haven't configured **Vertex AI** yet. Please enter your Google Cloud Project ID in the 'Inference Config' panel on the left, or tell me directly: *'set project id to [YOUR_PROJECT]'*.\n\n{base_capabilities}"
                    elif provider == 'gemini' and not api_key:
                        reply = f"👋 Welcome to the LiarMP4 Agent Nexus!\n\nIt looks like you haven't configured **Gemini** yet. Please enter your API Key in the 'Inference Config' panel on the left, or tell me directly: *'set api key to[YOUR_KEY]'*.\n\n{base_capabilities}"
                    else:
                        reply = f"✅ I am the LiarMP4 Verifier, fully configured ({provider.capitalize()}) and ready!\n\n{base_capabilities}"

                return {
                    "jsonrpc": "2.0",
                    "id": data.get("id", 1),
                    "result": {
                        "text": reply,
                        "data": {"status": "success", "agent": "LiarMP4_A2A"}
                    }
                }
            else:
                # Unknown method: standard JSON-RPC "method not found" error.
                logger.warning(f"A2A Agent rejected unknown method: {method}")
                return {
                    "jsonrpc": "2.0",
                    "id": data.get("id", 1),
                    "error": {
                        "code": -32601,
                        "message": f"Method '{method}' not found. Supported: {', '.join(accepted_methods)}"
                    }
                }
        except Exception as e:
            # Any failure (bad JSON body included) maps to JSON-RPC parse error.
            logger.error(f"A2A Parse Error: {e}")
            return {"jsonrpc": "2.0", "id": None, "error": {"code": -32700, "message": "Parse error"}}

    logger.info("✅ A2A Custom Agent App created successfully.")
    return a2a_app
src/benchmarking.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import shutil
4
+ import json
5
+ import math
6
+ from pathlib import Path
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.model_selection import train_test_split
9
+
10
# Lazy import to avoid startup overhead: AutoGluon is optional and heavy, so
# record availability instead of failing at import time.
try:
    from autogluon.tabular import TabularPredictor
    AUTOGLUON_AVAILABLE = True
except ImportError:
    AUTOGLUON_AVAILABLE = False

# CSV of AI-generated labels (written by the inference pipeline).
DATA_AI = Path("data/dataset.csv")
# CSV of manually verified ground-truth labels.
DATA_MANUAL = Path("data/manual_dataset.csv")
19
+
20
def sanitize_for_json(obj):
    """Recursively replace non-finite floats with None so *obj* serializes to valid JSON.

    Dicts and lists are rebuilt with every value sanitized; all other types
    pass through unchanged.
    """
    if isinstance(obj, dict):
        return {key: sanitize_for_json(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [sanitize_for_json(value) for value in obj]
    if isinstance(obj, float):
        # NaN/inf are legal Python floats but not legal JSON numbers.
        return obj if math.isfinite(obj) else None
    return obj
30
+
31
def calculate_tag_accuracy(tags_ai, tags_man):
    """Jaccard similarity between two comma-separated tag strings.

    Tags are compared case-insensitively after stripping whitespace; NaN
    inputs are treated as empty. Two empty tag sets count as a perfect match
    (1.0); one empty set against a non-empty one scores 0.0.
    """
    def to_tag_set(raw):
        if pd.isna(raw):
            raw = ""
        return {part.strip().lower() for part in str(raw).split(',') if part.strip()}

    predicted = to_tag_set(tags_ai)
    reference = to_tag_set(tags_man)
    if not predicted and not reference:
        return 1.0
    if not predicted or not reference:
        return 0.0
    # Jaccard Similarity
    return len(predicted & reference) / len(predicted | reference)
40
+
41
def get_combined_dataset():
    """
    Joins AI predictions with Manual Ground Truth on ID and calculates comprehensive vector differences.

    Returns:
        A merged DataFrame with per-row ``abs_error``, per-vector ``err_*``
        columns, a ``composite_mae`` (mean of the final-score error and all
        available vector errors) and a Jaccard ``tag_accuracy`` — or None
        when either CSV is missing or the merge fails.
    """
    if not DATA_AI.exists() or not DATA_MANUAL.exists():
        return None

    try:
        # Load datasets
        df_ai = pd.read_csv(DATA_AI)
        df_manual = pd.read_csv(DATA_MANUAL)

        # Normalize IDs (Trim spaces, ensure string)
        df_ai['id'] = df_ai['id'].astype(str).str.strip()
        df_manual['id'] = df_manual['id'].astype(str).str.strip()

        # Ground-truth columns to carry over (only those actually present).
        df_manual_cols = ['id', 'final_veracity_score', 'visual_integrity_score', 'audio_integrity_score', 'source_credibility_score', 'logical_consistency_score', 'emotional_manipulation_score', 'video_audio_score', 'video_caption_score', 'audio_caption_score', 'tags', 'classification']

        # Merge on ID (inner join: only rows present in both datasets survive).
        merged = pd.merge(
            df_ai,
            df_manual[[c for c in df_manual_cols if c in df_manual.columns]],
            on='id',
            suffixes=('_ai', '_manual'),
            how='inner'
        )

        # 1. Final Score Error
        merged['final_veracity_score_ai'] = pd.to_numeric(merged['final_veracity_score_ai'], errors='coerce').fillna(0)
        merged['final_veracity_score_manual'] = pd.to_numeric(merged['final_veracity_score_manual'], errors='coerce').fillna(0)
        merged['abs_error'] = (merged['final_veracity_score_ai'] - merged['final_veracity_score_manual']).abs()

        # 2. Sophisticated Vector Calculations: (AI column, manual column) pairs.
        vector_pairs = [
            ('visual_score', 'visual_integrity_score'),
            ('audio_score', 'audio_integrity_score'),
            ('source_score', 'source_credibility_score'),
            ('logic_score', 'logical_consistency_score'),
            ('emotion_score', 'emotional_manipulation_score'),
            ('align_video_audio', 'video_audio_score'),
            ('align_video_caption', 'video_caption_score'),
            ('align_audio_caption', 'audio_caption_score'),
        ]

        error_cols = ['abs_error']
        for ai_c, man_c in vector_pairs:
            if ai_c in merged.columns and man_c in merged.columns:
                # Multiply 1-10 scores by 10 to put them on the same 0-100 scale as final score
                # (missing values default to the midpoint 5 before scaling).
                merged[ai_c] = pd.to_numeric(merged[ai_c], errors='coerce').fillna(5) * 10
                merged[man_c] = pd.to_numeric(merged[man_c], errors='coerce').fillna(5) * 10
                err_c = f"err_{ai_c}"
                merged[err_c] = (merged[ai_c] - merged[man_c]).abs()
                error_cols.append(err_c)

        # Composite MAE represents the mean absolute error across the final score AND all 8 sub-vectors
        merged['composite_mae'] = merged[error_cols].mean(axis=1)

        # 3. Tag Accuracy Calculation (Jaccard similarity of the tag sets).
        merged['tag_accuracy'] = merged.apply(lambda row: calculate_tag_accuracy(row.get('tags_ai', ''), row.get('tags_manual', '')), axis=1)

        return merged
    except Exception as e:
        print(f"Error merging datasets: {e}")
        return None
105
+
106
def format_config_params(params_raw):
    """Parses the config_params JSON string into a readable format for the leaderboard.

    Args:
        params_raw: Either a JSON string (as stored in the CSV), an already
            parsed dict, or NaN/empty for rows written before config tracking.

    Returns:
        A short human-readable summary such as ``"Retries:2 | Context:Yes"``,
        ``"Defaults"`` for missing values, or ``"Legacy/Unknown"`` when the
        payload cannot be parsed.
    """
    if pd.isna(params_raw) or not params_raw:
        return "Defaults"
    try:
        p = json.loads(params_raw) if isinstance(params_raw, str) else params_raw
        reprompts = p.get('reprompts', 0)
        # Older rows stored booleans as the string 'true'; accept both forms.
        comments = "Yes" if p.get('include_comments') == 'true' or p.get('include_comments') is True else "No"
        return f"Retries:{reprompts} | Context:{comments}"
    except (ValueError, TypeError, AttributeError, KeyError):
        # Narrowed from a bare except: only swallow parse/shape errors, not
        # e.g. KeyboardInterrupt or SystemExit.
        return "Legacy/Unknown"
121
+
122
def calculate_benchmarks():
    """Global stats (All AI models vs Ground Truth).

    Aggregates the merged AI/manual dataset into one summary dict: composite
    MAE, binary accuracy at the 50-point veracity threshold, mean tag
    accuracy, and the five most recent matched samples. Returns
    ``{"status": "no_data"}`` when there is nothing to compare.
    """
    df = get_combined_dataset()
    if df is None or len(df) == 0:
        return {"status": "no_data"}

    # Binary Accuracy (Threshold 50): collapse both sides to credible/not.
    df['bin_ai'] = df['final_veracity_score_ai'] >= 50
    df['bin_manual'] = df['final_veracity_score_manual'] >= 50
    hit_rate = (df['bin_ai'] == df['bin_manual']).mean()

    sample_cols = ['id', 'composite_mae', 'final_veracity_score_ai', 'final_veracity_score_manual']
    latest = df.tail(5)[sample_cols].to_dict(orient='records')

    summary = {
        "count": int(len(df)),
        # Composite MAE is exposed as the headline MAE metric.
        "mae": round(df['composite_mae'].mean(), 2),
        "accuracy_percent": round(hit_rate * 100, 1),
        "tag_accuracy_percent": round(df['tag_accuracy'].mean() * 100, 1),
        "recent_samples": latest,
    }
    return sanitize_for_json(summary)
146
+
147
def generate_leaderboard():
    """
    Groups results by Configuration to rank models/prompts using sophisticated distance measurements.

    Returns:
        A JSON-safe list of per-configuration dicts (model, prompt, reasoning,
        params, composite MAE, tag accuracy, binary accuracy, sample count),
        sorted best-first; empty list when no merged data exists.
    """
    merged = get_combined_dataset()
    if merged is None or len(merged) == 0:
        return []

    # Guarantee the grouping columns exist even for legacy rows.
    for col in ['config_model', 'config_prompt', 'config_reasoning', 'config_params']:
        if col not in merged.columns: merged[col] = "Unknown"

    merged = merged.fillna({'config_model': 'Unknown', 'config_prompt': 'Standard', 'config_reasoning': 'None'})

    # Human-readable parameter summary used as part of the group key.
    merged['params_readable'] = merged['config_params'].apply(format_config_params)

    # Binary correctness at the 50-point veracity threshold.
    merged['bin_ai'] = merged['final_veracity_score_ai'] >= 50
    merged['bin_manual'] = merged['final_veracity_score_manual'] >= 50
    merged['is_correct'] = (merged['bin_ai'] == merged['bin_manual']).astype(int)

    def get_fcot_depth(row):
        # Ordinal reasoning depth: FCoT (2) > CoT (1) > none (0).
        # Note 'fcot' is checked first since it also contains 'cot'.
        r = str(row['config_reasoning']).lower()
        if 'fcot' in r: return 2
        elif 'cot' in r: return 1
        return 0
    merged['fcot_depth'] = merged.apply(get_fcot_depth, axis=1)

    # Group By Configuration using Composite MAE and Tag Accuracy
    grouped = merged.groupby(['config_model', 'config_prompt', 'config_reasoning', 'params_readable', 'fcot_depth']).agg(
        comp_mae=('composite_mae', 'mean'),
        tag_accuracy=('tag_accuracy', 'mean'),
        accuracy=('is_correct', 'mean'),
        count=('id', 'count')
    ).reset_index()

    leaderboard = []
    for _, row in grouped.iterrows():
        leaderboard.append({
            "type": "GenAI",
            "model": row['config_model'],
            "prompt": row['config_prompt'],
            "reasoning": row['config_reasoning'],
            "params": row['params_readable'],
            "fcot_depth": int(row['fcot_depth']),
            "comp_mae": round(row['comp_mae'], 2),
            "tag_acc": round(row['tag_accuracy'] * 100, 1),
            "accuracy": round(row['accuracy'] * 100, 1),
            "samples": int(row['count'])
        })

    # Sort: Highest Accuracy, Highest Tag Accuracy, then Lowest Composite MAE
    leaderboard.sort(key=lambda x: (-x['accuracy'], -x['tag_acc'], x['comp_mae']))

    return sanitize_for_json(leaderboard)
200
+
201
def train_predictive_sandbox(features_config: dict):
    """Train a tiny logistic-regression baseline on the manual dataset.

    Predicts whether the verified veracity score is >= 50 from two
    caption-derived features: caption length and clickbait-keyword count.

    Args:
        features_config: Feature-selection options.
            NOTE(review): currently unused — the feature set is hard-coded.

    Returns:
        dict with "status"/"type"/"accuracy"/"message" on success, or
        {"error": ...} when data is missing/insufficient or training fails.
    """
    if not DATA_MANUAL.exists(): return {"error": "No data"}
    df = pd.read_csv(DATA_MANUAL).dropna(subset=['caption', 'final_veracity_score'])
    if len(df) < 5: return {"error": "Not enough data"}

    # Feature engineering: caption length + clickbait keyword hits.
    df['len'] = df['caption'].astype(str).apply(len)
    keywords = ["shocking", "breaking", "watch"]
    df['kw_count'] = df['caption'].astype(str).apply(lambda x: sum(1 for k in keywords if k in x.lower()))
    feat_cols = ['len', 'kw_count']

    # Binary target: veracity >= 50 counts as "credible".
    df['target'] = (pd.to_numeric(df['final_veracity_score'], errors='coerce').fillna(0) >= 50).astype(int)

    try:
        X_train, X_test, y_train, y_test = train_test_split(df[feat_cols], df['target'], test_size=0.3, random_state=42)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        return {
            "status": "success",
            "type": "logistic_regression",
            "accuracy": round(clf.score(X_test, y_test) * 100, 1),
            "message": "Baseline trained on Caption Length + Keywords."
        }
    except Exception as e:
        # e.g. train/test split with a single class, or too few rows.
        return {"error": str(e)}
src/common_utils.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import csv
4
+ import logging
5
+ import datetime
6
+ import subprocess
7
+ import hashlib
8
+ from pathlib import Path
9
+ import yt_dlp
10
+ import transcription
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
def robust_read_csv(file_path: Path):
    """Yield the rows of *file_path* as dicts, tolerating messy input.

    Undecodable bytes are replaced rather than raised, NUL characters are
    stripped out (they break the csv module), and any read error is logged
    and simply ends the iteration instead of propagating. A missing file
    yields nothing.
    """
    if not file_path.exists():
        return
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            without_nuls = (raw.replace('\0', '') for raw in handle)
            for record in csv.DictReader(without_nuls):
                if record:
                    yield record
    except Exception as exc:
        logger.error(f"Error reading CSV {file_path}: {exc}")
        return
28
+
29
+ def extract_tweet_id(url: str) -> str | None:
30
+ if not url: return None
31
+ match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
32
+ if match: return match.group(1)
33
+ return None
34
+
35
def normalize_link(link: str) -> str:
    """Canonicalize a URL for equality comparison.

    Drops the query string, surrounding whitespace and trailing slashes,
    then removes scheme and ``www.`` markers so visually different links to
    the same resource compare equal. Falsy input yields "".

    NOTE(review): ``replace`` removes 'www.' (and scheme substrings)
    anywhere in the string, not just as a prefix — confirm that is
    acceptable for the links being compared.
    """
    if not link:
        return ""
    base = link.split('?')[0].strip().rstrip('/')
    for marker in ('http://', 'https://', 'www.'):
        base = base.replace(marker, '')
    return base
38
+
39
def parse_vtt(file_path: str) -> str:
    """Parses a .vtt subtitle file and returns the clean text content.

    Skips the WEBVTT header, bare cue numbers and timestamp lines, strips
    inline markup tags, and drops immediate duplicate lines (VTT often
    repeats the previous cue). Returns a human-readable message instead of
    raising when the file is missing or unreadable.
    """
    try:
        if not os.path.exists(file_path):
            return "Transcript file not found."

        collected = []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw in handle:
                candidate = raw.strip()
                # Skip blanks, the header, timestamp cues and cue numbers.
                if not candidate or candidate.startswith('WEBVTT') or '-->' in candidate or candidate.isdigit():
                    continue
                # Remove inline tags like <c> or <00:00:01.000>.
                stripped = re.sub(r'<[^>]+>', '', candidate)
                if stripped and (not collected or stripped != collected[-1]):
                    collected.append(stripped)

        return "\n".join(collected) if collected else "No speech found in transcript."
    except Exception as e:
        logger.error(f"Error parsing VTT file {file_path}: {e}")
        return f"Error reading transcript: {e}"
60
+
61
async def prepare_video_assets(link: str, output_id: str) -> dict:
    """Download a video and derive its audio track and transcript.

    Produces ``data/videos/{output_id}.mp4`` / ``.wav`` / ``.vtt`` as
    needed. Download and conversion failures are logged, not raised —
    callers receive None paths when an asset could not be produced.

    Args:
        link: Source video URL (anything yt-dlp understands).
        output_id: Basename used for all generated asset files.

    Returns:
        {"video": path-or-None, "transcript": path-or-None, "caption": str}.
    """
    video_dir = Path("data/videos")
    if not video_dir.exists():
        video_dir.mkdir(parents=True, exist_ok=True)

    video_path = video_dir / f"{output_id}.mp4"
    audio_path = video_dir / f"{output_id}.wav"
    transcript_path = video_dir / f"{output_id}.vtt"

    caption = ""
    video_downloaded = False

    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': str(video_path),
        'quiet': True, 'ignoreerrors': True, 'no_warnings': True, 'skip_download': False
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Probe metadata first so a text-only post still yields a caption.
            info = ydl.extract_info(link, download=False)
            if info:
                caption = info.get('description', '') or info.get('title', '')
                formats = info.get('formats', [])
                if not formats and not info.get('url'):
                    logger.info(f"No video formats found for {link}. Treating as text-only.")
                else:
                    # Only download when we don't already have the file cached.
                    if not video_path.exists(): ydl.download([link])
    except Exception as e:
        logger.error(f"Download error for {link}: {e}")

    if video_path.exists() and video_path.stat().st_size > 0:
        video_downloaded = True
        # Extract mono 16 kHz PCM audio for the transcription model.
        if not audio_path.exists():
            subprocess.run(["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(audio_path)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if audio_path.exists() and not transcript_path.exists():
            # NOTE(review): presumably generate_transcript writes the .vtt
            # alongside the audio file — confirm it lands at transcript_path.
            transcription.load_model()
            transcription.generate_transcript(str(audio_path))

    return {
        "video": str(video_path) if video_downloaded else None,
        "transcript": str(transcript_path) if video_downloaded and transcript_path.exists() else None,
        "caption": caption
    }
src/user_analysis_logic.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import json
4
+ import logging
5
+ import asyncio
6
+ from pathlib import Path
7
+ import inference_logic
8
+
9
+ # Configure Logging
10
+ logger = logging.getLogger(__name__)
11
+
12
+ # --- Prompts for User Analysis ---
13
+
14
# Prompt template for the user-profiling LLM call. Filled via str.format()
# with {username} and {timeline_text}; the literal braces of the JSON schema
# are escaped as {{ }} so format() leaves them intact.
PROMPT_USER_PROFILING = """
You are an Expert Intelligence Analyst specializing in Information Integrity and Social Influence Operations.

**TASK:**
Analyze the following timeline of social media posts from a single user: "@{username}".
Your goal is to construct a "Credibility & Bias Profile" based on their historical behavior.

**INPUT DATA (Recent Posts):**
{timeline_text}

**ANALYSIS REQUIREMENTS:**
1. **Thematic Clusters:** What subjects does this user repeatedly post about? (e.g., "Crypto", "US Politics", "Climate Skepticism").
2. **Echo Chamber Indicators:** Does the user frequently repost specific domains or engage with specific narratives without adding nuance?
3. **Emotional Valence:** Analyze the aggregate emotional tone (Alarmist, Neutral, Aggressive, Satirical).
4. **Bias Detection:** Identify explicit political or ideological biases based on the text.
5. **Credibility Weighting:** Based on the content, assign a "Historical Credibility Score" (0.0 to 1.0).
* 0.0 = High frequency of inflammatory/unverified claims.
* 1.0 = Consistently neutral or verified sourcing.

**OUTPUT FORMAT (Strict JSON):**
{{
"username": "@{username}",
"thematic_clusters": ["Topic A", "Topic B"],
"echo_chamber_detected": boolean,
"bias_assessment": "Description of bias...",
"emotional_valence": "Dominant tone...",
"credibility_score": float,
"summary_profile": "A concise paragraph summarizing the user's role in the information ecosystem."
}}
"""
44
+
45
async def load_user_history(username: str, limit: int = 50) -> str:
    """Read data/profiles/<username>/history.csv and format the most recent
    posts into a plain-text timeline block for the LLM prompt.

    Args:
        username: Profile directory name (without the leading '@').
        limit: Maximum number of most-recent rows to include.

    Returns:
        Newline-joined timeline entries, or "" when the file is missing or
        unreadable.
    """
    csv_path = Path(f"data/profiles/{username}/history.csv")
    if not csv_path.exists():
        return ""

    timeline_entries = []
    try:
        with open(csv_path, 'r', encoding='utf-8', errors='replace') as f:
            reader = csv.DictReader(f)
            # Read all, sort by date descending if needed, but scraper usually does desc
            rows = list(reader)
            # Take latest 'limit' posts
            recent_rows = rows[-limit:]

            for row in recent_rows:
                # Use .get() for every field so a single malformed row cannot
                # raise KeyError and discard the entire timeline (the previous
                # direct indexing aborted the whole read on the first bad row).
                entry = (
                    f"[{row.get('timestamp', '?')}] "
                    f"{'REPOST' if row.get('is_reply') == 'True' else 'POST'}: "
                    f"\"{row.get('text', '')}\" "
                    f"(Likes: {row.get('metric_likes', '0')}, Views: {row.get('metric_views', '0')})"
                )
                timeline_entries.append(entry)
    except Exception as e:
        logger.error(f"Error reading history for {username}: {e}")
        return ""

    return "\n".join(timeline_entries)
75
+
76
async def generate_user_profile_report(username: str):
    """
    Orchestrates the analysis pipeline:
    1. Load History.
    2. Construct Prompt.
    3. Call LLM (using Vertex/Gemini config from environment or default).
    4. Save JSON Report.

    Args:
        username: Profile directory name (without the leading '@').

    Returns:
        The parsed report dict, or {"error": ...} when history is missing
        or the LLM call fails.
    """
    logger.info(f"Starting analysis for user: {username}")

    timeline_text = await load_user_history(username)
    if not timeline_text:
        # No saved report in this case: there is nothing to analyze.
        return {"error": "No history found or empty timeline."}

    # Format Prompt
    prompt = PROMPT_USER_PROFILING.format(username=username, timeline_text=timeline_text)

    report_json = {}

    try:
        # Use Vertex AI when credentials are configured in the environment;
        # otherwise fall back to a mock report (LITE mode).
        project_id = os.getenv("VERTEX_PROJECT_ID")
        location = os.getenv("VERTEX_LOCATION", "us-central1")
        api_key = os.getenv("VERTEX_API_KEY")

        if inference_logic.genai and project_id:
            from google.genai import Client
            from google.genai.types import GenerateContentConfig

            if api_key:
                client = Client(vertexai=True, project=project_id, location=location, api_key=api_key)
            else:
                # No explicit key: rely on ambient application-default credentials.
                client = Client(vertexai=True, project=project_id, location=location)

            response = client.models.generate_content(
                model="gemini-1.5-pro-preview-0409",
                contents=prompt,
                # Request JSON output directly so the reply parses without
                # stripping markdown fences.
                config=GenerateContentConfig(response_mime_type="application/json")
            )
            report_json = json.loads(response.text)

        else:
            # Fallback Mock for Demo/LITE mode. Mirrors the full schema the
            # prompt promises so downstream consumers always see the same keys.
            logger.warning("Vertex AI credentials not found. Generating Mock Analysis.")
            report_json = {
                "username": f"@{username}",
                "thematic_clusters": ["Simulated Topic 1", "Simulated Topic 2"],
                "echo_chamber_detected": False,
                "emotional_valence": "Neutral",
                "bias_assessment": "System running in LITE mode. Configure Vertex AI for real analysis.",
                "credibility_score": 0.5,
                "summary_profile": "Mock profile generated because AI backend is not active."
            }

    except Exception as e:
        logger.error(f"LLM Analysis failed: {e}")
        report_json = {"error": str(e)}

    # Save Report. Create the directory defensively so the write cannot fail
    # if the profile folder was removed after the history was read.
    output_path = Path(f"data/profiles/{username}/analysis_report.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII post text readable in the report.
        json.dump(report_json, f, indent=2, ensure_ascii=False)

    return report_json