Spaces:
Sleeping
Sleeping
Commit ·
664772d
1
Parent(s): 7632cf2
new
Browse files- README.md +2 -0
- frontend/src/App.tsx +0 -0
- src/agent_logic.py +357 -0
- src/benchmarking.py +224 -0
- src/common_utils.py +104 -0
- src/user_analysis_logic.py +147 -0
README.md
CHANGED
|
@@ -6,3 +6,5 @@ colorTo: gray
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
---
|
|
|
|
|
|
|
|
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
+
|
| 10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
frontend/src/App.tsx
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/agent_logic.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
import nest_asyncio
|
| 5 |
+
import hashlib
|
| 6 |
+
import datetime
|
| 7 |
+
import json
|
| 8 |
+
import re
|
| 9 |
+
import csv
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any, Dict
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# Apply nested asyncio if possible so that asyncio.run() can be called even
# when an event loop is already running (e.g. inside Jupyter or a server).
try:
    nest_asyncio.apply()
except (ValueError, ImportError):
    # Best-effort: nest_asyncio may be missing, or incompatible with the
    # active loop implementation (uvloop raises ValueError) — continue without it.
    pass
|
| 20 |
+
|
| 21 |
+
import common_utils
|
| 22 |
+
import inference_logic
|
| 23 |
+
|
| 24 |
+
# --- Tool Definition & Agent Logic ---
|
| 25 |
+
|
| 26 |
+
def analyze_video_veracity(video_url: str, specific_question: str = "", agent_config: dict = None) -> dict:
    """Tool to analyze video veracity.

    Synchronous entry point bridging into the async pipeline. When an event
    loop is already running in this thread, the coroutine is executed via
    ``asyncio.run`` on a worker thread so the caller's loop is never
    re-entered or blocked.

    Args:
        video_url: Link of the video to analyze.
        specific_question: Optional extra context/question forwarded to the agent.
        agent_config: Optional provider/model configuration overrides.

    Returns:
        dict with the analysis payload, or an ``"error"`` key on failure.
    """
    if agent_config is None: agent_config = {}
    # Fix: asyncio.get_event_loop() is deprecated (and raises in newer
    # Python) when no loop is running — detect a running loop explicitly.
    try:
        asyncio.get_running_loop()
        loop_running = True
    except RuntimeError:
        loop_running = False
    coro = _analyze_video_async(video_url, specific_question, agent_config)
    if loop_running:
        import concurrent.futures
        # Dedicated loop on a worker thread; .result() propagates exceptions.
        with concurrent.futures.ThreadPoolExecutor() as pool:
            return pool.submit(asyncio.run, coro).result()
    return asyncio.run(coro)
|
| 36 |
+
|
| 37 |
+
async def _analyze_video_async(video_url: str, context: str, agent_config: dict) -> dict:
    """Run the end-to-end veracity analysis for a single video URL.

    Steps: prepare video assets, select the prompt persona, stream the
    Gemini or Vertex labeling pipeline, compare the AI score against any
    ground-truth entry, append a row to data/dataset.csv, and write a raw
    JSON label sidecar to data/labels/.

    Args:
        video_url: Link of the video to analyze.
        context: Free-form user context/question folded into the system persona.
        agent_config: Provider/model overrides; falls back to environment vars.

    Returns:
        {"text": <human-readable report>, "data": <parsed result>} on success,
        or {"error": <message>} on failure.
    """
    try:
        # --- Resolve configuration (agent_config overrides env defaults) ---
        use_search = agent_config.get("use_search", False)
        use_code = agent_config.get("use_code", False)
        provider = agent_config.get("provider", "vertex")
        api_key = agent_config.get("api_key", os.getenv("GEMINI_API_KEY", ""))
        project_id = agent_config.get("project_id", os.getenv("VERTEX_PROJECT_ID", ""))
        location = agent_config.get("location", os.getenv("VERTEX_LOCATION", "us-central1"))
        model_name = agent_config.get("model_name", os.getenv("VERTEX_MODEL_NAME", "gemini-1.5-pro-preview-0409"))
        reasoning_method = agent_config.get("reasoning_method", "cot")
        prompt_template = agent_config.get("prompt_template", "standard")

        # Short request-scoped id; md5 is fine here (not security-sensitive).
        request_id = hashlib.md5(f"{video_url}_{datetime.datetime.now()}".encode()).hexdigest()[:10]
        assets = await common_utils.prepare_video_assets(video_url, request_id)

        # We need the prompt instructions; labeling_logic is optional.
        try:
            from labeling_logic import PROMPT_VARIANTS
            sel_p = PROMPT_VARIANTS.get(prompt_template, PROMPT_VARIANTS['standard'])
            system_persona_txt = sel_p['instruction']
        except Exception:
            # Fall back to a minimal persona when the module/keys are absent.
            system_persona_txt = "You are a Factuality Agent."

        system_persona = f"You are the LiarMP4 Verifier. Context: {context}\n\nPersona: {system_persona_txt}"

        trans = common_utils.parse_vtt(assets['transcript']) if assets.get('transcript') else "No transcript."

        final_result = None
        raw_toon_text = ""
        pipeline_logs = []

        if provider == "gemini":
            if not api_key:
                return {"error": "Gemini API Key missing. Please provide it in the Inference Config."}
            gemini_config = {"api_key": api_key, "model_name": model_name, "max_retries": 3, "use_search": use_search, "use_code": use_code}
            # The pipeline is an async generator: string chunks are progress
            # logs, a dict with "parsed_data" carries the final payload.
            async for chunk in inference_logic.run_gemini_labeling_pipeline(
                video_path=assets.get('video'),
                caption=assets.get('caption', ''),
                transcript=trans,
                gemini_config=gemini_config,
                include_comments=False,
                reasoning_method=reasoning_method,
                system_persona=system_persona,
                request_id=request_id
            ):
                if isinstance(chunk, str):
                    pipeline_logs.append(chunk.strip())
                elif isinstance(chunk, dict) and "parsed_data" in chunk:
                    final_result = chunk["parsed_data"]
                    raw_toon_text = chunk.get("raw_toon", "")
        else:
            if not project_id:
                return {"error": "Vertex Project ID missing. Please provide it in the Inference Config."}
            vertex_config = {
                "project_id": project_id,
                "location": location,
                "model_name": model_name,
                "max_retries": 3,
                "use_search": use_search,
                "use_code": use_code,
                "api_key": api_key
            }
            async for chunk in inference_logic.run_vertex_labeling_pipeline(
                video_path=assets.get('video'),
                caption=assets.get('caption', ''),
                transcript=trans,
                vertex_config=vertex_config,
                include_comments=False,
                reasoning_method=reasoning_method,
                system_persona=system_persona,
                request_id=request_id
            ):
                if isinstance(chunk, str):
                    pipeline_logs.append(chunk.strip())
                elif isinstance(chunk, dict) and "parsed_data" in chunk:
                    final_result = chunk["parsed_data"]
                    raw_toon_text = chunk.get("raw_toon", "")

        if final_result:
            # 1. Compare to GT Database (first row whose normalized link matches).
            gt_score = None
            manual_path = Path("data/manual_dataset.csv")
            if manual_path.exists():
                for row in common_utils.robust_read_csv(manual_path):
                    if common_utils.normalize_link(row.get('link', '')) == common_utils.normalize_link(video_url):
                        # Fix: narrowed bare `except:` — only conversion errors
                        # are expected/ignorable here.
                        try:
                            gt_score = float(row.get('final_veracity_score', 0))
                        except (TypeError, ValueError):
                            pass
                        break

            # 2. Extract Data from the parsed result.
            ai_score_val = final_result.get('final_assessment', {}).get('veracity_score_total', 0)
            # Fix: narrowed bare `except:` around the float conversion.
            try:
                ai_score = float(ai_score_val)
            except (TypeError, ValueError):
                ai_score = 0

            reasoning = final_result.get('final_assessment', {}).get('reasoning', 'No reasoning provided.')

            vec = final_result.get('veracity_vectors', {})
            mod = final_result.get('modalities', {})
            fact = final_result.get('factuality_factors', {})

            # Assemble the human-readable report.
            reply_text = f"[ANALYSIS COMPLETE]\nVideo: {video_url}\n\n"
            reply_text += "--- AGENT PIPELINE LOGS ---\n"
            reply_text += "\n".join([log for log in pipeline_logs if log]) + "\n\n"

            reply_text += f"Final Veracity Score: {ai_score}/100\n"
            reply_text += f"Reasoning: {reasoning}\n\n"

            reply_text += "--- VERACITY VECTORS ---\n"
            reply_text += f"Visual Integrity : {vec.get('visual_integrity_score', 'N/A')}\n"
            reply_text += f"Audio Integrity : {vec.get('audio_integrity_score', 'N/A')}\n"
            reply_text += f"Source Credibility : {vec.get('source_credibility_score', 'N/A')}\n"
            reply_text += f"Logical Consistency : {vec.get('logical_consistency_score', 'N/A')}\n"
            reply_text += f"Emotional Manipulation : {vec.get('emotional_manipulation_score', 'N/A')}\n\n"

            reply_text += "--- MODALITIES ---\n"
            reply_text += f"Video-Audio : {mod.get('video_audio_score', 'N/A')}\n"
            reply_text += f"Video-Caption : {mod.get('video_caption_score', 'N/A')}\n"
            reply_text += f"Audio-Caption : {mod.get('audio_caption_score', 'N/A')}\n"

            reply_text += "\n--- FACTUALITY FACTORS ---\n"
            reply_text += f"Claim Accuracy : {fact.get('claim_accuracy', 'N/A')}\n"
            reply_text += f"Evidence Gap : {fact.get('evidence_gap', 'N/A')}\n"
            reply_text += f"Grounding Check : {fact.get('grounding_check', 'N/A')}\n"

            if gt_score is not None:
                delta = abs(ai_score - gt_score)
                reply_text += f"\n--- GROUND TRUTH COMPARISON ---\n"
                reply_text += f"Verified GT Score : {gt_score}/100\n"
                reply_text += f"AI Generated Score : {ai_score}/100\n"
                reply_text += f"Accuracy Delta : {delta} points\n"

            reply_text += "\n--- RAW TOON OUTPUT ---\n"
            reply_text += f"{raw_toon_text}\n\n"

            config_params_str = json.dumps({"agent_active": True, "use_search": use_search, "use_code": use_code})

            # 3. Save to Dataset properly to track agent config accuracy.
            d_path = Path("data/dataset.csv")
            try:
                with open(d_path, 'a', newline='', encoding='utf-8') as f:
                    row = {
                        "id": request_id, "link": video_url, "timestamp": datetime.datetime.now().isoformat(),
                        "caption": assets.get('caption', ''),
                        "final_veracity_score": ai_score,
                        "visual_score": final_result.get('veracity_vectors', {}).get('visual_integrity_score', 0),
                        "audio_score": final_result.get('veracity_vectors', {}).get('audio_integrity_score', 0),
                        "source_score": final_result.get('veracity_vectors', {}).get('source_credibility_score', 0),
                        "logic_score": final_result.get('veracity_vectors', {}).get('logical_consistency_score', 0),
                        "emotion_score": final_result.get('veracity_vectors', {}).get('emotional_manipulation_score', 0),
                        "align_video_audio": final_result.get('modalities', {}).get('video_audio_score', 0),
                        "align_video_caption": final_result.get('modalities', {}).get('video_caption_score', 0),
                        "align_audio_caption": final_result.get('modalities', {}).get('audio_caption_score', 0),
                        "classification": final_result.get('disinformation_analysis', {}).get('classification', 'None'),
                        "reasoning": reasoning,
                        "tags": ",".join(final_result.get('tags', [])),
                        "raw_toon": raw_toon_text,
                        "config_type": "A2A Agent",
                        "config_model": model_name,
                        "config_prompt": prompt_template,
                        "config_reasoning": reasoning_method,
                        "config_params": config_params_str
                    }
                    writer = csv.DictWriter(f, fieldnames=[
                        "id", "link", "timestamp", "caption",
                        "final_veracity_score", "visual_score", "audio_score", "source_score", "logic_score", "emotion_score",
                        "align_video_audio", "align_video_caption", "align_audio_caption",
                        "classification", "reasoning", "tags", "raw_toon",
                        "config_type", "config_model", "config_prompt", "config_reasoning", "config_params"
                    ], extrasaction='ignore')
                    # The 'a' open above creates the file, so rely on the
                    # zero-size check to decide whether a header is needed.
                    if not d_path.exists() or d_path.stat().st_size == 0: writer.writeheader()
                    writer.writerow(row)
            except Exception as e:
                logger.error(f"Failed writing A2A to dataset: {e}")

            # 4. Save Raw JSON AI-generated file exactly like the ingest queue.
            try:
                ts_clean = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                flat_parsed = final_result.copy()
                flat_parsed["raw_toon"] = raw_toon_text
                flat_parsed["meta_info"] = {
                    "id": request_id, "timestamp": datetime.datetime.now().isoformat(), "link": video_url,
                    "prompt_used": "A2A Agent Prompt",
                    "model_selection": provider,
                    "config_type": "GenAI A2A",
                    "config_model": model_name,
                    "config_prompt": prompt_template,
                    "config_reasoning": reasoning_method,
                    "config_params": {"agent_active": True, "use_search": use_search, "use_code": use_code}
                }
                with open(Path(f"data/labels/{request_id}_{ts_clean}.json"), 'w', encoding='utf-8') as f:
                    json.dump(flat_parsed, f, indent=2, ensure_ascii=False)
            except Exception as e:
                logger.error(f"Failed saving A2A raw JSON sidecar: {e}")

            reply_text += f"\n[Pipeline] Successfully parsed context, analyzed factuality, and saved raw AI Label File to Data Manager (Provider: {provider}, Model: {model_name}, Search: {use_search})."

            return {"text": reply_text, "data": final_result}

        return {"error": "Inference yielded no data or credentials missing."}

    except Exception as e:
        # Boundary handler: tool callers expect an error dict, never a raise.
        logger.error(f"[Tool Error] {e}")
        return {"error": str(e)}
|
| 240 |
+
|
| 241 |
+
# --- Custom A2A App ---
|
| 242 |
+
def create_a2a_app():
    """Creates a robust Starlette/FastAPI app that implements core A2A JSON-RPC behavior."""
    # Local import keeps fastapi out of module import time for non-server use.
    from fastapi import FastAPI, Request

    a2a_app = FastAPI(title="LiarMP4 A2A Agent")

    # Single handler mounted on both the root and /jsonrpc endpoints.
    @a2a_app.post("/")
    @a2a_app.post("/jsonrpc")
    async def jsonrpc_handler(request: Request):
        try:
            data = await request.json()
            method = data.get("method", "agent.process")
            params = data.get("params", {})

            # Extract the user text and optional agent config from the many
            # shapes clients send params in: dict, positional list, or bare string.
            input_text = ""
            agent_config = {}
            if isinstance(params, dict):
                input_text = params.get("input", params.get("text", params.get("query", params.get("prompt", ""))))
                agent_config = params.get("agent_config", {})
                if not input_text and "url" in params:
                    input_text = params["url"]
            elif isinstance(params, list) and len(params) > 0:
                if isinstance(params[0], dict):
                    input_text = params[0].get("text", params[0].get("input", ""))
                else:
                    input_text = str(params[0])
            elif isinstance(params, str):
                input_text = params

            # Accept an array of standard agentic invocation methods
            accepted_methods = ["agent.process", "agent.generate", "model.generate", "a2a.generate", "a2a.interact", "agent.interact"]

            if method in accepted_methods or not method:

                # Dynamic Setup & Config Management via Agent Conversation:
                # recognize "set provider/api key/project id to X" commands.
                update_config = {}
                low_input = str(input_text).lower()
                if "set provider to " in low_input:
                    val = low_input.split("set provider to ")[-1].strip().split()[0]
                    if val in ["gemini", "vertex"]: update_config["provider"] = val
                if "set api key to " in low_input:
                    # NOTE(review): the phrase is detected on the lowercased text
                    # but split on the original-cased text — a mixed-case command
                    # (e.g. "Set API key to X") matches yet fails to split; verify.
                    val = input_text.split("set api key to ")[-1].strip().split()[0]
                    update_config["api_key"] = val
                if "set project id to " in low_input:
                    val = input_text.split("set project id to ")[-1].strip().split()[0]
                    update_config["project_id"] = val

                if update_config:
                    # Config-change commands short-circuit: no analysis is run.
                    return {
                        "jsonrpc": "2.0", "id": data.get("id", 1),
                        "result": {
                            "text": f"✅ Agent configuration updated automatically ({', '.join(update_config.keys())}). You can now provide a video link or further instructions.",
                            "update_config": update_config
                        }
                    }

                urls = re.findall(r'(https?://[^\s]+)', str(input_text))

                if urls:
                    # Only the first URL found is analyzed.
                    url = urls[0]
                    logger.info(f"Agent Processing Video URL: {url} | Config: {agent_config}")
                    res = await _analyze_video_async(url, str(input_text), agent_config)

                    if "error" in res:
                        reply = f"Error analyzing video: {res['error']}"
                    else:
                        reply = res.get("text", "Processing finished but no reply generated.")
                else:
                    # Agent Setup Guidance Logic: no URL means we answer with
                    # capability/onboarding text tailored to config state.
                    provider = agent_config.get("provider", "vertex")
                    api_key = agent_config.get("api_key", "")
                    project_id = agent_config.get("project_id", "")

                    base_capabilities = (
                        "**Agent Capabilities:**\n"
                        "- Process raw video & audio modalities via A2A\n"
                        "- Fetch & analyze comment sentiment and community context\n"
                        "- Run full Factuality pipeline (FCoT) & Generate Veracity Vectors\n"
                        "- Automatically save raw AI Labeled JSON files & sync to Data Manager\n"
                        "- Verify and compare AI outputs against Ground Truth\n"
                        "- Reprompt dynamically for missing scores or incomplete data\n\n"
                        "**Easy Command:**\n"
                        "Use `Run full pipeline on[URL]` to analyze a video, extract all vectors (source, logic, emotion, etc.), and save aligned files."
                    )

                    if provider == 'vertex' and not project_id:
                        reply = f"Welcome to the LiarMP4 Agent Nexus!\n\nIt looks like you haven't configured **Vertex AI** yet. Please enter your Google Cloud Project ID in the 'Inference Config' panel on the left, or tell me directly: *'set project id to [YOUR_PROJECT]'*.\n\n{base_capabilities}"
                    elif provider == 'gemini' and not api_key:
                        reply = f"👋 Welcome to the LiarMP4 Agent Nexus!\n\nIt looks like you haven't configured **Gemini** yet. Please enter your API Key in the 'Inference Config' panel on the left, or tell me directly: *'set api key to[YOUR_KEY]'*.\n\n{base_capabilities}"
                    else:
                        reply = f"✅ I am the LiarMP4 Verifier, fully configured ({provider.capitalize()}) and ready!\n\n{base_capabilities}"

                return {
                    "jsonrpc": "2.0",
                    "id": data.get("id", 1),
                    "result": {
                        "text": reply,
                        "data": {"status": "success", "agent": "LiarMP4_A2A"}
                    }
                }
            else:
                # Unknown method → standard JSON-RPC "method not found".
                logger.warning(f"A2A Agent rejected unknown method: {method}")
                return {
                    "jsonrpc": "2.0",
                    "id": data.get("id", 1),
                    "error": {
                        "code": -32601,
                        "message": f"Method '{method}' not found. Supported: {', '.join(accepted_methods)}"
                    }
                }
        except Exception as e:
            # Malformed body or unexpected failure → JSON-RPC parse error.
            logger.error(f"A2A Parse Error: {e}")
            return {"jsonrpc": "2.0", "id": None, "error": {"code": -32700, "message": "Parse error"}}

    logger.info("✅ A2A Custom Agent App created successfully.")
    return a2a_app
|
src/benchmarking.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import shutil
|
| 4 |
+
import json
|
| 5 |
+
import math
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from sklearn.linear_model import LogisticRegression
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
|
| 10 |
+
# Lazy import to avoid startup overhead: autogluon is heavy and optional,
# so record its availability instead of failing at import time.
try:
    from autogluon.tabular import TabularPredictor
    AUTOGLUON_AVAILABLE = True
except ImportError:
    AUTOGLUON_AVAILABLE = False

# CSV locations: AI-generated predictions vs. manually labeled ground truth.
DATA_AI = Path("data/dataset.csv")
DATA_MANUAL = Path("data/manual_dataset.csv")
|
| 19 |
+
|
| 20 |
+
def sanitize_for_json(obj):
    """Recursively replace non-finite floats (NaN/±inf) with None.

    Strict JSON has no NaN/Infinity literal, so dicts and lists are rebuilt
    with offending floats mapped to None; all other values pass through.
    """
    if isinstance(obj, float):
        return obj if math.isfinite(obj) else None
    if isinstance(obj, dict):
        return {key: sanitize_for_json(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [sanitize_for_json(value) for value in obj]
    return obj
|
| 30 |
+
|
| 31 |
+
def calculate_tag_accuracy(tags_ai, tags_man):
    """Jaccard similarity between two comma-separated tag strings.

    NaN inputs are treated as empty. Two empty tag sets count as a perfect
    match (1.0); exactly one empty side scores 0.0.
    """
    def _tag_set(raw):
        # Guard against pandas NaN before string conversion.
        if pd.isna(raw):
            return set()
        return {piece.strip().lower() for piece in str(raw).split(',') if piece.strip()}

    ai_tags = _tag_set(tags_ai)
    manual_tags = _tag_set(tags_man)
    if not ai_tags and not manual_tags:
        return 1.0
    if not ai_tags or not manual_tags:
        return 0.0
    # Jaccard Similarity
    return len(ai_tags & manual_tags) / len(ai_tags | manual_tags)
|
| 40 |
+
|
| 41 |
+
def get_combined_dataset():
    """
    Joins AI predictions with Manual Ground Truth on ID and calculates comprehensive vector differences.

    Returns the merged DataFrame with added error columns ('abs_error',
    'err_*', 'composite_mae', 'tag_accuracy'), or None when either CSV is
    missing or the merge fails.
    """
    if not DATA_AI.exists() or not DATA_MANUAL.exists():
        return None

    try:
        # Load datasets
        df_ai = pd.read_csv(DATA_AI)
        df_manual = pd.read_csv(DATA_MANUAL)

        # Normalize IDs (Trim spaces, ensure string)
        df_ai['id'] = df_ai['id'].astype(str).str.strip()
        df_manual['id'] = df_manual['id'].astype(str).str.strip()

        # Only these ground-truth columns participate in the merge.
        df_manual_cols = ['id', 'final_veracity_score', 'visual_integrity_score', 'audio_integrity_score', 'source_credibility_score', 'logical_consistency_score', 'emotional_manipulation_score', 'video_audio_score', 'video_caption_score', 'audio_caption_score', 'tags', 'classification']

        # Merge on ID; overlapping column names get _ai/_manual suffixes.
        merged = pd.merge(
            df_ai,
            df_manual[[c for c in df_manual_cols if c in df_manual.columns]],
            on='id',
            suffixes=('_ai', '_manual'),
            how='inner'
        )

        # 1. Final Score Error (non-numeric values coerced to 0).
        merged['final_veracity_score_ai'] = pd.to_numeric(merged['final_veracity_score_ai'], errors='coerce').fillna(0)
        merged['final_veracity_score_manual'] = pd.to_numeric(merged['final_veracity_score_manual'], errors='coerce').fillna(0)
        merged['abs_error'] = (merged['final_veracity_score_ai'] - merged['final_veracity_score_manual']).abs()

        # 2. Sophisticated Vector Calculations: (AI column, manual column) pairs.
        vector_pairs = [
            ('visual_score', 'visual_integrity_score'),
            ('audio_score', 'audio_integrity_score'),
            ('source_score', 'source_credibility_score'),
            ('logic_score', 'logical_consistency_score'),
            ('emotion_score', 'emotional_manipulation_score'),
            ('align_video_audio', 'video_audio_score'),
            ('align_video_caption', 'video_caption_score'),
            ('align_audio_caption', 'audio_caption_score'),
        ]

        error_cols = ['abs_error']
        for ai_c, man_c in vector_pairs:
            if ai_c in merged.columns and man_c in merged.columns:
                # Multiply 1-10 scores by 10 to put them on the same 0-100 scale as final score
                # (missing values default to the midpoint 5 before scaling).
                merged[ai_c] = pd.to_numeric(merged[ai_c], errors='coerce').fillna(5) * 10
                merged[man_c] = pd.to_numeric(merged[man_c], errors='coerce').fillna(5) * 10
                err_c = f"err_{ai_c}"
                merged[err_c] = (merged[ai_c] - merged[man_c]).abs()
                error_cols.append(err_c)

        # Composite MAE represents the mean absolute error across the final score AND all 8 sub-vectors
        merged['composite_mae'] = merged[error_cols].mean(axis=1)

        # 3. Tag Accuracy Calculation (Jaccard over comma-separated tags).
        merged['tag_accuracy'] = merged.apply(lambda row: calculate_tag_accuracy(row.get('tags_ai', ''), row.get('tags_manual', '')), axis=1)

        return merged
    except Exception as e:
        # Best-effort: malformed CSVs degrade to "no data" rather than crashing.
        print(f"Error merging datasets: {e}")
        return None
|
| 105 |
+
|
| 106 |
+
def format_config_params(params_raw):
    """Parses the config_params JSON string into a readable format for the leaderboard.

    Args:
        params_raw: JSON string, already-parsed dict, or NaN/empty.

    Returns:
        "Defaults" for empty input, "Retries:N | Context:Yes/No" when
        parseable, or "Legacy/Unknown" for malformed payloads.
    """
    if pd.isna(params_raw) or not params_raw:
        return "Defaults"
    try:
        if isinstance(params_raw, str):
            p = json.loads(params_raw)
        else:
            p = params_raw

        reprompts = p.get('reprompts', 0)
        # include_comments may be stored as the string 'true' or a real bool.
        comments = "Yes" if p.get('include_comments') == 'true' or p.get('include_comments') is True else "No"
        return f"Retries:{reprompts} | Context:{comments}"
    # Fix: narrowed the bare `except:` (which also swallowed KeyboardInterrupt)
    # to the failures actually expected: bad JSON (ValueError), non-dict payload
    # without .get (AttributeError), or unhashable/odd types (TypeError).
    except (ValueError, TypeError, AttributeError):
        return "Legacy/Unknown"
|
| 121 |
+
|
| 122 |
+
def calculate_benchmarks():
    """Global stats (All AI models vs Ground Truth).

    Computes the composite MAE, binary accuracy at the 50-point threshold,
    mean tag accuracy, and the five most recent samples, returning a
    JSON-safe dict (or {"status": "no_data"} when nothing is merged).
    """
    joined = get_combined_dataset()
    if joined is None or len(joined) == 0:
        return {"status": "no_data"}

    composite_mae = joined['composite_mae'].mean()
    mean_tag_accuracy = joined['tag_accuracy'].mean()

    # Binary Accuracy (Threshold 50)
    joined['bin_ai'] = joined['final_veracity_score_ai'] >= 50
    joined['bin_manual'] = joined['final_veracity_score_manual'] >= 50
    binary_accuracy = (joined['bin_ai'] == joined['bin_manual']).mean()

    sample_columns = ['id', 'composite_mae', 'final_veracity_score_ai', 'final_veracity_score_manual']
    recent_samples = joined.tail(5)[sample_columns].to_dict(orient='records')

    summary = {
        "count": int(len(joined)),
        # Exposing composite MAE as main MAE metric
        "mae": round(composite_mae, 2),
        "accuracy_percent": round(binary_accuracy * 100, 1),
        "tag_accuracy_percent": round(mean_tag_accuracy * 100, 1),
        "recent_samples": recent_samples,
    }
    return sanitize_for_json(summary)
|
| 146 |
+
|
| 147 |
+
def generate_leaderboard():
    """
    Groups results by Configuration to rank models/prompts using sophisticated distance measurements.

    Returns a JSON-safe list of per-configuration dicts sorted by accuracy,
    tag accuracy, then composite MAE; empty list when no merged data exists.
    """
    merged = get_combined_dataset()
    if merged is None or len(merged) == 0:
        return []

    # Ensure every config column exists so the groupby below cannot KeyError.
    for col in ['config_model', 'config_prompt', 'config_reasoning', 'config_params']:
        if col not in merged.columns: merged[col] = "Unknown"

    merged = merged.fillna({'config_model': 'Unknown', 'config_prompt': 'Standard', 'config_reasoning': 'None'})

    # Human-readable parameter summary used as a grouping key.
    merged['params_readable'] = merged['config_params'].apply(format_config_params)

    # Binary correctness at the 50-point veracity threshold.
    merged['bin_ai'] = merged['final_veracity_score_ai'] >= 50
    merged['bin_manual'] = merged['final_veracity_score_manual'] >= 50
    merged['is_correct'] = (merged['bin_ai'] == merged['bin_manual']).astype(int)

    def get_fcot_depth(row):
        # 2 = fractal CoT, 1 = plain CoT, 0 = no structured reasoning.
        # ('fcot' is checked first since it also contains 'cot'.)
        r = str(row['config_reasoning']).lower()
        if 'fcot' in r: return 2
        elif 'cot' in r: return 1
        return 0
    merged['fcot_depth'] = merged.apply(get_fcot_depth, axis=1)

    # Group By Configuration using Composite MAE and Tag Accuracy
    grouped = merged.groupby(['config_model', 'config_prompt', 'config_reasoning', 'params_readable', 'fcot_depth']).agg(
        comp_mae=('composite_mae', 'mean'),
        tag_accuracy=('tag_accuracy', 'mean'),
        accuracy=('is_correct', 'mean'),
        count=('id', 'count')
    ).reset_index()

    leaderboard = []
    for _, row in grouped.iterrows():
        leaderboard.append({
            "type": "GenAI",
            "model": row['config_model'],
            "prompt": row['config_prompt'],
            "reasoning": row['config_reasoning'],
            "params": row['params_readable'],
            "fcot_depth": int(row['fcot_depth']),
            "comp_mae": round(row['comp_mae'], 2),
            "tag_acc": round(row['tag_accuracy'] * 100, 1),
            "accuracy": round(row['accuracy'] * 100, 1),
            "samples": int(row['count'])
        })

    # Sort: Highest Accuracy, Highest Tag Accuracy, then Lowest Composite MAE
    leaderboard.sort(key=lambda x: (-x['accuracy'], -x['tag_acc'], x['comp_mae']))

    return sanitize_for_json(leaderboard)
|
| 200 |
+
|
| 201 |
+
def train_predictive_sandbox(features_config: dict):
    """Train a quick logistic-regression baseline on manually labelled data.

    Features are derived from the caption text only: its length and how many
    sensationalist keywords it contains. The target is a binary veracity
    label (final_veracity_score >= 50).

    Args:
        features_config: Reserved for future feature toggles; currently
            unused (kept for interface compatibility with callers).

    Returns:
        dict with hold-out accuracy on success, or an "error" key on failure.
    """
    if not DATA_MANUAL.exists():
        return {"error": "No data"}
    df = pd.read_csv(DATA_MANUAL).dropna(subset=['caption', 'final_veracity_score'])
    if len(df) < 5:
        return {"error": "Not enough data"}

    # Vectorized feature construction (equivalent to the old row-wise apply).
    captions = df['caption'].astype(str)
    df['len'] = captions.str.len()
    keywords = ["shocking", "breaking", "watch"]
    lowered = captions.str.lower()
    # Counts how many distinct keywords appear (presence, not occurrence count).
    df['kw_count'] = sum(lowered.str.contains(k, regex=False) for k in keywords)
    feat_cols = ['len', 'kw_count']

    df['target'] = (pd.to_numeric(df['final_veracity_score'], errors='coerce').fillna(0) >= 50).astype(int)

    # LogisticRegression requires both classes to be present; fail with a
    # clear message instead of an opaque sklearn exception string.
    if df['target'].nunique() < 2:
        return {"error": "Need examples of both classes (score >= 50 and < 50) to train."}

    try:
        X_train, X_test, y_train, y_test = train_test_split(df[feat_cols], df['target'], test_size=0.3, random_state=42)
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        return {
            "status": "success",
            "type": "logistic_regression",
            "accuracy": round(clf.score(X_test, y_test) * 100, 1),
            "message": "Baseline trained on Caption Length + Keywords."
        }
    except Exception as e:
        return {"error": str(e)}
|
src/common_utils.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import csv
|
| 4 |
+
import logging
|
| 5 |
+
import datetime
|
| 6 |
+
import subprocess
|
| 7 |
+
import hashlib
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import yt_dlp
|
| 10 |
+
import transcription
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
def robust_read_csv(file_path: Path):
    """Yield the rows of *file_path* as dicts, tolerating messy input.

    NUL bytes are stripped and undecodable bytes are replaced, so partially
    corrupted CSVs still parse. A missing file or a read failure results in
    an empty iteration rather than an exception.
    """
    if not file_path.exists():
        return

    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            sanitized = (raw.replace('\0', '') for raw in handle)
            for record in csv.DictReader(sanitized):
                if record:
                    yield record
    except Exception as e:
        logger.error(f"Error reading CSV {file_path}: {e}")
        return
|
| 29 |
+
def extract_tweet_id(url: str) -> str | None:
|
| 30 |
+
if not url: return None
|
| 31 |
+
match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
|
| 32 |
+
if match: return match.group(1)
|
| 33 |
+
return None
|
| 34 |
+
|
| 35 |
+
def normalize_link(link: str) -> str:
    """Canonicalize a URL for duplicate comparison.

    Drops the query string, surrounding whitespace, trailing slashes, the
    http(s) scheme and a leading "www." prefix.
    """
    if not link:
        return ""
    link = link.split('?')[0].strip().rstrip('/')
    # Strip scheme/"www." only when they are actual prefixes; the previous
    # str.replace() approach also mangled occurrences inside the path
    # (e.g. "example.com/a/www.b" lost its "www.").
    link = link.removeprefix('http://').removeprefix('https://')
    return link.removeprefix('www.')
+
|
| 39 |
+
def parse_vtt(file_path: str) -> str:
    """Parses a .vtt subtitle file and returns the clean text content."""
    try:
        if not os.path.exists(file_path):
            return "Transcript file not found."

        with open(file_path, 'r', encoding='utf-8') as f:
            raw_lines = f.readlines()

        collected = []
        for raw in raw_lines:
            stripped = raw.strip()
            # Skip blanks, the WEBVTT header, cue timing lines and cue numbers.
            if not stripped or stripped.startswith('WEBVTT') or '-->' in stripped or stripped.isdigit():
                continue
            text = re.sub(r'<[^>]+>', '', stripped)  # drop inline markup tags
            # Consecutive duplicate lines are a common rolling-caption artifact.
            if text and (not collected or text != collected[-1]):
                collected.append(text)

        return "\n".join(collected) if collected else "No speech found in transcript."
    except Exception as e:
        logger.error(f"Error parsing VTT file {file_path}: {e}")
        return f"Error reading transcript: {e}"
+
|
| 61 |
+
async def prepare_video_assets(link: str, output_id: str) -> dict:
    """Download a social-media video and derive its audio/transcript files.

    Fetches the media at *link* via yt-dlp into data/videos/<output_id>.mp4,
    extracts a WAV with ffmpeg, and triggers transcription. Posts exposing
    no video formats are treated as text-only (caption returned, no media).

    Returns:
        dict with keys "video" and "transcript" (local paths, or None when
        no video was obtained) and "caption" (description/title, may be "").
    """
    video_dir = Path("data/videos")
    if not video_dir.exists():
        video_dir.mkdir(parents=True, exist_ok=True)

    # All derived assets share the caller-supplied output_id stem.
    video_path = video_dir / f"{output_id}.mp4"
    audio_path = video_dir / f"{output_id}.wav"
    transcript_path = video_dir / f"{output_id}.vtt"

    caption = ""
    video_downloaded = False

    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/mp4',
        'outtmpl': str(video_path),
        'quiet': True, 'ignoreerrors': True, 'no_warnings': True, 'skip_download': False
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Probe metadata first so text-only posts never trigger a download.
            info = ydl.extract_info(link, download=False)
            if info:
                caption = info.get('description', '') or info.get('title', '')
                formats = info.get('formats', [])
                if not formats and not info.get('url'):
                    logger.info(f"No video formats found for {link}. Treating as text-only.")
                else:
                    # Re-use a previously downloaded file when present.
                    if not video_path.exists(): ydl.download([link])
    except Exception as e:
        # Best-effort: a failed download still returns the caption below.
        logger.error(f"Download error for {link}: {e}")

    if video_path.exists() and video_path.stat().st_size > 0:
        video_downloaded = True
        if not audio_path.exists():
            # 16 kHz mono 16-bit PCM — a typical input format for ASR models.
            subprocess.run(["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(audio_path)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if audio_path.exists() and not transcript_path.exists():
            transcription.load_model()
            # NOTE(review): generate_transcript presumably writes the sibling
            # .vtt next to the audio file — confirm in the transcription module.
            transcription.generate_transcript(str(audio_path))

    return {
        "video": str(video_path) if video_downloaded else None,
        "transcript": str(transcript_path) if video_downloaded and transcript_path.exists() else None,
        "caption": caption
    }
|
src/user_analysis_logic.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import csv
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import asyncio
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import inference_logic
|
| 8 |
+
|
| 9 |
+
# Configure Logging
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
# --- Prompts for User Analysis ---
|
| 13 |
+
|
| 14 |
+
PROMPT_USER_PROFILING = """
|
| 15 |
+
You are an Expert Intelligence Analyst specializing in Information Integrity and Social Influence Operations.
|
| 16 |
+
|
| 17 |
+
**TASK:**
|
| 18 |
+
Analyze the following timeline of social media posts from a single user: "@{username}".
|
| 19 |
+
Your goal is to construct a "Credibility & Bias Profile" based on their historical behavior.
|
| 20 |
+
|
| 21 |
+
**INPUT DATA (Recent Posts):**
|
| 22 |
+
{timeline_text}
|
| 23 |
+
|
| 24 |
+
**ANALYSIS REQUIREMENTS:**
|
| 25 |
+
1. **Thematic Clusters:** What subjects does this user repeatedly post about? (e.g., "Crypto", "US Politics", "Climate Skepticism").
|
| 26 |
+
2. **Echo Chamber Indicators:** Does the user frequently repost specific domains or engage with specific narratives without adding nuance?
|
| 27 |
+
3. **Emotional Valence:** Analyze the aggregate emotional tone (Alarmist, Neutral, Aggressive, Satirical).
|
| 28 |
+
4. **Bias Detection:** Identify explicit political or ideological biases based on the text.
|
| 29 |
+
5. **Credibility Weighting:** Based on the content, assign a "Historical Credibility Score" (0.0 to 1.0).
|
| 30 |
+
* 0.0 = High frequency of inflammatory/unverified claims.
|
| 31 |
+
* 1.0 = Consistently neutral or verified sourcing.
|
| 32 |
+
|
| 33 |
+
**OUTPUT FORMAT (Strict JSON):**
|
| 34 |
+
{{
|
| 35 |
+
"username": "@{username}",
|
| 36 |
+
"thematic_clusters": ["Topic A", "Topic B"],
|
| 37 |
+
"echo_chamber_detected": boolean,
|
| 38 |
+
"bias_assessment": "Description of bias...",
|
| 39 |
+
"emotional_valence": "Dominant tone...",
|
| 40 |
+
"credibility_score": float,
|
| 41 |
+
"summary_profile": "A concise paragraph summarizing the user's role in the information ecosystem."
|
| 42 |
+
}}
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
async def load_user_history(username: str, limit: int = 50) -> str:
    """
    Reads the user's history.csv and formats it into a text block for the LLM.

    Returns "" when the file is missing or unreadable.
    """
    history_file = Path(f"data/profiles/{username}/history.csv")
    if not history_file.exists():
        return ""

    formatted = []
    try:
        with open(history_file, 'r', encoding='utf-8', errors='replace') as f:
            all_rows = list(csv.DictReader(f))
            # Keep only the last `limit` rows. NOTE(review): this assumes the
            # scraper writes newest-last; if it writes newest-first, this
            # slice keeps the *oldest* posts — confirm against the scraper.
            for row in all_rows[-limit:]:
                # NOTE(review): the label says REPOST but the column checked
                # is is_reply — verify which semantic the CSV actually carries.
                kind = 'REPOST' if row.get('is_reply') == 'True' else 'POST'
                formatted.append(
                    f"[{row['timestamp']}] "
                    f"{kind}: "
                    f"\"{row['text']}\" "
                    f"(Likes: {row['metric_likes']}, Views: {row['metric_views']})"
                )
    except Exception as e:
        logger.error(f"Error reading history for {username}: {e}")
        return ""

    return "\n".join(formatted)
+
|
| 76 |
+
async def generate_user_profile_report(username: str) -> dict:
    """
    Orchestrates the user-profiling pipeline:
    1. Load the user's scraped post history.
    2. Build the profiling prompt.
    3. Call the LLM (Vertex AI Gemini when configured, mock otherwise).
    4. Persist the JSON report to data/profiles/<username>/analysis_report.json.

    Returns the report dict; on failure the dict carries an "error" key
    (and is still written to disk unless history loading failed first).
    """
    logger.info(f"Starting analysis for user: {username}")

    timeline_text = await load_user_history(username)
    if not timeline_text:
        # Bail out before touching the LLM or the filesystem.
        return {"error": "No history found or empty timeline."}

    # Inject the username and the formatted timeline into the prompt template.
    prompt = PROMPT_USER_PROFILING.format(username=username, timeline_text=timeline_text)

    report_json = {}

    try:
        # Vertex AI configuration comes from the environment; only project_id
        # is mandatory, api_key is optional (ADC is used when absent).
        project_id = os.getenv("VERTEX_PROJECT_ID")
        location = os.getenv("VERTEX_LOCATION", "us-central1")
        api_key = os.getenv("VERTEX_API_KEY")

        # inference_logic.genai acts as an availability flag for the SDK.
        if inference_logic.genai and project_id:
            # Imported lazily so the module loads without google-genai installed.
            from google.genai import Client
            from google.genai.types import GenerateContentConfig

            if api_key:
                client = Client(vertexai=True, project=project_id, location=location, api_key=api_key)
            else:
                client = Client(vertexai=True, project=project_id, location=location)

            # response_mime_type forces the model to emit parseable JSON.
            response = client.models.generate_content(
                model="gemini-1.5-pro-preview-0409",
                contents=prompt,
                config=GenerateContentConfig(response_mime_type="application/json")
            )
            report_text = response.text
            # May raise json.JSONDecodeError -> handled by the except below.
            report_json = json.loads(report_text)

        else:
            # Fallback mock report for demo/LITE mode (no credentials).
            logger.warning("Vertex AI credentials not found. Generating Mock Analysis.")
            report_json = {
                "username": f"@{username}",
                "thematic_clusters": ["Simulated Topic 1", "Simulated Topic 2"],
                "bias_assessment": "System running in LITE mode. Configure Vertex AI for real analysis.",
                "credibility_score": 0.5,
                "summary_profile": "Mock profile generated because AI backend is not active."
            }

    except Exception as e:
        logger.error(f"LLM Analysis failed: {e}")
        report_json = {"error": str(e)}

    # Persist the report (error payloads included) next to the history file.
    # The directory exists here because load_user_history found history.csv.
    output_path = Path(f"data/profiles/{username}/analysis_report.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report_json, f, indent=2)

    return report_json