GlazedDon0t commited on
Commit
1c08c4a
·
1 Parent(s): ba8b51d
Files changed (3) hide show
  1. frontend/src/App.tsx +415 -88
  2. main.go +12 -2
  3. src/app.py +879 -781
frontend/src/App.tsx CHANGED
@@ -5,21 +5,21 @@ import {
5
  StopCircle, RefreshCw, CheckCircle2, PenTool, ClipboardCheck, Info, Clock, FileText,
6
  Tag, Home, Cpu, FlaskConical, Target, Trash2, ArrowUpRight, CheckSquare, Square,
7
  Layers, Activity, Zap, BrainCircuit, Network, Archive, Plus, Edit3, RotateCcw,
8
- Bot, Trophy, HelpCircle, Settings, Calculator
9
  } from 'lucide-react';
10
 
11
  function App() {
12
  const[activeTab, setActiveTab] = useState('home');
13
  const [logs, setLogs] = useState<string>('System Ready.\n');
14
- const [isProcessing, setIsProcessing] = useState(false);
15
  const logContainerRef = useRef<HTMLDivElement>(null);
16
 
17
  // Processing Config State
18
- const[modelProvider, setModelProvider] = useState('nrp');
19
  const [apiKey, setApiKey] = useState('');
20
- const [baseUrl, setBaseUrl] = useState('https://ellm.nrp-nautilus.io/v1'); // NRP Default
21
- const [modelName, setModelName] = useState('qwen3'); // NRP Default
22
- const [projectId, setProjectId] = useState('');
23
  const [location, setLocation] = useState('us-central1');
24
  const [includeComments, setIncludeComments] = useState(false);
25
  const[reasoningMethod, setReasoningMethod] = useState('cot');
@@ -35,6 +35,7 @@ function App() {
35
  // Data States
36
  const [queueList, setQueueList] = useState<any[]>([]);
37
  const[selectedQueueItems, setSelectedQueueItems] = useState<Set<string>>(new Set());
 
38
  const[lastQueueIndex, setLastQueueIndex] = useState<number | null>(null);
39
 
40
  const[singleLinkInput, setSingleLinkInput] = useState('');
@@ -43,10 +44,10 @@ function App() {
43
  const[profilePosts, setProfilePosts] = useState<any[]>([]);
44
  const [communityDatasets, setCommunityDatasets] = useState<any[]>([]);
45
  const [communityAnalysis, setCommunityAnalysis] = useState<any>(null);
46
- const [integrityBoard, setIntegrityBoard] = useState<any[]>([]);
47
 
48
- const [datasetList, setDatasetList] = useState<any[]>([]);
49
- const[selectedItems, setSelectedItems] = useState<Set<string>>(new Set());
50
  const [lastDatasetIndex, setLastDatasetIndex] = useState<number | null>(null);
51
 
52
  const [benchmarks, setBenchmarks] = useState<any>(null);
@@ -79,12 +80,20 @@ function App() {
79
  const [agentConfig, setAgentConfig] = useState({ use_search: true, use_code: false });
80
 
81
  // Resampling configuration
82
- const [resampleCount, setResampleCount] = useState<number>(1);
83
 
84
  // Drag Selection references
85
  const isDraggingQueueRef = useRef(false);
86
  const isDraggingDatasetRef = useRef(false);
87
 
 
 
 
 
 
 
 
 
88
  useEffect(() => {
89
  const handleMouseUp = () => {
90
  isDraggingQueueRef.current = false;
@@ -123,12 +132,16 @@ function App() {
123
 
124
  setSelectedItems(new Set());
125
  setLastDatasetIndex(null);
126
- }, [activeTab, refreshTrigger]);
127
 
128
  useEffect(() => {
129
  if (logContainerRef.current) logContainerRef.current.scrollTop = logContainerRef.current.scrollHeight;
130
  }, [logs]);
131
 
 
 
 
 
132
  useEffect(() => {
133
  if (activeTab === 'agent' && agentMessages.length === 0) {
134
  callAgent(agentMethod, {
@@ -274,6 +287,14 @@ function App() {
274
  setLastQueueIndex(index);
275
  };
276
 
 
 
 
 
 
 
 
 
277
  const promoteSelected = async () => {
278
  if (selectedItems.size === 0) return alert("No items selected.");
279
  if (!confirm(`Promote ${selectedItems.size} items to Ground Truth?`)) return;
@@ -437,6 +458,20 @@ function App() {
437
  } catch(e) { alert("Error adding link"); }
438
  };
439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  const clearProcessed = async () => {
441
  if(!confirm("Remove all 'Processed' items from the queue?")) return;
442
  try {
@@ -527,6 +562,84 @@ function App() {
527
  setRefreshTrigger(p => p+1);
528
  };
529
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  const callAgent = async (method: string, payloadParams: any) => {
531
  return fetch(agentEndpoint, {
532
  method: 'POST',
@@ -602,7 +715,18 @@ function App() {
602
 
603
  return (
604
  <div className="flex h-screen w-full bg-[#09090b] text-slate-200 font-sans overflow-hidden">
605
-
 
 
 
 
 
 
 
 
 
 
 
606
  {/* SIDEBAR */}
607
  <div className="w-[280px] flex flex-col border-r border-slate-800/60 bg-[#0c0c0e]">
608
  <div className="h-16 flex items-center px-6 border-b border-slate-800/60">
@@ -640,6 +764,196 @@ function App() {
640
  {/* HOME TAB */}
641
  {activeTab === 'home' && (
642
  <div className="h-full overflow-y-auto space-y-8 max-w-5xl pr-2">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  <div className="grid grid-cols-3 gap-6">
644
  <div className="col-span-2 bg-slate-900/50 border border-slate-800 rounded-xl p-6">
645
  <h2 className="text-xl font-bold text-white mb-4">Philosophy & Methodology</h2>
@@ -799,35 +1113,29 @@ function App() {
799
  <div className="space-y-1">
800
  <input type="password" value={apiKey} onChange={e => setApiKey(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white placeholder-slate-600" placeholder="NRP API Token"/>
801
  </div>
802
- <div className="space-y-1">
803
- <select value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
804
- <option value="qwen3">qwen3 (Multimodal)</option>
805
- <option value="gpt-oss">gpt-oss</option>
806
- <option value="kimi">kimi</option>
807
- <option value="glm-4.7">glm-4.7</option>
808
- <option value="minimax-m2">minimax-m2</option>
809
- <option value="glm-v">glm-v (Multimodal)</option>
810
- <option value="gemma3">gemma3 (Multimodal)</option>
811
- </select>
812
- </div>
813
  </>
814
  )}
815
-
816
- {modelProvider !== 'nrp' && (
817
- <div className="space-y-1">
818
- <input value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white placeholder-slate-600" placeholder="Model Name"/>
819
- </div>
820
- )}
821
 
822
- <select value={reasoningMethod} onChange={e => setReasoningMethod(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
823
- <option value="cot">Standard Chain of Thought</option>
824
- <option value="fcot">Fractal Chain of Thought</option>
825
- </select>
826
- <select value={promptTemplate} onChange={e => setPromptTemplate(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
827
- {availablePrompts.length > 0 ? availablePrompts.map(p => (
828
- <option key={p.id} value={p.id}>{p.name}</option>
829
- )) : <option value="standard">Standard</option>}
830
- </select>
 
 
 
 
 
 
 
 
 
 
 
831
  </div>
832
  </div>
833
 
@@ -1004,27 +1312,12 @@ function App() {
1004
  </>
1005
  )}
1006
 
1007
- {modelProvider === 'nrp' ? (
1008
- <div className="space-y-1">
1009
- <label className="text-[10px] text-slate-500">Model Name</label>
1010
- <select value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
1011
- <option value="qwen3">qwen3 (Multimodal)</option>
1012
- <option value="gpt-oss">gpt-oss</option>
1013
- <option value="kimi">kimi</option>
1014
- <option value="glm-4.7">glm-4.7</option>
1015
- <option value="minimax-m2">minimax-m2</option>
1016
- <option value="glm-v">glm-v (Multimodal)</option>
1017
- <option value="gemma3">gemma3 (Multimodal)</option>
1018
- </select>
1019
- </div>
1020
- ) : (
1021
- <div className="space-y-1">
1022
- <label className="text-[10px] text-slate-500">Model Name</label>
1023
- <input value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white"/>
1024
- </div>
1025
- )}
1026
-
1027
  <div className="space-y-1">
 
 
 
 
 
1028
  <label className="text-[10px] text-slate-500">Reasoning Method</label>
1029
  <select value={reasoningMethod} onChange={e => setReasoningMethod(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
1030
  <option value="cot">Standard Chain of Thought</option>
@@ -1076,34 +1369,68 @@ function App() {
1076
  </thead>
1077
  <tbody>
1078
  {queueList.map((q, i, arr) => (
1079
- <tr
1080
- key={i}
1081
- className={`border-t border-slate-800/50 hover:bg-white/5 ${selectedQueueItems.has(q.link) ? 'bg-indigo-900/20' : ''}`}
1082
- onMouseDown={(e) => {
1083
- isDraggingQueueRef.current = true;
1084
- toggleQueueSelection(e, q.link, i, arr);
1085
- }}
1086
- onMouseEnter={() => {
1087
- if (isDraggingQueueRef.current && !selectedQueueItems.has(q.link)) {
1088
- setSelectedQueueItems(prev => new Set(prev).add(q.link));
1089
- setLastQueueIndex(i);
1090
- }
1091
- }}
1092
- >
1093
- <td className="p-3 cursor-pointer">
1094
- {selectedQueueItems.has(q.link) ? <CheckSquare className="w-4 h-4 text-indigo-400"/> : <Square className="w-4 h-4 text-slate-600"/>}
1095
- </td>
1096
- <td className="p-3 font-bold text-[10px]">{q.task_type === 'Verify' ? <span className="text-amber-400">VERIFY</span> : <span className="text-slate-500">INGEST</span>}</td>
1097
- <td className="p-3 text-sky-500 font-mono break-all">{q.link}</td>
1098
- <td className="p-3">
1099
- {q.status === 'Processed' ?
1100
- <span className="text-emerald-500 flex items-center gap-1"><CheckCircle2 className="w-3 h-3"/> Done</span> :
1101
- q.status === 'Error' ?
1102
- <span className="text-red-500 flex items-center gap-1"><AlertCircle className="w-3 h-3"/> Error</span> :
1103
- <span className="text-amber-500">Pending</span>
1104
- }
1105
- </td>
1106
- </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1107
  ))}
1108
  </tbody>
1109
  </table>
@@ -1406,7 +1733,7 @@ function App() {
1406
  <span className="capitalize text-slate-300 font-bold">{k.replace('v', 'Video-').replace('a', 'Audio-').replace('c', 'Caption')}</span>
1407
  <span className="text-emerald-400 font-mono font-bold">{(manualScores as any)[k]}/10</span>
1408
  </div>
1409
- <input type="range" min="1" max="10" value={(manualScores as any)[k]} onChange={e => setManualScores({...manualScores, [k]: parseInt(e.target.value)})} className="w-full accent-emerald-500"/>
1410
  </div>
1411
  ))}
1412
  </div>
@@ -1486,7 +1813,7 @@ function App() {
1486
  </div>
1487
  )}
1488
 
1489
- {/* COMMUNITY AND ANALYTICS TABS */}
1490
  {activeTab === 'community' && (
1491
  <div className="flex h-full gap-6">
1492
  <div className="w-1/3 bg-slate-900/50 border border-slate-800 rounded-xl overflow-auto">
 
5
  StopCircle, RefreshCw, CheckCircle2, PenTool, ClipboardCheck, Info, Clock, FileText,
6
  Tag, Home, Cpu, FlaskConical, Target, Trash2, ArrowUpRight, CheckSquare, Square,
7
  Layers, Activity, Zap, BrainCircuit, Network, Archive, Plus, Edit3, RotateCcw,
8
+ Bot, Trophy, HelpCircle, Settings, Calculator, ChevronDown, ChevronUp
9
  } from 'lucide-react';
10
 
11
  function App() {
12
  const[activeTab, setActiveTab] = useState('home');
13
  const [logs, setLogs] = useState<string>('System Ready.\n');
14
+ const[isProcessing, setIsProcessing] = useState(false);
15
  const logContainerRef = useRef<HTMLDivElement>(null);
16
 
17
  // Processing Config State
18
+ const [modelProvider, setModelProvider] = useState('nrp');
19
  const [apiKey, setApiKey] = useState('');
20
+ const[baseUrl, setBaseUrl] = useState('https://ellm.nrp-nautilus.io/v1'); // NRP Default
21
+ const[modelName, setModelName] = useState('qwen3'); // Default
22
+ const[projectId, setProjectId] = useState('');
23
  const [location, setLocation] = useState('us-central1');
24
  const [includeComments, setIncludeComments] = useState(false);
25
  const[reasoningMethod, setReasoningMethod] = useState('cot');
 
35
  // Data States
36
  const [queueList, setQueueList] = useState<any[]>([]);
37
  const[selectedQueueItems, setSelectedQueueItems] = useState<Set<string>>(new Set());
38
+ const[expandedQueueItems, setExpandedQueueItems] = useState<Set<string>>(new Set());
39
  const[lastQueueIndex, setLastQueueIndex] = useState<number | null>(null);
40
 
41
  const[singleLinkInput, setSingleLinkInput] = useState('');
 
44
  const[profilePosts, setProfilePosts] = useState<any[]>([]);
45
  const [communityDatasets, setCommunityDatasets] = useState<any[]>([]);
46
  const [communityAnalysis, setCommunityAnalysis] = useState<any>(null);
47
+ const[integrityBoard, setIntegrityBoard] = useState<any[]>([]);
48
 
49
+ const[datasetList, setDatasetList] = useState<any[]>([]);
50
+ const [selectedItems, setSelectedItems] = useState<Set<string>>(new Set());
51
  const [lastDatasetIndex, setLastDatasetIndex] = useState<number | null>(null);
52
 
53
  const [benchmarks, setBenchmarks] = useState<any>(null);
 
80
  const [agentConfig, setAgentConfig] = useState({ use_search: true, use_code: false });
81
 
82
  // Resampling configuration
83
+ const[resampleCount, setResampleCount] = useState<number>(1);
84
 
85
  // Drag Selection references
86
  const isDraggingQueueRef = useRef(false);
87
  const isDraggingDatasetRef = useRef(false);
88
 
89
+ // Quick Demo State
90
+ const[demoLink, setDemoLink] = useState('');
91
+ const [demoLogs, setDemoLogs] = useState('');
92
+ const [demoIsProcessing, setDemoIsProcessing] = useState(false);
93
+ const[demoResult, setDemoResult] = useState<any>(null);
94
+ const[showDemoConfig, setShowDemoConfig] = useState(false);
95
+ const demoLogContainerRef = useRef<HTMLDivElement>(null);
96
+
97
  useEffect(() => {
98
  const handleMouseUp = () => {
99
  isDraggingQueueRef.current = false;
 
132
 
133
  setSelectedItems(new Set());
134
  setLastDatasetIndex(null);
135
+ },[activeTab, refreshTrigger]);
136
 
137
  useEffect(() => {
138
  if (logContainerRef.current) logContainerRef.current.scrollTop = logContainerRef.current.scrollHeight;
139
  }, [logs]);
140
 
141
+ useEffect(() => {
142
+ if (demoLogContainerRef.current) demoLogContainerRef.current.scrollTop = demoLogContainerRef.current.scrollHeight;
143
+ },[demoLogs]);
144
+
145
  useEffect(() => {
146
  if (activeTab === 'agent' && agentMessages.length === 0) {
147
  callAgent(agentMethod, {
 
287
  setLastQueueIndex(index);
288
  };
289
 
290
+ const toggleQueueExpand = (e: React.MouseEvent, link: string) => {
291
+ e.stopPropagation();
292
+ const newSet = new Set(expandedQueueItems);
293
+ if (newSet.has(link)) newSet.delete(link);
294
+ else newSet.add(link);
295
+ setExpandedQueueItems(newSet);
296
+ };
297
+
298
  const promoteSelected = async () => {
299
  if (selectedItems.size === 0) return alert("No items selected.");
300
  if (!confirm(`Promote ${selectedItems.size} items to Ground Truth?`)) return;
 
458
  } catch(e) { alert("Error adding link"); }
459
  };
460
 
461
+ const addSingleLinkDirect = async (link: string) => {
462
+ if(!link) return;
463
+ try {
464
+ const res = await fetch('/queue/add', {
465
+ method: 'POST', headers: {'Content-Type': 'application/json'},
466
+ body: JSON.stringify({ link: link })
467
+ });
468
+ const d = await res.json();
469
+ if(d.status === 'success') {
470
+ setRefreshTrigger(p => p+1);
471
+ } else { alert(d.message); }
472
+ } catch(e) { alert("Error adding link"); }
473
+ };
474
+
475
  const clearProcessed = async () => {
476
  if(!confirm("Remove all 'Processed' items from the queue?")) return;
477
  try {
 
562
  setRefreshTrigger(p => p+1);
563
  };
564
 
565
+ const runDemo = async () => {
566
+ if (!demoLink) return;
567
+ setDemoIsProcessing(true);
568
+ setDemoLogs('[SYSTEM] Preparing pipeline for single execution...\n');
569
+ setDemoResult(null);
570
+
571
+ try {
572
+ // 1. Ensure the link is added and forcefully requeued so it is 'Pending'
573
+ await fetch('/queue/add', {
574
+ method: 'POST', headers: {'Content-Type': 'application/json'},
575
+ body: JSON.stringify({ link: demoLink })
576
+ });
577
+ await fetch('/queue/requeue', {
578
+ method: 'POST', headers: {'Content-Type': 'application/json'},
579
+ body: JSON.stringify({ links: [demoLink] })
580
+ });
581
+
582
+ // 2. Setup FormData exactly like normal queue
583
+ const fd = new FormData();
584
+ const activeProvider = modelProvider === 'gcloud' ? 'vertex' : modelProvider;
585
+ fd.append('model_selection', activeProvider);
586
+ fd.append('gemini_api_key', apiKey);
587
+ fd.append('gemini_model_name', modelName);
588
+ fd.append('vertex_project_id', projectId);
589
+ fd.append('vertex_location', location);
590
+ fd.append('vertex_model_name', modelName);
591
+ fd.append('vertex_api_key', apiKey);
592
+ fd.append('nrp_api_key', apiKey);
593
+ fd.append('nrp_model_name', modelName);
594
+ fd.append('nrp_base_url', baseUrl);
595
+ fd.append('include_comments', includeComments.toString());
596
+ fd.append('reasoning_method', reasoningMethod);
597
+ fd.append('prompt_template', promptTemplate);
598
+ fd.append('custom_query', customQuery);
599
+ fd.append('max_reprompts', maxRetries.toString());
600
+
601
+ setDemoLogs(prev => prev + '[SYSTEM] Sending analysis payload to model server...\n');
602
+
603
+ // 3. Read Stream
604
+ const runRes = await fetch('/queue/run', { method: 'POST', body: fd });
605
+ const reader = runRes.body!.pipeThrough(new TextDecoderStream()).getReader();
606
+
607
+ while (true) {
608
+ const { value, done } = await reader.read();
609
+ if (done) break;
610
+ if (value.includes('event: close')) break;
611
+ const clean = value.replace(/data: /g, '').trim();
612
+ if (clean) setDemoLogs(prev => prev + clean + '\n');
613
+ }
614
+
615
+ // 4. Look up result from dataset
616
+ setDemoLogs(prev => prev + '\n[SYSTEM] Fetching structured object result...\n');
617
+ const dsRes = await fetch('/dataset/list');
618
+ const dsList = await dsRes.json();
619
+
620
+ const normalize = (l: string) => l.split('?')[0].replace('https://', '').replace('http://', '').replace('www.', '').replace(/\/$/, '');
621
+ const targetLink = normalize(demoLink);
622
+
623
+ // Sort descending to ensure we get the latest processed run
624
+ dsList.sort((a: any, b: any) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime());
625
+
626
+ const match = dsList.find((d: any) => normalize(d.link || '') === targetLink && d.source !== 'Manual');
627
+
628
+ if (match) {
629
+ setDemoResult(match);
630
+ setDemoLogs(prev => prev + '[SYSTEM] Result rendered successfully.\n');
631
+ } else {
632
+ setDemoLogs(prev => prev + '[SYSTEM] Error: Could not find parsed result. Processing might have failed.\n');
633
+ }
634
+
635
+ } catch (err: any) {
636
+ setDemoLogs(prev => prev + `\n[ERROR] ${err.message}\n`);
637
+ } finally {
638
+ setDemoIsProcessing(false);
639
+ setRefreshTrigger(p => p+1);
640
+ }
641
+ }
642
+
643
  const callAgent = async (method: string, payloadParams: any) => {
644
  return fetch(agentEndpoint, {
645
  method: 'POST',
 
715
 
716
  return (
717
  <div className="flex h-screen w-full bg-[#09090b] text-slate-200 font-sans overflow-hidden">
718
+ <datalist id="modelSuggestions">
719
+ <option value="gemini-1.5-pro-preview-0409" />
720
+ <option value="gemini-2.0-flash-exp" />
721
+ <option value="qwen3" />
722
+ <option value="gpt-oss" />
723
+ <option value="kimi" />
724
+ <option value="glm-4.7" />
725
+ <option value="minimax-m2" />
726
+ <option value="glm-v" />
727
+ <option value="gemma3" />
728
+ </datalist>
729
+
730
  {/* SIDEBAR */}
731
  <div className="w-[280px] flex flex-col border-r border-slate-800/60 bg-[#0c0c0e]">
732
  <div className="h-16 flex items-center px-6 border-b border-slate-800/60">
 
764
  {/* HOME TAB */}
765
  {activeTab === 'home' && (
766
  <div className="h-full overflow-y-auto space-y-8 max-w-5xl pr-2">
767
+
768
+ {/* QUICK DEMO SECTION */}
769
+ <div className="bg-slate-900/50 border border-slate-800 rounded-xl p-6 shadow-sm">
770
+ <div className="flex justify-between items-start mb-4">
771
+ <div>
772
+ <h2 className="text-xl font-bold text-white flex items-center gap-2">
773
+ <Zap className="w-5 h-5 text-amber-400" /> Try LiarMP4 - Quick Demo
774
+ </h2>
775
+ <p className="text-sm text-slate-400 mt-1">
776
+ Test the Multimodal Factuality Pipeline on a single video URL.
777
+ </p>
778
+ </div>
779
+ <button
780
+ onClick={() => setShowDemoConfig(!showDemoConfig)}
781
+ className="text-xs font-bold text-slate-500 hover:text-slate-300 flex items-center gap-1 bg-slate-950 px-3 py-1.5 rounded border border-slate-800"
782
+ >
783
+ <Settings className="w-4 h-4" />
784
+ {showDemoConfig ? "Hide Config" : "Show Config"}
785
+ </button>
786
+ </div>
787
+
788
+ {showDemoConfig && (
789
+ <div className="bg-slate-950 p-4 rounded-lg border border-slate-800 mb-4 grid grid-cols-2 gap-6 animate-in fade-in zoom-in-95 duration-200">
790
+ <div className="space-y-3">
791
+ <label className="text-[10px] text-slate-500 uppercase font-bold block border-b border-slate-800 pb-1">LLM Provider Config</label>
792
+ <select value={modelProvider} onChange={e => setModelProvider(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white">
793
+ <option value="vertex">Vertex AI (Enterprise)</option>
794
+ <option value="gemini">Gemini API (Public)</option>
795
+ <option value="gcloud">Google Cloud (Project + API Key)</option>
796
+ <option value="nrp">NRP (Nautilus Envoy Gateway)</option>
797
+ </select>
798
+
799
+ {modelProvider === 'nrp' && (
800
+ <>
801
+ <input value={baseUrl} onChange={e => setBaseUrl(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="Base URL"/>
802
+ <input type="password" value={apiKey} onChange={e => setApiKey(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="API Token"/>
803
+ </>
804
+ )}
805
+ {modelProvider === 'gemini' && (
806
+ <input type="password" value={apiKey} onChange={e => setApiKey(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="API Key"/>
807
+ )}
808
+ {(modelProvider === 'vertex' || modelProvider === 'gcloud') && (
809
+ <>
810
+ <input value={projectId} onChange={e => setProjectId(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="Project ID"/>
811
+ <input value={location} onChange={e => setLocation(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="Location"/>
812
+ {modelProvider === 'gcloud' && <input type="password" value={apiKey} onChange={e => setApiKey(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="API Key"/>}
813
+ </>
814
+ )}
815
+ <input list="modelSuggestions" value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white" placeholder="Model Name (e.g. gemini-1.5-pro)"/>
816
+ </div>
817
+ <div className="space-y-3">
818
+ <label className="text-[10px] text-slate-500 uppercase font-bold block border-b border-slate-800 pb-1">Inference Strategy</label>
819
+ <select value={reasoningMethod} onChange={e => setReasoningMethod(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white">
820
+ <option value="cot">Standard Chain of Thought</option>
821
+ <option value="fcot">Fractal Chain of Thought</option>
822
+ </select>
823
+ <select value={promptTemplate} onChange={e => setPromptTemplate(e.target.value)} className="w-full bg-slate-900 border border-slate-700 rounded p-2 text-xs text-white">
824
+ {availablePrompts.length > 0 ? availablePrompts.map(p => (
825
+ <option key={p.id} value={p.id}>{p.name}</option>
826
+ )) : <option value="standard">Standard</option>}
827
+ </select>
828
+ </div>
829
+ </div>
830
+ )}
831
+
832
+ <div className="flex gap-2 mb-4">
833
+ <input
834
+ type="text"
835
+ value={demoLink}
836
+ onChange={(e) => setDemoLink(e.target.value)}
837
+ placeholder="Enter X/Twitter Video URL here..."
838
+ className="flex-1 bg-slate-950 border border-slate-700 rounded-lg p-3 text-sm text-white focus:outline-none focus:border-indigo-500"
839
+ disabled={demoIsProcessing}
840
+ />
841
+ <button
842
+ onClick={runDemo}
843
+ disabled={demoIsProcessing || !demoLink}
844
+ className="bg-indigo-600 hover:bg-indigo-500 disabled:bg-slate-700 text-white px-6 py-3 rounded-lg font-bold flex items-center gap-2 transition"
845
+ >
846
+ {demoIsProcessing ? <RefreshCw className="w-4 h-4 animate-spin" /> : <Play className="w-4 h-4 fill-white" />}
847
+ {demoIsProcessing ? "Processing..." : "Analyze"}
848
+ </button>
849
+ </div>
850
+
851
+ {(demoIsProcessing || demoLogs) && !demoResult && (
852
+ <div className="bg-black border border-slate-800 rounded-lg p-4 font-mono text-[10px] text-emerald-500 h-64 overflow-y-auto whitespace-pre-wrap animate-in fade-in duration-300" ref={demoLogContainerRef}>
853
+ {demoLogs || "Initializing pipeline..."}
854
+ </div>
855
+ )}
856
+
857
+ {demoResult && (
858
+ <div className="mt-6 border-t border-slate-800 pt-6 animate-in fade-in slide-in-from-bottom-4 duration-500">
859
+ <div className="flex justify-between items-start mb-6">
860
+ <div>
861
+ <h3 className="text-xl font-bold text-white flex items-center gap-2">
862
+ <ShieldCheck className="w-6 h-6 text-emerald-400" />
863
+ <span className="text-indigo-400 uppercase tracking-wider text-[16px] bg-indigo-500/10 px-2 py-0.5 rounded border border-indigo-500/20">[{demoResult.config_model || 'AI'}]
864
+ </span>
865
+ Analysis Complete
866
+ </h3>
867
+ <div className="text-xs text-slate-400 mt-2 font-mono">
868
+ ID: {demoResult.id} | Prompt: {demoResult.config_prompt} | Reasoning: {demoResult.config_reasoning}
869
+ </div>
870
+ </div>
871
+ <div className="bg-slate-950 border border-slate-800 px-6 py-2 rounded-lg text-center">
872
+ <div className="text-[10px] uppercase text-slate-500 font-bold mb-1">Final Score</div>
873
+ <div className={`text-4xl font-bold font-mono ${demoResult.final_veracity_score < 50 ? 'text-red-400' : 'text-emerald-400'}`}>
874
+ {demoResult.final_veracity_score}
875
+ </div>
876
+ </div>
877
+ </div>
878
+
879
+ <div className="grid grid-cols-3 gap-6 mb-6">
880
+ <div className="col-span-2 space-y-4">
881
+ <div className="bg-slate-950 p-4 rounded-lg border border-slate-800">
882
+ <div className="text-xs font-bold text-indigo-400 uppercase mb-2">Video Context</div>
883
+ <div className="text-sm text-slate-300 italic leading-relaxed">"{demoResult.caption || 'No specific context found'}"</div>
884
+ </div>
885
+
886
+ <div className="bg-slate-950 p-4 rounded-lg border border-slate-800">
887
+ <div className="text-xs font-bold text-amber-400 uppercase mb-2">AI Reasoning</div>
888
+ <div className="text-sm text-slate-300 whitespace-pre-wrap leading-relaxed">{demoResult.reasoning || 'No reasoning provided'}</div>
889
+ </div>
890
+
891
+ {demoResult.tags && (
892
+ <div className="flex flex-wrap gap-2">
893
+ {demoResult.tags.split(',').map((t: string) => (
894
+ <span key={t} className="px-2 py-1 bg-slate-800 border border-slate-700 rounded text-xs text-slate-400 font-mono">{t.trim()}</span>
895
+ ))}
896
+ </div>
897
+ )}
898
+ </div>
899
+
900
+ <div className="space-y-4">
901
+ <div className="bg-slate-950 p-4 rounded-lg border border-slate-800">
902
+ <div className="text-xs font-bold text-sky-400 uppercase mb-3">Veracity Vectors</div>
903
+ <div className="space-y-2">
904
+ {[
905
+ { label: 'Visual Integrity', score: parseFloat(demoResult.visual_score) || 0 },
906
+ { label: 'Audio Integrity', score: parseFloat(demoResult.audio_score) || 0 },
907
+ { label: 'Source Credibility', score: parseFloat(demoResult.source_score) || 0 },
908
+ { label: 'Logical Consistency', score: parseFloat(demoResult.logic_score) || 0 },
909
+ { label: 'Emotional Manip.', score: parseFloat(demoResult.emotion_score) || 0 },
910
+ ].map((v) => (
911
+ <div key={v.label} className="flex justify-between items-center text-xs">
912
+ <span className="text-slate-400 w-32 truncate">{v.label}</span>
913
+ <div className="flex items-center gap-2 flex-1 ml-2">
914
+ <div className="flex-1 h-1.5 bg-slate-800 rounded-full overflow-hidden">
915
+ <div className={`h-full ${v.score < 5 ? 'bg-red-400' : v.score < 8 ? 'bg-amber-400' : 'bg-emerald-400'}`} style={{ width: `${(v.score / 10) * 100}%` }} />
916
+ </div>
917
+ <span className="font-mono text-slate-300 w-6 text-right">{v.score}</span>
918
+ </div>
919
+ </div>
920
+ ))}
921
+ </div>
922
+ </div>
923
+
924
+ <div className="bg-slate-950 p-4 rounded-lg border border-slate-800">
925
+ <div className="text-xs font-bold text-pink-400 uppercase mb-3">Modality Alignment</div>
926
+ <div className="space-y-2">
927
+ {[
928
+ { label: 'Video ↔ Audio', score: parseFloat(demoResult.align_video_audio) || 0 },
929
+ { label: 'Video ↔ Caption', score: parseFloat(demoResult.align_video_caption) || 0 },
930
+ { label: 'Audio ↔ Caption', score: parseFloat(demoResult.align_audio_caption) || 0 },
931
+ ].map((v) => (
932
+ <div key={v.label} className="flex justify-between items-center text-xs">
933
+ <span className="text-slate-400 w-32 truncate">{v.label}</span>
934
+ <div className="flex items-center gap-2 flex-1 ml-2">
935
+ <div className="flex-1 h-1.5 bg-slate-800 rounded-full overflow-hidden">
936
+ <div className={`h-full ${v.score < 5 ? 'bg-red-400' : v.score < 8 ? 'bg-amber-400' : 'bg-emerald-400'}`} style={{ width: `${(v.score / 10) * 100}%` }} />
937
+ </div>
938
+ <span className="font-mono text-slate-300 w-6 text-right">{v.score}</span>
939
+ </div>
940
+ </div>
941
+ ))}
942
+ </div>
943
+ </div>
944
+ </div>
945
+ </div>
946
+
947
+ <div className="flex justify-end">
948
+ <button onClick={() => {setDemoResult(null); setDemoLogs(''); setDemoLink('');}} className="text-xs text-slate-500 hover:text-white flex items-center gap-1 border border-slate-800 px-3 py-1.5 rounded bg-slate-950">
949
+ <RotateCcw className="w-3 h-3"/> Reset Demo
950
+ </button>
951
+ </div>
952
+ </div>
953
+ )}
954
+ </div>
955
+
956
+ {/* EXISTING HOME TAB CONTENT */}
957
  <div className="grid grid-cols-3 gap-6">
958
  <div className="col-span-2 bg-slate-900/50 border border-slate-800 rounded-xl p-6">
959
  <h2 className="text-xl font-bold text-white mb-4">Philosophy & Methodology</h2>
 
1113
  <div className="space-y-1">
1114
  <input type="password" value={apiKey} onChange={e => setApiKey(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white placeholder-slate-600" placeholder="NRP API Token"/>
1115
  </div>
 
 
 
 
 
 
 
 
 
 
 
1116
  </>
1117
  )}
 
 
 
 
 
 
1118
 
1119
+ <div className="space-y-1">
1120
+ <label className="text-[10px] text-slate-500">Model Name</label>
1121
+ <input list="modelSuggestions" value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white placeholder-slate-600" placeholder="Model Name"/>
1122
+ </div>
1123
+
1124
+ <div className="space-y-1 mt-2">
1125
+ <label className="text-[10px] text-slate-500">Reasoning Method</label>
1126
+ <select value={reasoningMethod} onChange={e => setReasoningMethod(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
1127
+ <option value="cot">Standard Chain of Thought</option>
1128
+ <option value="fcot">Fractal Chain of Thought</option>
1129
+ </select>
1130
+ </div>
1131
+ <div className="space-y-1">
1132
+ <label className="text-[10px] text-slate-500">Prompt Persona</label>
1133
+ <select value={promptTemplate} onChange={e => setPromptTemplate(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
1134
+ {availablePrompts.length > 0 ? availablePrompts.map(p => (
1135
+ <option key={p.id} value={p.id}>{p.name}</option>
1136
+ )) : <option value="standard">Standard</option>}
1137
+ </select>
1138
+ </div>
1139
  </div>
1140
  </div>
1141
 
 
1312
  </>
1313
  )}
1314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1315
  <div className="space-y-1">
1316
+ <label className="text-[10px] text-slate-500">Model Name</label>
1317
+ <input list="modelSuggestions" value={modelName} onChange={e => setModelName(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white" placeholder="Model Name"/>
1318
+ </div>
1319
+
1320
+ <div className="space-y-1 mt-2">
1321
  <label className="text-[10px] text-slate-500">Reasoning Method</label>
1322
  <select value={reasoningMethod} onChange={e => setReasoningMethod(e.target.value)} className="w-full bg-slate-950 border border-slate-700 rounded p-2 text-xs text-white">
1323
  <option value="cot">Standard Chain of Thought</option>
 
1369
  </thead>
1370
  <tbody>
1371
  {queueList.map((q, i, arr) => (
1372
+ <React.Fragment key={i}>
1373
+ <tr
1374
+ className={`border-t border-slate-800/50 hover:bg-white/5 ${selectedQueueItems.has(q.link) ? 'bg-indigo-900/20' : ''}`}
1375
+ onMouseDown={(e) => {
1376
+ isDraggingQueueRef.current = true;
1377
+ toggleQueueSelection(e, q.link, i, arr);
1378
+ }}
1379
+ onMouseEnter={() => {
1380
+ if (isDraggingQueueRef.current && !selectedQueueItems.has(q.link)) {
1381
+ setSelectedQueueItems(prev => new Set(prev).add(q.link));
1382
+ setLastQueueIndex(i);
1383
+ }
1384
+ }}
1385
+ >
1386
+ <td className="p-3 cursor-pointer">
1387
+ {selectedQueueItems.has(q.link) ? <CheckSquare className="w-4 h-4 text-indigo-400"/> : <Square className="w-4 h-4 text-slate-600"/>}
1388
+ </td>
1389
+ <td className="p-3 font-bold text-[10px]">{q.task_type === 'Verify' ? <span className="text-amber-400">VERIFY</span> : <span className="text-slate-500">INGEST</span>}</td>
1390
+ <td className="p-3">
1391
+ <div className="flex items-center gap-2">
1392
+ <span className="text-sky-500 font-mono break-all">{q.link}</span>
1393
+ {q.comments && q.comments.length > 0 && (
1394
+ <button
1395
+ onClick={(e) => toggleQueueExpand(e, q.link)}
1396
+ className="px-2 py-1 bg-slate-800 rounded text-[10px] text-slate-300 hover:bg-slate-700 flex items-center gap-1 z-10"
1397
+ >
1398
+ {expandedQueueItems.has(q.link) ? <ChevronUp className="w-3 h-3"/> : <ChevronDown className="w-3 h-3"/>}
1399
+ {q.comments.length} Comments
1400
+ </button>
1401
+ )}
1402
+ </div>
1403
+ </td>
1404
+ <td className="p-3">
1405
+ {q.status === 'Processed' ?
1406
+ <span className="text-emerald-500 flex items-center gap-1"><CheckCircle2 className="w-3 h-3"/> Done</span> :
1407
+ q.status === 'Error' ?
1408
+ <span className="text-red-500 flex items-center gap-1"><AlertCircle className="w-3 h-3"/> Error</span> :
1409
+ <span className="text-amber-500">Pending</span>
1410
+ }
1411
+ </td>
1412
+ </tr>
1413
+ {expandedQueueItems.has(q.link) && q.comments && q.comments.length > 0 && (
1414
+ <tr className="bg-slate-900/40">
1415
+ <td colSpan={4} className="p-0">
1416
+ <div className="px-12 py-3 border-l-2 border-indigo-500 ml-4 space-y-2">
1417
+ {q.comments.map((c: any, ci: number) => (
1418
+ <div key={ci} className="flex flex-col gap-1 p-2 bg-slate-900 border border-slate-800 rounded">
1419
+ <div className="flex justify-between items-center">
1420
+ <div className="text-[10px] font-bold text-indigo-400">{c.author}</div>
1421
+ <div className="flex gap-2">
1422
+ {c.link && <a href={c.link} target="_blank" rel="noreferrer" className="text-[10px] text-sky-400 hover:underline">View Post</a>}
1423
+ {c.link && <button onClick={() => addSingleLinkDirect(c.link)} className="text-[10px] bg-slate-800 hover:bg-slate-700 text-slate-300 px-2 py-0.5 rounded flex items-center gap-1"><Plus className="w-3 h-3"/> Queue Comment</button>}
1424
+ </div>
1425
+ </div>
1426
+ <div className="text-xs text-slate-400 line-clamp-2">{c.text}</div>
1427
+ </div>
1428
+ ))}
1429
+ </div>
1430
+ </td>
1431
+ </tr>
1432
+ )}
1433
+ </React.Fragment>
1434
  ))}
1435
  </tbody>
1436
  </table>
 
1733
  <span className="capitalize text-slate-300 font-bold">{k.replace('v', 'Video-').replace('a', 'Audio-').replace('c', 'Caption')}</span>
1734
  <span className="text-emerald-400 font-mono font-bold">{(manualScores as any)[k]}/10</span>
1735
  </div>
1736
+ <input type="range" min="1" max="10" value={(manualScores as any)[k]} onChange={e => setManualScores({...manualScores,[k]: parseInt(e.target.value)})} className="w-full accent-emerald-500"/>
1737
  </div>
1738
  ))}
1739
  </div>
 
1813
  </div>
1814
  )}
1815
 
1816
+ {/* COMMUNITY AND ANALYTICS TABS (UNCHANGED) */}
1817
  {activeTab === 'community' && (
1818
  <div className="flex h-full gap-6">
1819
  <div className="w-1/3 bg-slate-900/50 border border-slate-800 rounded-xl overflow-auto">
main.go CHANGED
@@ -33,7 +33,17 @@ func main() {
33
  strings.HasPrefix(r.URL.Path, "/download-dataset") ||
34
  strings.HasPrefix(r.URL.Path, "/extension") ||
35
  strings.HasPrefix(r.URL.Path, "/manage") ||
36
- strings.HasPrefix(r.URL.Path, "/queue") {
 
 
 
 
 
 
 
 
 
 
37
 
38
  log.Printf("Proxying %s to Python Backend...", r.URL.Path)
39
  proxy.ServeHTTP(w, r)
@@ -57,4 +67,4 @@ func main() {
57
  if err := http.ListenAndServe(":"+port, nil); err != nil {
58
  log.Fatal(err)
59
  }
60
- }
 
33
  strings.HasPrefix(r.URL.Path, "/download-dataset") ||
34
  strings.HasPrefix(r.URL.Path, "/extension") ||
35
  strings.HasPrefix(r.URL.Path, "/manage") ||
36
+ strings.HasPrefix(r.URL.Path, "/queue") ||
37
+ strings.HasPrefix(r.URL.Path, "/a2a") ||
38
+ strings.HasPrefix(r.URL.Path, "/health") ||
39
+ strings.HasPrefix(r.URL.Path, "/benchmarks") ||
40
+ strings.HasPrefix(r.URL.Path, "/config") ||
41
+ strings.HasPrefix(r.URL.Path, "/tags") ||
42
+ strings.HasPrefix(r.URL.Path, "/manual") ||
43
+ strings.HasPrefix(r.URL.Path, "/profiles") ||
44
+ strings.HasPrefix(r.URL.Path, "/community") ||
45
+ strings.HasPrefix(r.URL.Path, "/dataset") ||
46
+ strings.HasPrefix(r.URL.Path, "/analyze") {
47
 
48
  log.Printf("Proxying %s to Python Backend...", r.URL.Path)
49
  proxy.ServeHTTP(w, r)
 
67
  if err := http.ListenAndServe(":"+port, nil); err != nil {
68
  log.Fatal(err)
69
  }
70
+ }
src/app.py CHANGED
@@ -1,5 +1,11 @@
1
  import os
2
  import sys
 
 
 
 
 
 
3
  import asyncio
4
  import subprocess
5
  from pathlib import Path
@@ -10,9 +16,6 @@ import datetime
10
  import json
11
  import hashlib
12
  import re
13
- import glob
14
- import shutil
15
- import time
16
  from fastapi import FastAPI, Request, Form, UploadFile, File, Body, HTTPException
17
  from fastapi.responses import HTMLResponse, StreamingResponse, PlainTextResponse, Response, FileResponse, JSONResponse
18
  from fastapi.templating import Jinja2Templates
@@ -22,29 +25,17 @@ import yt_dlp
22
  import inference_logic
23
  import factuality_logic
24
  import transcription
25
- from factuality_logic import parse_vtt
26
- from toon_parser import parse_veracity_toon
 
27
 
28
- try:
29
- import mlcroissant as mlc
30
- CROISSANT_AVAILABLE = True
31
- except ImportError:
32
- try:
33
- import croissant as mlc
34
- CROISSANT_AVAILABLE = True
35
- except ImportError:
36
- mlc = None
37
- CROISSANT_AVAILABLE = False
38
-
39
- # Configure Logging with High Verbosity
40
- logging.basicConfig(
41
- level=logging.INFO,
42
- format="%(asctime)s - %(levelname)s - %(message)s",
43
- handlers=[logging.StreamHandler(sys.stdout)]
44
- )
45
- logger = logging.getLogger("vChat")
46
 
47
- LITE_MODE = os.getenv("LITE_MODE", "false").lower() == "true"
 
 
48
 
49
  app = FastAPI()
50
 
@@ -56,469 +47,718 @@ app.add_middleware(
56
  allow_headers=["*"],
57
  )
58
 
59
- # HF Spaces specific path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  STATIC_DIR = "/app/static"
61
  if not os.path.isdir(STATIC_DIR):
62
- # Fallback if running locally
63
- STATIC_DIR = "static"
64
- os.makedirs(STATIC_DIR, exist_ok=True)
65
-
 
 
 
 
66
  app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
67
- templates = Jinja2Templates(directory=STATIC_DIR)
68
 
69
- # Ensure data directories exist (HF Spaces writable locations)
70
- os.makedirs("data/videos", exist_ok=True)
71
- os.makedirs("data", exist_ok=True)
72
- os.makedirs("data/labels", exist_ok=True)
73
- os.makedirs("data/prompts", exist_ok=True)
74
- os.makedirs("data/responses", exist_ok=True)
75
- os.makedirs("metadata", exist_ok=True)
76
 
77
- STOP_QUEUE_SIGNAL = False
 
 
78
 
79
- PROMPT_VARIANTS = {
80
- "standard": {"description": "Standard Analysis", "instruction": "You are an AI Factuality Assessment Agent operating under the 'Ali Arsanjani Factuality Factors' framework. Cross-reference claims and assess structural validity."},
81
- "skeptic": {"description": "High Skepticism", "instruction": "You are a highly skeptical investigator. Question all assumptions, identify the weakest links in the narrative, and aggressively penalize logical fallacies."}
82
- }
83
 
84
- @app.on_event("startup")
85
- async def startup_event():
86
- logger.info("Application starting up...")
87
- try:
88
- transcription.load_model()
89
- except Exception as e:
90
- logger.warning(f"Could not load Whisper model: {e}")
91
 
92
- if not LITE_MODE:
93
- try:
94
- inference_logic.load_models()
95
- except Exception as e:
96
- logger.fatal(f"Could not load local inference models. Error: {e}", exc_info=True)
97
- else:
98
- logger.info("Running in LITE mode (API Only).")
99
 
100
- @app.get("/", response_class=HTMLResponse)
101
- async def read_root(request: Request):
102
- custom_model_available = False
103
- if not LITE_MODE:
104
- custom_model_available = inference_logic.peft_model is not None
105
- if not (Path(STATIC_DIR) / "index.html").exists():
106
- return HTMLResponse(content="Frontend not found.", status_code=404)
107
- return templates.TemplateResponse("index.html", {
108
- "request": request,
109
- "custom_model_available": custom_model_available,
110
- "lite_mode": LITE_MODE
111
- })
112
 
113
- @app.get("/config/prompts")
114
- async def list_prompts():
115
- return[{"id": k, "name": v['description']} for k, v in PROMPT_VARIANTS.items()]
116
 
117
- @app.get("/model-architecture", response_class=PlainTextResponse)
118
- async def get_model_architecture():
119
- if LITE_MODE: return "Running in LITE mode."
120
- if inference_logic.base_model: return str(inference_logic.base_model)
121
- return "Model not loaded."
 
 
 
 
122
 
123
- @app.get("/download-dataset")
124
- async def download_dataset():
125
- file_path = Path("data/dataset.csv")
126
- if file_path.exists():
127
- return FileResponse(path=file_path, filename="dataset.csv", media_type='text/csv')
128
- return Response("Dataset not found.", status_code=404)
 
129
 
130
- progress_message = ""
131
- def progress_hook(d):
132
- global progress_message
133
- if d['status'] == 'downloading':
134
- progress_message = f"Downloading: {d.get('_percent_str', 'N/A')} at {d.get('_speed_str', 'N/A')}\r"
135
- elif d['status'] == 'finished':
136
- progress_message = f"\nDownload finished. Preparing video assets...\n"
137
-
138
- def get_cookies_path():
139
- """Look for cookies file in known locations for better yt-dlp support."""
140
- candidates =["cookies.txt", "data/cookies.txt", "/app/cookies.txt"]
141
- for c in candidates:
142
- if os.path.exists(c):
143
- return os.path.abspath(c)
144
- return None
145
-
146
- async def run_subprocess_async(command: list[str]):
147
- cmd_str = ' '.join(command)
148
- logger.info(f"[Subprocess] Running: {cmd_str}")
149
- process = await asyncio.create_subprocess_exec(*command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
150
- stdout, stderr = await process.communicate()
151
-
152
- if process.returncode != 0:
153
- err_msg = stderr.decode()
154
- logger.error(f"[Subprocess] Failed ({process.returncode}): {err_msg}")
155
- raise RuntimeError(f"Command failed: {err_msg}")
156
- logger.info(f"[Subprocess] Success.")
157
- return stdout.decode()
158
-
159
- def extract_tweet_id(url: str) -> str | None:
160
- match = re.search(r"(?:twitter|x)\.com/[^/]+/status/(\d+)", url)
161
- if match: return match.group(1)
162
- return None
163
-
164
- def normalize_link(link: str) -> str:
165
- if not link: return ""
166
- return link.split('?')[0].strip().rstrip('/').replace('http://', '').replace('https://', '').replace('www.', '')
167
-
168
- def check_if_processed(link: str) -> bool:
169
- target_id = extract_tweet_id(link)
170
- link_clean = normalize_link(link)
171
-
172
  for filename in["data/dataset.csv", "data/manual_dataset.csv"]:
173
  path = Path(filename)
174
- if not path.exists(): continue
175
- try:
176
- with open(path, 'r', encoding='utf-8', errors='ignore') as f:
177
- sample = f.read(2048)
178
- f.seek(0)
179
- try: has_header = csv.Sniffer().has_header(sample)
180
- except: has_header = True
181
-
182
- if has_header:
183
- reader = csv.DictReader(f)
184
- for row in reader:
185
- row_link = normalize_link(row.get('link', ''))
186
- if row_link == link_clean: return True
187
- row_id = row.get('id', '')
188
- if target_id and row_id == target_id: return True
189
- else:
190
- reader = csv.reader(f)
191
- for row in reader:
192
- if not row: continue
193
- if link_clean in [normalize_link(r) for r in row]: return True
194
- if target_id and target_id in row: return True
195
- except Exception:
196
- continue
197
- return False
198
-
199
- def update_queue_status_in_file(link: str, new_status: str):
200
  q_path = Path("data/batch_queue.csv")
201
  if not q_path.exists(): return
202
  rows =[]
 
 
203
  with open(q_path, 'r', encoding='utf-8') as f:
204
- reader = csv.reader(f)
205
- try: header = next(reader)
206
- except StopIteration: return
 
 
207
  for row in reader:
208
- if row and normalize_link(row[0]) == normalize_link(link):
209
- if len(row) > 2: row[2] = new_status
210
- else: row.extend([new_status, "Ingest"])
 
 
211
  rows.append(row)
212
- with open(q_path, 'w', newline='', encoding='utf-8') as f:
 
 
 
 
 
 
 
 
 
213
  writer = csv.writer(f)
214
- writer.writerow(header)
215
- writer.writerows(rows)
216
-
217
- async def prepare_video_assets_async(url: str) -> dict:
218
- global progress_message
219
- loop = asyncio.get_event_loop()
220
- is_local = not (url.startswith("http://") or url.startswith("https://"))
221
- video_id = "unknown"
222
- transcript_path = None
223
-
224
- logger.info(f"Preparing assets for URL: {url}")
225
 
226
- if is_local:
227
- original_path = Path(url)
228
- if not original_path.exists(): raise FileNotFoundError(f"File not found: {url}")
229
- video_id = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:16]
230
- metadata = {"id": video_id, "link": url, "caption": original_path.stem}
231
- else:
232
- tweet_id = extract_tweet_id(url)
233
- video_id = tweet_id if tweet_id else hashlib.md5(url.encode('utf-8')).hexdigest()[:16]
234
- sanitized_check = Path(f"data/videos/{video_id}_fixed.mp4")
235
-
236
- cookies_path = get_cookies_path()
237
- ydl_opts = {
238
- 'format': 'best[ext=mp4]/best',
239
- 'outtmpl': 'data/videos/%(id)s.%(ext)s',
240
- 'progress_hooks': [progress_hook],
241
- 'quiet': False,
242
- 'no_warnings': False,
243
- 'noplaylist': True,
244
- 'no_overwrites': True,
245
- 'writesubtitles': True,
246
- 'writeautomaticsub': True,
247
- 'subtitleslangs': ['en'],
248
- 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
249
- }
250
-
251
- if cookies_path:
252
- ydl_opts['cookiefile'] = cookies_path
253
- logger.info(f"Using cookies from {cookies_path}")
254
-
255
- if sanitized_check.exists():
256
- logger.info(f"Video {video_id} already cached at {sanitized_check}")
257
- original_path = sanitized_check
258
- metadata = {"id": video_id, "link": url, "caption": "Cached Video"}
259
- else:
260
- try:
261
- logger.info(f"Starting yt-dlp download for {video_id}...")
262
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
263
- info = await loop.run_in_executor(None, lambda: ydl.extract_info(url, download=True))
264
- original_path = Path(ydl.prepare_filename(info))
265
- metadata = {
266
- "id": info.get("id", video_id), "link": info.get("webpage_url", url),
267
- "caption": info.get("description", info.get("title", "N/A")).encode('ascii', 'ignore').decode('ascii').strip()[:500],
268
- "postdatetime": info.get("upload_date", "N/A")
269
- }
270
- video_id = info.get("id", video_id)
271
- logger.info("yt-dlp download successful.")
272
- except yt_dlp.utils.DownloadError as e:
273
- logger.error(f"yt-dlp download error: {e}")
274
- if "No video could be found" in str(e):
275
- raise ValueError(f"No video content found at {url}")
276
- raise RuntimeError(f"Download failed: {str(e)}")
277
- except Exception as e:
278
- logger.error(f"Unexpected yt-dlp error: {e}")
279
- raise RuntimeError(f"Download failed: {str(e)}")
280
-
281
- transcript_path = next(Path("data/videos").glob(f"{video_id}*.en.vtt"), None)
282
- if not transcript_path: transcript_path = next(Path("data/videos").glob(f"{video_id}*.vtt"), None)
283
-
284
- sanitized_path = Path(f"data/videos/{video_id}_fixed.mp4")
285
-
286
- # --- FFmpeg Sanitization Logic with Robust Fallback ---
287
- if not sanitized_path.exists() and original_path.exists():
288
- logger.info(f"Sanitizing video {video_id} (Original: {original_path})...")
289
- ffmpeg_bin = shutil.which('ffmpeg')
290
- if not ffmpeg_bin: raise RuntimeError("FFmpeg binary not found in system path!")
291
-
292
- try:
293
- await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c:v", "libx264", "-c:a", "aac", "-pix_fmt", "yuv420p", "-y", str(sanitized_path)])
294
- logger.info("Sanitization (re-encode) successful.")
295
- except Exception as e:
296
- logger.warning(f"Re-encode failed ({e}). Attempting Stream Copy...")
297
- try:
298
- await run_subprocess_async([ffmpeg_bin, "-i", str(original_path), "-c", "copy", "-y", str(sanitized_path)])
299
- logger.info("Sanitization (copy) successful.")
300
- except Exception as e2:
301
- logger.error(f"Sanitization failed completely: {e2}")
302
- if original_path.suffix == '.mp4':
303
- logger.warning("Using original file as sanitized file.")
304
- shutil.copy(original_path, sanitized_path)
305
- else:
306
- raise RuntimeError("Could not produce a valid MP4 file.")
307
-
308
- # --- Audio Extraction ---
309
- audio_path = sanitized_path.with_suffix('.wav')
310
- if not audio_path.exists() and sanitized_path.exists():
311
- logger.info(f"Extracting audio to {audio_path}...")
312
- try:
313
- await run_subprocess_async(["ffmpeg", "-i", str(sanitized_path), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", "-y", str(audio_path)])
314
- logger.info("Audio extraction successful.")
315
- except Exception as e:
316
- logger.error(f"Audio extraction failed: {e}")
317
-
318
- # --- Transcription ---
319
- if not transcript_path and audio_path.exists() and transcription.transcription_model is not None:
320
- logger.info("Generating transcript via Whisper...")
321
- transcript_path = await loop.run_in_executor(None, transcription.generate_transcript, str(audio_path))
322
- elif not transcript_path:
323
- logger.info("Skipping local transcription (Whisper not loaded or audio missing).")
324
 
325
- return {"video": str(sanitized_path), "transcript": str(transcript_path) if transcript_path else None, "metadata": metadata}
 
 
326
 
327
- def safe_int(value):
328
- try:
329
- clean = re.sub(r'[^\d]', '', str(value))
330
- return int(clean) if clean else 0
331
- except Exception:
332
- return 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
- async def generate_and_save_croissant_metadata(row_data: dict) -> str:
 
335
  try:
336
- sanitized_data = {
337
- "id": str(row_data.get("id", "")),
338
- "link": str(row_data.get("link", "")),
339
- "visual_integrity_score": safe_int(row_data.get("visual_integrity_score")),
340
- "final_veracity_score": safe_int(row_data.get("final_veracity_score"))
341
- }
342
- video_id = sanitized_data["id"]
343
- timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
344
- croissant_json = {
345
- "@context": "https://schema.org/",
346
- "@type": "Dataset",
347
- "name": f"vchat-label-{video_id}",
348
- "description": f"Veracity analysis labels for video {video_id}",
349
- "url": sanitized_data["link"],
350
- "variableMeasured": sanitized_data
351
- }
352
- path = Path("metadata") / f"{video_id}_{timestamp}.json"
353
- path.write_text(json.dumps(croissant_json, indent=2))
354
- return str(path)
355
- except Exception:
356
- return "N/A (Error)"
 
 
 
 
 
 
 
357
 
358
- async def get_labels_for_link(video_url: str, gemini_config: dict, vertex_config: dict, nrp_config: dict, model_selection: str, include_comments: bool, reasoning_method: str = "cot", system_persona: str = ""):
 
 
 
 
 
 
359
  try:
360
- yield f"Downloading assets for {video_url}..."
 
 
361
 
362
- try:
363
- paths = await prepare_video_assets_async(video_url)
364
- except ValueError as ve:
365
- yield f"Skipped: {str(ve)}"
366
- logger.warning(f"Skipping {video_url}: {ve}")
367
- return
368
- except Exception as e:
369
- yield f"Error preparing assets: {str(e)}"
370
- logger.error(f"Asset prep failed for {video_url}: {e}")
371
- return
372
-
373
- video_path = paths["video"]
374
- transcript_text = parse_vtt(paths["transcript"]) if paths["transcript"] else "No transcript (Audio/Video Analysis only)."
375
- caption = paths["metadata"].get("caption", "")
376
 
377
- yield f"Assets ready. Running inference ({model_selection}, {reasoning_method.upper()})..."
378
- logger.info(f"Starting inference pipeline for {video_url} (Transcript len: {len(transcript_text)})")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
- final_labels = None
381
- raw_toon = ""
382
- prompt_used = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
- if model_selection == 'gemini':
385
- pipeline = inference_logic.run_gemini_labeling_pipeline
386
- config = gemini_config
387
- elif model_selection == 'vertex':
388
- pipeline = inference_logic.run_vertex_labeling_pipeline
389
- config = vertex_config
390
- elif model_selection == 'nrp':
391
- if hasattr(inference_logic, 'run_nrp_labeling_pipeline'):
392
- pipeline = inference_logic.run_nrp_labeling_pipeline
393
- else:
394
- yield "NRP Pipeline not supported in this version."
395
- return
396
- config = nrp_config
397
- else:
398
- yield f"Unknown model selection: {model_selection}"
399
- return
400
 
401
- try:
402
- async for msg in pipeline(video_path, caption, transcript_text, config, include_comments, reasoning_method, system_persona, paths["metadata"]["id"]):
403
- if isinstance(msg, dict) and "parsed_data" in msg:
404
- final_labels = msg["parsed_data"]
405
- raw_toon = msg.get("raw_toon", "")
406
- prompt_used = msg.get("prompt_used", "")
407
- logger.info("Inference successful. Data parsed.")
408
- elif isinstance(msg, str):
409
- yield msg
410
- elif isinstance(msg, dict) and "error" in msg:
411
- yield f"API Error: {msg['error']}"
412
- except Exception as pipe_err:
413
- logger.error(f"Pipeline crashed: {pipe_err}")
414
- yield f"Critical Pipeline Failure: {pipe_err}"
415
- return
416
-
417
- if not final_labels:
418
- logger.error(f"Inference pipeline completed but returned no labels for {video_url}")
419
- yield "No labels generated. Check logs."
420
- return
421
 
422
- final_labels["meta_info"] = {
423
- "prompt_used": prompt_used,
424
- "model_selection": model_selection,
425
- "reasoning_method": reasoning_method
426
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
- vec = final_labels.get("veracity_vectors", {})
429
- mod = final_labels.get("modalities", {})
430
- fin = final_labels.get("final_assessment", {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
 
 
 
 
432
  row = {
433
- "id": paths["metadata"]["id"],
434
- "link": paths["metadata"]["link"],
435
- "caption": caption,
436
- "postdatetime": paths["metadata"].get("postdatetime", ""),
437
- "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
438
- "videotranscriptionpath": paths["transcript"] or "",
439
- "visual_integrity_score": vec.get("visual_integrity_score", "0"),
440
- "audio_integrity_score": vec.get("audio_integrity_score", "0"),
441
- "source_credibility_score": vec.get("source_credibility_score", "0"),
442
- "logical_consistency_score": vec.get("logical_consistency_score", "0"),
443
- "emotional_manipulation_score": vec.get("emotional_manipulation_score", "0"),
444
- "video_audio_score": mod.get("video_audio_score", "0"),
445
- "video_caption_score": mod.get("video_caption_score", "0"),
446
- "audio_caption_score": mod.get("audio_caption_score", "0"),
447
- "final_veracity_score": fin.get("veracity_score_total", "0"),
448
- "final_reasoning": fin.get("reasoning", "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  }
450
- yield {"csv_row": row, "full_json": final_labels, "raw_toon": raw_toon}
451
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  except Exception as e:
453
- logger.error(f"Fatal error in get_labels_for_link: {e}", exc_info=True)
454
- yield {"error": str(e)}
455
 
456
- @app.get("/queue/list")
457
- async def get_queue_list():
458
- queue_path = Path("data/batch_queue.csv")
459
- if not queue_path.exists(): return[]
460
- items =[]
461
- with open(queue_path, 'r', encoding='utf-8') as f:
462
- reader = csv.reader(f)
463
- try: next(reader)
464
- except: pass
465
- for row in reader:
466
- if len(row) > 0:
467
- link = row[0]
468
- status = row[2] if len(row) > 2 else ("Processed" if check_if_processed(link) else "Pending")
469
- items.append({
470
- "link": link,
471
- "timestamp": row[1] if len(row) > 1 else "",
472
- "status": status,
473
- "task_type": row[3] if len(row) > 3 else "Ingest"
474
- })
475
- return items
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476
 
477
  @app.post("/queue/add")
478
- async def add_queue_item(payload: dict = Body(...)):
479
- link = payload.get("link")
480
- if not link:
481
- return {"status": "error", "message": "Link required"}
482
  q_path = Path("data/batch_queue.csv")
483
  existing = set()
484
  if q_path.exists():
485
- with open(q_path, 'r', encoding='utf-8') as f:
486
- reader = csv.reader(f)
487
- for row in reader:
488
- if row: existing.add(normalize_link(row[0]))
489
-
490
- if normalize_link(link) in existing:
491
- return {"status": "ignored", "message": "Link already in queue"}
492
 
493
  with open(q_path, 'a', newline='', encoding='utf-8') as f:
494
- writer = csv.writer(f)
495
- if not q_path.exists() or q_path.stat().st_size == 0:
496
- writer.writerow(["link", "ingest_timestamp", "status", "task_type"])
497
- writer.writerow([link.strip(), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Pending", "Ingest"])
498
  return {"status": "success", "link": link}
499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  @app.post("/queue/clear_processed")
501
  async def clear_processed_queue():
502
  q_path = Path("data/batch_queue.csv")
503
  if not q_path.exists(): return {"status": "success", "removed_count": 0}
504
-
505
  kept_rows =[]
506
  removed_count = 0
507
- with open(q_path, 'r', encoding='utf-8') as f:
508
- reader = csv.reader(f)
509
- header = next(reader, ["link", "ingest_timestamp", "status", "task_type"])
510
- for row in reader:
511
- if not row: continue
512
- link = row[0]
513
- status = row[2] if len(row) > 2 else "Pending"
514
- if status == "Processed" or check_if_processed(link):
515
- removed_count += 1
516
- else:
517
- kept_rows.append(row)
518
-
519
  with open(q_path, 'w', newline='', encoding='utf-8') as f:
520
- writer = csv.writer(f)
521
- writer.writerow(header)
522
  writer.writerows(kept_rows)
523
  return {"status": "success", "removed_count": removed_count}
524
 
@@ -526,24 +766,17 @@ async def clear_processed_queue():
526
  async def delete_queue_items(request: Request):
527
  try:
528
  data = await request.json()
529
- target_links = set(normalize_link(l) for l in data.get("links",[]))
530
  q_path = Path("data/batch_queue.csv")
531
  if not q_path.exists(): return {"status": "success", "count": 0}
532
  kept_rows =[]
533
  deleted_count = 0
534
- with open(q_path, 'r', encoding='utf-8') as f:
535
- reader = csv.reader(f)
536
- header = next(reader,["link", "ingest_timestamp", "status", "task_type"])
537
- for row in reader:
538
- if not row: continue
539
- link = row[0]
540
- if normalize_link(link) in target_links:
541
- deleted_count += 1
542
- else:
543
- kept_rows.append(row)
544
  with open(q_path, 'w', newline='', encoding='utf-8') as f:
545
- writer = csv.writer(f)
546
- writer.writerow(header)
547
  writer.writerows(kept_rows)
548
  return {"status": "success", "count": deleted_count}
549
  except Exception as e: return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
@@ -552,89 +785,109 @@ async def delete_queue_items(request: Request):
552
async def requeue_items(request: Request):
    """Reset the status of matching queue rows back to "Pending"."""
    try:
        payload = await request.json()
        wanted = {normalize_link(raw) for raw in payload.get("links", [])}
        q_path = Path("data/batch_queue.csv")
        if not q_path.exists(): return {"status": "success", "count": 0}

        all_rows = []
        touched = 0
        with open(q_path, 'r', encoding='utf-8') as fh:
            reader = csv.reader(fh)
            header = next(reader, ["link", "ingest_timestamp", "status", "task_type"])
            for record in reader:
                if not record: continue
                if normalize_link(record[0]) in wanted:
                    # Column 2 is the status column when present.
                    if len(record) > 2: record[2] = "Pending"
                    touched += 1
                all_rows.append(record)

        with open(q_path, 'w', newline='', encoding='utf-8') as fh:
            writer = csv.writer(fh)
            writer.writerow(header)
            writer.writerows(all_rows)
        return {"status": "success", "count": touched}
    except Exception as e: return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
576
 
577
@app.post("/queue/stop")
async def stop_queue_processing():
    """Ask the batch-queue loop to stop; takes effect before the next item."""
    global STOP_QUEUE_SIGNAL
    logger.info("Received Stop Signal from User.")
    # Cooperative cancellation: the queue loop polls this flag between items.
    STOP_QUEUE_SIGNAL = True
    return {"status": "stopping"}
583
-
584
@app.post("/queue/upload_csv")
async def upload_csv_to_queue(file: UploadFile = File(...)):
    """Bulk-import links from an uploaded CSV into the batch queue.

    Accepts a "link"/"url" column, or a headerless file whose first column
    holds URLs. Each new link is appended as a Pending/Ingest row; links
    already queued are skipped.
    """
    try:
        content = await file.read()
        try:
            decoded = content.decode('utf-8').splitlines()
        except UnicodeDecodeError:
            decoded = content.decode('latin-1').splitlines()

        reader = csv.reader(decoded)
        links_to_add = []
        header = next(reader, None)
        if not header: return {"status": "empty file"}

        link_idx = 0
        header_lower = [h.lower() for h in header]

        if "link" in header_lower: link_idx = header_lower.index("link")
        elif "url" in header_lower: link_idx = header_lower.index("url")
        elif len(header) > 0 and header[0].strip().startswith("http"):
            # Headerless file: the first row is itself a link.
            links_to_add.append(header[0])
            link_idx = 0

        for row in reader:
            if len(row) > link_idx and row[link_idx].strip():
                links_to_add.append(row[link_idx].strip())

        queue_path = Path("data/batch_queue.csv")
        # Decide about the header BEFORE open('a') creates the file.
        need_header = not queue_path.exists() or queue_path.stat().st_size == 0

        # FIX: parse the existing queue as CSV and index by normalized link.
        # The old check did `normalize_link(link) in raw_line` over every raw
        # file line — O(n*m) and false-positive whenever one link was a
        # substring of another (or of a timestamp/status cell).
        existing = set()
        if queue_path.exists():
            with open(queue_path, 'r', encoding='utf-8') as f:
                q_reader = csv.reader(f)
                next(q_reader, None)  # skip header
                for q_row in q_reader:
                    if q_row: existing.add(normalize_link(q_row[0]))

        added_count = 0
        with open(queue_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if need_header:
                writer.writerow(["link", "ingest_timestamp", "status", "task_type"])

            for link in links_to_add:
                norm = normalize_link(link)
                if norm in existing: continue
                # Track as we go so the same link twice in one upload is
                # only queued once.
                existing.add(norm)
                writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Pending", "Ingest"])
                added_count += 1

        return {"status": "success", "added": added_count}
    except Exception as e:
        logging.error(f"Upload CSV error: {e}")
        return JSONResponse(status_code=400, content={"error": str(e), "status": "failed"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638
 
639
  @app.post("/queue/run")
640
  async def run_queue_processing(
@@ -642,293 +895,138 @@ async def run_queue_processing(
642
  gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
643
  vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
644
  nrp_api_key: str = Form(""), nrp_model_name: str = Form(""), nrp_base_url: str = Form("https://ellm.nrp-nautilus.io/v1"),
645
- include_comments: bool = Form(False),
646
- reasoning_method: str = Form("cot"),
647
- prompt_template: str = Form("standard"),
648
- custom_query: str = Form(""),
649
- max_reprompts: int = Form(1)
650
  ):
651
  global STOP_QUEUE_SIGNAL
652
  STOP_QUEUE_SIGNAL = False
 
653
  gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name, "max_retries": max_reprompts}
654
- vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key, "max_retries": max_reprompts}
655
  nrp_config = {"api_key": nrp_api_key, "model_name": nrp_model_name, "base_url": nrp_base_url, "max_retries": max_reprompts}
656
-
657
  sel_p = PROMPT_VARIANTS.get(prompt_template, PROMPT_VARIANTS['standard'])
658
  system_persona_txt = sel_p['instruction']
659
- if custom_query.strip(): system_persona_txt += f"\n\nSPECIAL INSTRUCTION: {custom_query}"
660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  async def queue_stream():
662
- queue_path = Path("data/batch_queue.csv")
663
- if not queue_path.exists():
664
- yield "data: Queue empty.\n\n"
665
- return
666
 
667
- items =[]
668
- with open(queue_path, 'r', encoding='utf-8') as f:
669
- reader = csv.reader(f)
670
- try: next(reader)
671
- except: pass
672
- for row in reader:
673
- if row and len(row) > 0:
674
- status = row[2] if len(row) > 2 else "Pending"
675
- if status == "Pending":
676
- items.append(row[0])
677
 
678
- processed_count = 0
679
- total = len(items)
680
-
681
- logger.info(f"Starting batch queue processing for {total} items.")
682
-
683
- for i, link in enumerate(items):
684
- if STOP_QUEUE_SIGNAL:
685
- yield "data: [SYSTEM] Stopped by user.\n\n"
686
- logger.info("Stopping queue loop.")
687
  break
688
 
689
- if check_if_processed(link):
690
- yield f"data: [SKIP] {link} processed.\n\n"
691
- update_queue_status_in_file(link, "Processed")
692
- continue
693
-
694
- yield f"data: [START] {i+1}/{total}: {link}\n\n"
695
- final_data = None
696
-
697
- # Streaming results from pipeline
698
- async for res in get_labels_for_link(link, gemini_config, vertex_config, nrp_config, model_selection, include_comments, reasoning_method, system_persona_txt):
699
- if isinstance(res, str):
700
- msg = res.replace('\n', ' ')
701
- yield f"data: {msg}\n\n"
702
- if isinstance(res, dict):
703
- if "error" in res:
704
- yield f"data:[ERROR DETAIL] {res['error']}\n\n"
705
- if "csv_row" in res:
706
- final_data = res
707
-
708
- if final_data:
709
- row = final_data["csv_row"]
710
- vid_id = row["id"]
711
- ts = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
712
-
713
- # Save artifacts
714
- json_path = f"data/labels/{vid_id}_{ts}_labels.json"
715
- with open(json_path, 'w') as f: json.dump(final_data["full_json"], f, indent=2)
716
- with open(f"data/labels/{vid_id}_{ts}.toon", 'w') as f: f.write(final_data["raw_toon"])
717
-
718
- prompt_content = final_data.get("full_json", {}).get("meta_info", {}).get("prompt_used", "")
719
- if prompt_content:
720
- with open(f"data/prompts/{vid_id}_{ts}_prompt.txt", 'w', encoding='utf-8') as f:
721
- f.write(prompt_content)
722
-
723
- raw_response = final_data.get("raw_toon", "")
724
- if raw_response:
725
- with open(f"data/responses/{vid_id}.txt", 'w', encoding='utf-8') as f:
726
- f.write(raw_response)
727
-
728
- row["metadatapath"] = await generate_and_save_croissant_metadata(row)
729
- row["json_path"] = json_path
730
-
731
- dpath = Path("data/dataset.csv")
732
- exists = dpath.exists()
733
- with open(dpath, 'a', newline='', encoding='utf-8') as f:
734
- writer = csv.DictWriter(f, fieldnames=list(row.keys()), extrasaction='ignore')
735
- if not exists: writer.writeheader()
736
- writer.writerow(row)
737
-
738
- update_queue_status_in_file(link, "Processed")
739
- processed_count += 1
740
- yield f"data: [SUCCESS] Labeled.\n\n"
741
- else:
742
- update_queue_status_in_file(link, "Error")
743
- yield f"data: [FAIL] Failed to label. Check logs.\n\n"
744
-
745
- yield f"data: Batch Complete. +{processed_count} videos labeled.\n\n"
746
- yield "event: close\ndata: Done\n\n"
747
-
748
- return StreamingResponse(queue_stream(), media_type="text/event-stream")
749
 
750
@app.post("/extension/ingest")
async def extension_ingest(request: Request):
    """Queue a single link sent by the browser extension (idempotent)."""
    try:
        data = await request.json()
        link = data.get("link")
        if not link: raise HTTPException(status_code=400, detail="No link")
        q_path = Path("data/batch_queue.csv")
        file_exists = q_path.exists()

        # FIX: compare normalized links column-by-column. The old check was
        # `link in f.read()` — a raw substring scan that false-positived when
        # one URL was a prefix of another and missed equivalent URLs that
        # differ only in tracking parameters.
        if file_exists:
            target = normalize_link(link)
            with open(q_path, 'r', encoding='utf-8') as f:
                reader = csv.reader(f)
                next(reader, None)  # header
                for row in reader:
                    if row and normalize_link(row[0]) == target:
                        return {"status": "queued", "msg": "Duplicate"}

        with open(q_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            if not file_exists: writer.writerow(["link", "ingest_timestamp", "status", "task_type"])
            writer.writerow([link, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "Pending", "Ingest"])

        return {"status": "queued", "link": link}
    except HTTPException:
        raise  # FIX: don't convert the deliberate 400 into a generic 500
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
 
772
 
773
@app.post("/extension/save_comments")
async def extension_save_comments(request: Request):
    """Append extension-scraped comments for a link to data/comments.csv."""
    try:
        data = await request.json()
        link = data.get("link")
        comments = data.get("comments", [])
        if not link or not comments: raise HTTPException(status_code=400, detail="Missing data")

        csv_path = Path("data/comments.csv")
        exists = csv_path.exists()
        fieldnames = ["link", "author", "comment_text", "timestamp"]

        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists: writer.writeheader()

            # One shared ingest timestamp for the whole batch.
            ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            for c in comments:
                row = {"link": link, "timestamp": ts}
                if isinstance(c, dict):
                    row["author"] = c.get("author", "Unknown")
                    row["comment_text"] = c.get("text", "").strip()
                else:
                    # Plain-string comment payloads are accepted too.
                    row["author"] = "Unknown"
                    row["comment_text"] = str(c).strip()

                if row["comment_text"]:  # skip empty/whitespace-only comments
                    writer.writerow(row)

        # NOTE: count is the submitted batch size, not rows written
        # (empty comments are skipped above) — kept for client compatibility.
        return {"status": "saved", "count": len(comments)}
    except HTTPException:
        raise  # FIX: preserve the 400 instead of re-wrapping it as a 500
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
805
 
806
@app.post("/extension/save_manual")
async def extension_save_manual(request: Request):
    """Persist a manually-labeled record from the extension to data/manual_dataset.csv."""
    try:
        data = await request.json()
        link = data.get("link")
        labels = data.get("labels", {})
        stats = data.get("stats", {})
        if not link: raise HTTPException(status_code=400, detail="No link")

        # Stable id: tweet id when extractable, otherwise a hash of the link.
        video_id = extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:16]

        # Rubric score columns, in on-disk order; missing scores default to 0.
        score_keys = [
            "visual_integrity_score", "audio_integrity_score", "source_credibility_score",
            "logical_consistency_score", "emotional_manipulation_score",
            "video_audio_score", "video_caption_score", "audio_caption_score",
            "final_veracity_score",
        ]
        # Key insertion order matters: fieldnames below derive from it.
        row_data = {
            "id": video_id,
            "link": link,
            "caption": data.get("caption", ""),
            "collecttime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "source": "manual_extension",
            **{k: labels.get(k, 0) for k in score_keys},
            "final_reasoning": labels.get("reasoning", ""),
            "stats_likes": stats.get("likes", 0),
            "stats_shares": stats.get("shares", 0),
            "stats_comments": stats.get("comments", 0),
            "stats_platform": stats.get("platform", "unknown")
        }

        dpath = Path("data/manual_dataset.csv")
        exists = dpath.exists()
        fieldnames = list(row_data.keys())

        with open(dpath, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            if not exists: writer.writeheader()
            writer.writerow(row_data)

        return {"status": "saved"}
    except HTTPException:
        raise  # FIX: preserve the 400 validation error instead of a blanket 500
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
851
-
852
@app.get("/manage/list")
async def list_data():
    """Return all labeled rows (auto + manual datasets), newest first."""
    combined = []

    def ingest_file(csv_file, origin):
        # Read one dataset file, back-filling missing ids and attaching
        # any saved label JSON alongside each row.
        if not csv_file.exists(): return
        with open(csv_file, 'r', encoding='utf-8', errors='ignore') as fh:
            for record in csv.DictReader(fh):
                if not record.get('id') or record['id'].strip() == "":
                    url = record.get('link', '')
                    tweet_id = extract_tweet_id(url)
                    record['id'] = tweet_id if tweet_id else hashlib.md5(url.encode()).hexdigest()[:16]

                payload = None
                if record.get('json_path') and os.path.exists(record['json_path']):
                    try:
                        with open(record['json_path'], 'r') as jf: payload = json.load(jf)
                    except: pass  # best effort: tolerate missing/corrupt JSON

                record['source_type'] = origin
                record['json_data'] = payload
                combined.append(record)

    ingest_file(Path("data/dataset.csv"), "auto")
    ingest_file(Path("data/manual_dataset.csv"), "manual")
    combined.sort(key=lambda r: r.get('collecttime', ''), reverse=True)
    return combined
879
-
880
@app.delete("/manage/delete")
async def delete_data(id: str = "", link: str = ""):
    """Delete dataset rows matching an id or link, plus their artifact files.

    Query params: at least one of `id` / `link` is required (400 otherwise).
    Rows are removed from both data/dataset.csv and data/manual_dataset.csv;
    label/metadata files named after the resolved id are unlinked afterwards.
    """
    if not id and not link: raise HTTPException(status_code=400, detail="Must provide ID or Link")
    deleted_count = 0
    # Resolved lazily: when only a link was given, the id of the first
    # matching row is captured so artifact files can be cleaned up too.
    target_id = id

    def remove_from_csv(path):
        # Rewrite one CSV with matching rows removed; the file is only
        # touched when at least one row matched.
        nonlocal deleted_count, target_id
        if not path.exists(): return
        rows = []
        found_in_file = False
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames
            for row in reader:
                is_match = False
                if id and row.get('id') == id: is_match = True
                elif link and row.get('link') == link: is_match = True
                if is_match:
                    found_in_file = True
                    deleted_count += 1
                    if not target_id: target_id = row.get('id')
                else: rows.append(row)
        if found_in_file:
            with open(path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(rows)

    remove_from_csv(Path("data/dataset.csv"))
    remove_from_csv(Path("data/manual_dataset.csv"))
    if target_id:
        # Remove per-item artifacts (label JSON/toon files, croissant metadata).
        for p in Path("data/labels").glob(f"{target_id}_*"): p.unlink(missing_ok=True)
        for p in Path("metadata").glob(f"{target_id}_*"): p.unlink(missing_ok=True)
    return {"status": "deleted", "count": deleted_count}
915
-
916
@app.post("/label_video")
async def label_video_endpoint(
    video_url: str = Form(...), model_selection: str = Form(...),
    gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
    vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
    nrp_api_key: str = Form(""), nrp_model_name: str = Form(""), nrp_base_url: str = Form("https://ellm.nrp-nautilus.io/v1"),
    include_comments: bool = Form(False),
    reasoning_method: str = Form("cot")
):
    """Label one video URL and stream pipeline progress back as SSE."""
    # Bundle per-provider credentials exactly as the pipeline expects them.
    gemini_cfg = {"api_key": gemini_api_key, "model_name": gemini_model_name}
    vertex_cfg = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key}
    nrp_cfg = {"api_key": nrp_api_key, "model_name": nrp_model_name, "base_url": nrp_base_url}

    async def event_source():
        pipeline = get_labels_for_link(
            video_url, gemini_cfg, vertex_cfg, nrp_cfg,
            model_selection, include_comments, reasoning_method)
        async for msg in pipeline:
            # Strings are progress lines; a dict with "csv_row" marks completion.
            if isinstance(msg, str): yield f"data: {msg}\n\n"
            if isinstance(msg, dict) and "csv_row" in msg: yield "data: Done. Labels generated.\n\n"
        yield "event: close\ndata: Done.\n\n"

    return StreamingResponse(event_source(), media_type="text/event-stream")
 
1
  import os
2
  import sys
3
+
4
+ # --- FIX: Ensure 'src' is in sys.path so sibling imports work ---
5
+ current_dir = os.path.dirname(os.path.abspath(__file__))
6
+ if current_dir not in sys.path:
7
+ sys.path.append(current_dir)
8
+
9
  import asyncio
10
  import subprocess
11
  from pathlib import Path
 
16
  import json
17
  import hashlib
18
  import re
 
 
 
19
  from fastapi import FastAPI, Request, Form, UploadFile, File, Body, HTTPException
20
  from fastapi.responses import HTMLResponse, StreamingResponse, PlainTextResponse, Response, FileResponse, JSONResponse
21
  from fastapi.templating import Jinja2Templates
 
25
  import inference_logic
26
  import factuality_logic
27
  import transcription
28
+ import user_analysis_logic
29
+ import agent_logic
30
+ import common_utils
31
 
32
+ from toon_parser import parse_veracity_toon
33
+ from labeling_logic import PROMPT_VARIANTS, LABELING_PROMPT_TEMPLATE, FCOT_MACRO_PROMPT
34
+ import benchmarking
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
37
+ logger = logging.getLogger(__name__)
38
+ LITE_MODE = os.getenv("LITE_MODE", "true").lower() == "true"
39
 
40
  app = FastAPI()
41
 
 
47
  allow_headers=["*"],
48
  )
49
 
50
# --- CRITICAL: Mount A2A Agent Application FIRST ---
# Three outcomes, reported via /health: real agent mounted, fallback when the
# factory returns None, or an emergency stub when mounting raises.
agent_mount_status = "pending"
try:
    logger.info("Attempting to build A2A Agent App...")
    a2a_agent_app = agent_logic.create_a2a_app()
    if a2a_agent_app:
        app.mount("/a2a", a2a_agent_app)
        agent_mount_status = "success"
        logger.info("✅ A2A Agent App successfully mounted at /a2a")
    else:
        logger.warning("⚠️ Agent factory returned None. Mounting internal fallback.")
        from fastapi import FastAPI as InnerFastAPI
        fallback = InnerFastAPI()
        @fallback.post("/")
        @fallback.post("/jsonrpc")
        async def fallback_endpoint(request: Request):
            return {"jsonrpc": "2.0", "result": {"text": "Fallback Agent (Factory returned None)", "data": {"status": "fallback"}}}
        app.mount("/a2a", fallback)
        agent_mount_status = "fallback_none"
except Exception as e:
    logger.critical(f"❌ Failed to mount A2A Agent: {e}", exc_info=True)
    # FIX: capture the message now — Python unbinds the except variable `e`
    # when the except block exits, so the old f-string referencing `e` inside
    # the endpoint coroutine raised NameError at request time.
    mount_error_msg = str(e)
    from fastapi import FastAPI as InnerFastAPI
    fallback = InnerFastAPI()
    @fallback.post("/")
    @fallback.post("/jsonrpc")
    async def fallback_endpoint(request: Request):
        return {"jsonrpc": "2.0", "result": {"text": f"Emergency Agent (Mount Error: {mount_error_msg})", "data": {"status": "error"}}}
    app.mount("/a2a", fallback)
    agent_mount_status = f"error_{mount_error_msg}"
79
+
80
# --- Static Files & Frontend ---
# Resolve the static dir in priority order: container path, system install,
# built Vite frontend, then a bare local ./static created as a last resort.
STATIC_DIR = "/app/static"
if not os.path.isdir(STATIC_DIR):
    if os.path.isdir("/usr/share/vchat/static"):
        STATIC_DIR = "/usr/share/vchat/static"
    elif os.path.isdir("frontend/dist"):
        STATIC_DIR = "frontend/dist"
    else:
        STATIC_DIR = "static"
        os.makedirs(STATIC_DIR, exist_ok=True)

app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")

# --- FIX: Explicitly mount assets for Vite support ---
# Vite emits hashed bundles under /assets referenced by absolute URL,
# so they need their own mount in addition to /static.
assets_path = os.path.join(STATIC_DIR, "assets")
if os.path.exists(assets_path):
    app.mount("/assets", StaticFiles(directory=assets_path), name="assets")

# Downloaded videos are served straight from disk.
if not os.path.isdir("data/videos"):
    os.makedirs("data/videos", exist_ok=True)
app.mount("/videos", StaticFiles(directory="data/videos"), name="videos")

# The SPA's index.html is rendered from the same static dir.
templates = Jinja2Templates(directory=STATIC_DIR)
 
 
 
103
 
104
# Ensure all data directories exist
for d in ["data", "data/videos", "data/labels", "data/prompts", "data/responses", "metadata", "data/profiles", "data/comments", "data/mnl_labeled", "models/sandbox_autogluon"]:
    os.makedirs(d, exist_ok=True)

# Raise the csv module's per-cell size cap (label rows carry long captions
# and reasoning text). On platforms where sys.maxsize overflows the C long
# that csv expects, fall back to the 32-bit maximum.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    csv.field_size_limit(2147483647)

# Cooperative cancellation flag polled by the batch loop; set by POST /queue/stop.
STOP_QUEUE_SIGNAL = False
 
 
 
 
 
 
 
 
 
 
 
114
 
115
# --- CONSTANTS ---
# Canonical column order for data/batch_queue.csv.
QUEUE_COLUMNS = ["link", "ingest_timestamp", "status", "task_type"]

# Schema of data/manual_dataset.csv (human-verified ground truth).
GROUND_TRUTH_FIELDS = [
    "id", "link", "timestamp", "caption",
    "visual_integrity_score", "audio_integrity_score", "source_credibility_score",
    "logical_consistency_score", "emotional_manipulation_score",
    "video_audio_score", "video_caption_score", "audio_caption_score",
    "final_veracity_score", "final_reasoning",
    "stats_likes", "stats_shares", "stats_comments", "stats_platform",
    "tags", "classification", "source"
]

# Schema of data/dataset.csv (AI-labeled rows plus the config that produced them).
DATASET_COLUMNS = [
    "id", "link", "timestamp", "caption",
    "final_veracity_score", "visual_score", "audio_score", "source_score", "logic_score", "emotion_score",
    "align_video_audio", "align_video_caption", "align_audio_caption",
    "classification", "reasoning", "tags", "raw_toon",
    "config_type", "config_model", "config_prompt", "config_reasoning", "config_params"
]
135
 
136
def ensure_csv_schema(file_path: Path, fieldnames: list):
    """Migrate an existing CSV in place so its header contains every column
    in `fieldnames` (columns not in `fieldnames` are dropped on rewrite).

    No-op when the file is absent, empty, or already has all columns.
    Errors are logged rather than raised — startup must not crash on a bad file.
    """
    if not file_path.exists(): return
    try:
        rows = []
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            start_pos = f.tell()
            line = f.readline()
            if not line: return
            # FIX: parse the header line with the csv module instead of a
            # naive split(',') — a quoted header cell containing a comma was
            # previously mis-detected as two columns, forcing a bogus rewrite.
            existing_header = [h.strip() for h in next(csv.reader([line]), [])]
            missing = [col for col in fieldnames if col not in existing_header]
            if not missing: return
            # Re-read from the top so DictReader sees the header too.
            f.seek(start_pos)
            dict_reader = csv.DictReader(f)
            rows = list(dict_reader)

        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            for row in rows: writer.writerow(row)
    except Exception as e: logger.error(f"Schema migration error: {e}")
156
+
157
def get_processed_indices():
    """Collect the ids and normalized links of every already-labeled row."""
    known_ids = set()
    known_links = set()
    for csv_name in ["data/dataset.csv", "data/manual_dataset.csv"]:
        for record in common_utils.robust_read_csv(Path(csv_name)):
            rid = record.get('id')
            if rid: known_ids.add(rid)
            url = record.get('link')
            if url: known_links.add(common_utils.normalize_link(url))
    return known_ids, known_links
166
+
167
def check_if_processed(link: str, processed_ids=None, processed_links=None) -> bool:
    """True when `link` (by tweet id or by normalized URL) is already labeled.

    Pass pre-built index sets from get_processed_indices() when checking many
    links — otherwise the dataset CSVs are re-read on every call.
    """
    target_id = common_utils.extract_tweet_id(link)
    link_clean = common_utils.normalize_link(link)
    if processed_ids is None or processed_links is None:
        p_ids, p_links = get_processed_indices()
    else: p_ids, p_links = processed_ids, processed_links
    # FIX: coerce to a real bool — the bare and/or chain could leak
    # None/"" to callers despite the annotated bool return type.
    return bool(target_id and target_id in p_ids) or bool(link_clean and link_clean in p_links)
174
+
175
def update_queue_status(link: str, status: str, task_type: str = None):
    """Set the status of queue rows whose normalized link matches `link`.

    When `task_type` is given, only rows of that task type are updated.
    Legacy rows are upgraded in passing: a missing/empty task_type becomes
    "Ingest" and any QUEUE_COLUMNS missing from the header are appended.
    The file is only rewritten when at least one row actually changed.
    """
    q_path = Path("data/batch_queue.csv")
    if not q_path.exists(): return
    rows = []
    updated = False
    norm_target = common_utils.normalize_link(link)
    with open(q_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        # Start from the on-disk header (or the canonical schema when the
        # file is headerless), then add any canonical columns it lacks.
        fieldnames = list(reader.fieldnames) if reader.fieldnames else list(QUEUE_COLUMNS)
        for f_name in QUEUE_COLUMNS:
            if f_name not in fieldnames: fieldnames.append(f_name)

        for row in reader:
            # Back-fill task_type for rows written before the column existed.
            if "task_type" not in row or not row["task_type"]: row["task_type"] = "Ingest"
            if common_utils.normalize_link(row.get("link", "")) == norm_target:
                if task_type is None or row["task_type"] == task_type:
                    row["status"] = status
                    updated = True
            rows.append(row)

    if updated:
        with open(q_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(rows)
200
+
201
def log_queue_error(link: str, error_msg: str, task_type: str = None):
    """Append one row to data/queue_errors.csv and mark the queue item as Error."""
    p = Path("data/queue_errors.csv")
    # FIX: decide about the header BEFORE opening in append mode — open('a')
    # creates the file, so the old `not p.exists()` test inside the with-block
    # was dead code and the header decision silently rested on st_size alone.
    need_header = not p.exists() or p.stat().st_size == 0
    with open(p, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if need_header: writer.writerow(["link", "timestamp", "error"])
        writer.writerow([link, datetime.datetime.now().isoformat(), error_msg])
    update_queue_status(link, "Error", task_type)
 
 
 
 
 
 
 
 
208
 
209
@app.on_event("startup")
async def startup_event():
    """One-time boot tasks: migrate CSV schemas; optionally preload models."""
    # Bring legacy CSVs up to the current column sets before any request runs.
    ensure_csv_schema(Path("data/dataset.csv"), DATASET_COLUMNS)
    ensure_csv_schema(Path("data/manual_dataset.csv"), GROUND_TRUTH_FIELDS)
    ensure_csv_schema(Path("data/batch_queue.csv"), QUEUE_COLUMNS)
    if not LITE_MODE:
        # Best effort: a model preload failure must not block the API.
        try: inference_logic.load_models()
        except Exception: pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
@app.get("/health")
async def health_check():
    """Liveness probe; also reports whether the A2A agent mounted cleanly."""
    return {"status": "ok", "agent_mount": agent_mount_status}
221
 
222
@app.get("/benchmarks/stats")
async def get_benchmark_stats():
    """Return aggregate benchmark statistics from the benchmarking module."""
    return benchmarking.calculate_benchmarks()
225
+
226
@app.get("/benchmarks/leaderboard")
async def get_benchmark_leaderboard():
    """Return the model leaderboard computed by the benchmarking module."""
    return benchmarking.generate_leaderboard()
229
+
230
@app.post("/benchmarks/train_predictive")
async def run_predictive_training(config: dict = Body(...)):
    """Run predictive sandbox training with the posted config and return its result."""
    return benchmarking.train_predictive_sandbox(config)
233
+
234
@app.get("/config/prompts")
async def list_prompts():
    """List available prompt variants as {id, name} pairs for the UI."""
    return [{"id": k, "name": v['description']} for k, v in PROMPT_VARIANTS.items()]
237
+
238
@app.get("/config/tags")
async def list_configured_tags():
    """Return the saved tag configuration, or {} when none exists yet."""
    config_file = Path("data/tags.json")
    if not config_file.exists():
        return {}
    with open(config_file, 'r') as fh:
        return json.load(fh)
244
+
245
@app.post("/config/tags")
async def save_configured_tags(tags: dict = Body(...)):
    """Overwrite data/tags.json with the posted tag configuration."""
    out_file = Path("data/tags.json")
    with open(out_file, 'w', encoding='utf-8') as fh:
        json.dump(tags, fh, indent=2)
    return {"status": "success"}
250
+
251
@app.get("/tags/list")
async def list_all_tags():
    """Tally every tag used across data/dataset.csv, most frequent first."""
    counts = {}
    dataset_file = Path("data/dataset.csv")
    if dataset_file.exists():
        for record in common_utils.robust_read_csv(dataset_file):
            # Tags are stored as a comma-separated cell.
            for raw_tag in (record.get("tags", "") or "").split(','):
                tag = raw_tag.strip()
                if tag:
                    counts[tag] = counts.get(tag, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [{"name": tag, "count": n} for tag, n in ranked]
264
 
265
@app.post("/extension/ingest")
async def extension_ingest_link(request: Request):
    """Queue a link from the browser extension; optionally stash its comments.

    Dedupes against the queue by normalized link. Comments, when provided,
    are saved as JSON context for the later labeling pass.
    """
    try:
        data = await request.json()
        link = data.get("link")
        comments = data.get("comments", [])
        if not link:
            raise HTTPException(status_code=400, detail="Link required")

        q_path = Path("data/batch_queue.csv")
        # FIX: decide about the header before open('a') creates the file —
        # the old `not q_path.exists()` check inside the with-block was dead.
        need_header = not q_path.exists() or q_path.stat().st_size == 0
        existing = set()
        if q_path.exists():
            for r in common_utils.robust_read_csv(q_path):
                # Guard: skip rows with a missing/empty link column.
                if r.get('link'): existing.add(common_utils.normalize_link(r.get('link')))

        normalized = common_utils.normalize_link(link)
        if normalized not in existing:
            with open(q_path, 'a', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
                if need_header: writer.writeheader()
                writer.writerow({"link": link.strip(), "ingest_timestamp": datetime.datetime.now().isoformat(), "status": "Pending", "task_type": "Ingest"})

        if comments:
            tid = common_utils.extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:10]
            context_path = Path(f"data/comments/{tid}_ingest.json")
            with open(context_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "link": link,
                    "timestamp": datetime.datetime.now().isoformat(),
                    "comments": comments
                }, f, indent=2)
            logger.info(f"Saved {len(comments)} comments for ingestion context: {tid}")

        return {"status": "success", "link": link, "comments_saved": len(comments)}
    except HTTPException:
        raise  # FIX: surface the intended 400 instead of downgrading it to a 500 JSON body
    except Exception as e:
        logger.error(f"Ingest Error: {e}")
        return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
301
+
302
@app.post("/manual/promote")
async def promote_to_ground_truth(request: Request):
    """Copy AI-labeled rows (data/dataset.csv) into the ground-truth set.

    Body: {"ids": [...]} or {"id": ...}. Rows already present in
    data/manual_dataset.csv are skipped; AI-schema score columns are mapped
    onto the wider ground-truth schema.
    """
    try:
        data = await request.json()
        target_ids = data.get("ids", [])
        if not target_ids and data.get("id"): target_ids = [data.get("id")]

        if not target_ids: return JSONResponse({"status": "error", "message": "No IDs provided"}, status_code=400)

        # Index the AI dataset by stringified id for O(1) lookup per target.
        ai_path = Path("data/dataset.csv")
        ai_rows = {}
        if ai_path.exists():
            for row in common_utils.robust_read_csv(ai_path):
                if row.get('id'): ai_rows[str(row['id'])] = row

        # Collect ids already promoted so duplicates are not re-added.
        manual_path = Path("data/manual_dataset.csv")
        manual_exists = manual_path.exists()
        existing_ids = set()
        if manual_exists:
            for row in common_utils.robust_read_csv(manual_path):
                if row.get('id'): existing_ids.add(str(row['id']))

        new_rows = []
        promoted_count = 0
        for tid in target_ids:
            tid_str = str(tid)
            if tid_str in existing_ids: continue
            found_row = ai_rows.get(tid_str)
            if found_row:
                # Map AI-schema columns onto the ground-truth schema.
                # NOTE(review): several fields are hard-coded to 5 because the
                # AI pipeline has no equivalent column — presumably 5 is the
                # neutral midpoint; confirm that intent.
                mapped_row = {
                    "id": found_row.get("id"), "link": found_row.get("link"),
                    "timestamp": datetime.datetime.now().isoformat(), "caption": found_row.get("caption"),
                    "visual_integrity_score": found_row.get("visual_score", 0),
                    "audio_integrity_score": found_row.get("audio_score", 0),
                    "source_credibility_score": 5, "logical_consistency_score": found_row.get("logic_score", 0),
                    "emotional_manipulation_score": 5, "video_audio_score": 5,
                    "video_caption_score": found_row.get("align_video_caption", 0), "audio_caption_score": 5,
                    "final_veracity_score": found_row.get("final_veracity_score", 0),
                    "final_reasoning": found_row.get("reasoning", ""),
                    "stats_likes": 0, "stats_shares": 0, "stats_comments": 0, "stats_platform": "twitter",
                    "tags": found_row.get("tags", ""), "classification": found_row.get("classification", "None"),
                    "source": "manual_promoted"
                }
                new_rows.append(mapped_row)
                promoted_count += 1
                # Prevent double-promotion when the same id appears twice in the request.
                existing_ids.add(tid_str)

        if not new_rows: return {"status": "success", "promoted_count": 0}

        # Append to (or create) the ground-truth CSV.
        mode = 'a' if manual_exists else 'w'
        with open(manual_path, mode, newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=GROUND_TRUTH_FIELDS, extrasaction='ignore')
            if not manual_exists or manual_path.stat().st_size == 0: writer.writeheader()
            for r in new_rows: writer.writerow(r)

        return {"status": "success", "promoted_count": promoted_count}
    except Exception as e: return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
359
+
360
@app.post("/manual/delete")
async def delete_ground_truth(request: Request):
    """Delete ground-truth rows by id from data/manual_dataset.csv."""
    try:
        data = await request.json()
        target_ids = data.get("ids", [])
        if not target_ids and data.get("id"): target_ids = [data.get("id")]
        if not target_ids: raise HTTPException(status_code=400)

        # Ids are compared as strings on both sides.
        target_ids = [str(t) for t in target_ids]
        manual_path = Path("data/manual_dataset.csv")
        if not manual_path.exists(): return {"status": "error", "message": "File not found"}

        rows = []
        deleted_count = 0
        with open(manual_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                if str(row.get('id')) in target_ids: deleted_count += 1
                else: rows.append(row)
        with open(manual_path, 'w', newline='', encoding='utf-8') as f:
            # FIX: extrasaction='ignore' — legacy rows carrying columns
            # outside GROUND_TRUTH_FIELDS used to raise ValueError here and
            # turn the whole delete request into a 500.
            writer = csv.DictWriter(f, fieldnames=GROUND_TRUTH_FIELDS, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(rows)

        return {"status": "success", "deleted_count": deleted_count}
    except HTTPException:
        raise  # FIX: keep the intended 400 for an empty id list
    except Exception as e: return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
386
+
387
@app.post("/manual/verify_queue")
async def verify_queue_items(request: Request):
    """Re-queue ground-truth items as 'Verify' tasks for AI benchmarking.

    Each matched link is enqueued `resample_count` times (clamped to 1..100)
    so the pipeline can be sampled repeatedly against the same ground truth.
    """
    try:
        data = await request.json()
        target_ids = data.get("ids", [])
        resample_count = max(1, min(data.get("resample_count", 1), 100))
        if not target_ids:
            return JSONResponse({"status": "error", "message": "No IDs provided"}, status_code=400)

        # Coerce to str: JSON ids may be ints while the CSV stores text, which
        # previously made integer ids silently match nothing.
        target_ids = set(str(t) for t in target_ids)

        manual_path = Path("data/manual_dataset.csv")
        links_to_queue = []
        if manual_path.exists():
            for row in common_utils.robust_read_csv(manual_path):
                if str(row.get('id')) in target_ids:
                    link = row.get('link')
                    if link:  # guard: a row without a link would crash .strip() below
                        links_to_queue.append(link)

        if not links_to_queue:
            return {"status": "error", "message": "No matching links found in Ground Truth."}

        q_path = Path("data/batch_queue.csv")
        added_count = 0
        with open(q_path, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
            # 'a' mode creates the file, so only the size check detects a fresh file.
            if q_path.stat().st_size == 0:
                writer.writeheader()
            for link in links_to_queue:
                for _ in range(resample_count):
                    writer.writerow({
                        "link": link.strip(),
                        "ingest_timestamp": datetime.datetime.now().isoformat(),
                        "status": "Pending",
                        "task_type": "Verify"
                    })
                    added_count += 1

        return {"status": "success", "queued_count": added_count, "message": f"Added {added_count} items to queue for verification pipeline."}
    except Exception as e:
        return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
423
+
424
@app.get("/profiles/list")
async def list_profiles():
    """List every scraped profile directory together with its post count."""
    root = Path("data/profiles")
    if not root.exists():
        return []
    results = []
    try:
        for entry in root.iterdir():
            if not entry.is_dir():
                continue
            history_file = entry / "history.csv"
            line_total = 0
            if history_file.exists():
                # Data rows = total lines minus the CSV header line.
                with open(history_file, 'r', encoding='utf-8', errors='ignore') as fh:
                    line_total = sum(1 for _ in fh) - 1
            results.append({"username": entry.name, "posts_count": max(0, line_total)})
    except Exception:
        pass  # best-effort listing; a bad directory must not break the endpoint
    return sorted(results, key=lambda item: item['username'])
439
+
440
@app.get("/profiles/{username}/posts")
async def get_profile_posts(username: str):
    """Return a profile's scraped post history, flagging posts already labeled."""
    history_path = Path(f"data/profiles/{username}/history.csv")
    if not history_path.exists():
        return []
    processed_ids, processed_links = get_processed_indices()
    collected = []
    try:
        for record in common_utils.robust_read_csv(history_path):
            post_link = record.get('link', '')
            tweet_id = common_utils.extract_tweet_id(post_link)
            # Labeled if the tweet id is known, otherwise fall back to the
            # normalized-link index.
            labeled = bool(tweet_id and tweet_id in processed_ids) or \
                      common_utils.normalize_link(post_link) in processed_links
            record['is_labeled'] = labeled
            collected.append(record)
    except Exception:
        pass  # best-effort: return whatever rows parsed cleanly
    return collected
457
+
458
@app.post("/extension/ingest_user_history")
async def ingest_user_history(request: Request):
    """Append newly scraped posts for a user to data/profiles/<user>/history.csv.

    Deduplicates by link against both the existing file and the current payload.
    Returns the number of new posts written.
    """
    try:
        data = await request.json()
        username = data.get("username")
        posts = data.get("posts", [])
        if not username or not posts:
            raise HTTPException(status_code=400)
        profile_dir = Path(f"data/profiles/{username}")
        profile_dir.mkdir(parents=True, exist_ok=True)
        csv_path = profile_dir / "history.csv"
        file_exists = csv_path.exists()
        existing = set()
        if file_exists:
            for row in common_utils.robust_read_csv(csv_path):
                existing.add(row.get('link'))

        with open(csv_path, 'a', newline='', encoding='utf-8') as f:
            fieldnames = ["link", "timestamp", "text", "is_reply", "metric_replies", "metric_reposts", "metric_likes", "metric_views", "ingested_at"]
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if not file_exists:
                writer.writeheader()
            ts = datetime.datetime.now().isoformat()
            count = 0
            for p in posts:
                # .get() instead of p['link']: a malformed post must not 500
                # the whole batch; posts without a link are skipped.
                p_link = p.get('link')
                if not p_link or p_link in existing:
                    continue
                m = p.get('metrics', {})
                writer.writerow({
                    "link": p_link, "timestamp": p.get('timestamp'),
                    "text": p.get('text', '').replace('\n', ' '), "is_reply": p.get('is_reply', False),
                    "metric_replies": m.get('replies', 0), "metric_reposts": m.get('reposts', 0),
                    "metric_likes": m.get('likes', 0), "metric_views": m.get('views', 0),
                    "ingested_at": ts
                })
                # Track within this payload too, so duplicate links in one
                # request are only written once.
                existing.add(p_link)
                count += 1
        return {"status": "success", "new_posts": count}
    except HTTPException:
        # Preserve the intended 400: the blanket handler below used to
        # re-wrap it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
492
+
493
@app.post("/extension/save_comments")
async def extension_save_comments(request: Request):
    """Persist scraped comments for a post to data/comments/<tweet_id>.csv.

    Overwrites any previous comment file for the same tweet id.
    """
    try:
        data = await request.json()
        link = data.get("link")
        comments = data.get("comments", [])
        if not link:
            raise HTTPException(status_code=400)
        tweet_id = common_utils.extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:10]
        csv_path = Path(f"data/comments/{tweet_id}.csv")
        # Ensure the target directory exists instead of failing with a 500
        # on a fresh deployment.
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=["author", "text", "link", "timestamp"])
            writer.writeheader()
            ts = datetime.datetime.now().isoformat()
            for c in comments:
                writer.writerow({
                    "author": c.get("author", "Unknown"),
                    "text": c.get("text", "").replace("\n", " "),
                    "link": c.get("link", ""),
                    "timestamp": ts
                })
        return {"status": "success", "count": len(comments)}
    except HTTPException:
        # Preserve the intended 400 instead of re-wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
515
+
516
@app.post("/extension/save_manual")
@app.post("/manual/save")
async def save_manual_label(request: Request):
    """Persist a human-authored label for a post.

    Writes three artifacts for the same tweet id:
      1. data/labels/<id>.json      - deep JSON mirroring the AI pipeline schema
      2. data/mnl_labeled/<id>.json - the flat row as submitted
      3. data/manual_dataset.csv    - upserted ground-truth row
    Finally marks the link "Processed" in the batch queue.

    Accepts scores either at the top level of the payload or nested under
    "labels" (the browser extension and the web UI send different shapes).
    """
    try:
        data = await request.json()
        link = data.get("link")
        if not link:
            return JSONResponse({"status": "error", "message": "Link required"}, status_code=400)

        # Stable id: real tweet id when extractable, else a short link hash.
        tweet_id = common_utils.extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:10]
        # Fall back to the whole payload when scores are not nested.
        labels = data.get("labels", data)

        # Flat ground-truth row (CSV schema). Engagement stats are zeroed:
        # manual labeling does not capture them.
        row = {
            "id": tweet_id, "link": link, "timestamp": datetime.datetime.now().isoformat(),
            "caption": data.get("caption", ""),
            "visual_integrity_score": labels.get("visual_integrity_score", 0),
            "audio_integrity_score": labels.get("audio_integrity_score", 0),
            "source_credibility_score": labels.get("source_credibility_score", 0),
            "logical_consistency_score": labels.get("logical_consistency_score", 0),
            "emotional_manipulation_score": labels.get("emotional_manipulation_score", 5),
            "video_audio_score": labels.get("video_audio_score", 0),
            "video_caption_score": labels.get("video_caption_score", 0),
            "audio_caption_score": labels.get("audio_caption_score", 0),
            "final_veracity_score": labels.get("final_veracity_score", 0),
            # Either key may carry the free-text reasoning.
            "final_reasoning": labels.get("reasoning", labels.get("final_reasoning", "")),
            "stats_likes": 0, "stats_shares": 0, "stats_comments": 0, "stats_platform": "twitter",
            "tags": data.get("tags", labels.get("tags", "")),
            "classification": labels.get("classification", "None"),
            "source": "Manual"
        }

        # Split the comma-separated tag string into a clean list.
        tag_str = str(row["tags"])
        tag_list = [t.strip() for t in tag_str.split(',') if t.strip()]

        # Deep JSON mirroring the AI-labeled structure so downstream tooling
        # can treat manual and AI labels uniformly. Scores are stringified to
        # match the AI pipeline's output format.
        deep_json = {
            "veracity_vectors": {
                "visual_integrity_score": str(row["visual_integrity_score"]),
                "audio_integrity_score": str(row["audio_integrity_score"]),
                "source_credibility_score": str(row["source_credibility_score"]),
                "logical_consistency_score": str(row["logical_consistency_score"]),
                "emotional_manipulation_score": str(row["emotional_manipulation_score"])
            },
            "modalities": {
                "video_audio_score": str(row["video_audio_score"]),
                "video_caption_score": str(row["video_caption_score"]),
                "audio_caption_score": str(row["audio_caption_score"])
            },
            "video_context_summary": row["caption"],
            "tags": tag_list,
            # Placeholder factor fields: manual labels have no automated checks.
            "factuality_factors": {
                "claim_accuracy": "Manual",
                "evidence_gap": "Manual Verification",
                "grounding_check": "Manual Verification"
            },
            "disinformation_analysis": {
                "classification": row["classification"],
                "intent": "Manual Labeling",
                "threat_vector": "Manual Labeling"
            },
            "final_assessment": {
                "veracity_score_total": str(row["final_veracity_score"]),
                "reasoning": row["final_reasoning"]
            },
            "raw_parsed_structure": {
                "summary": {"text": row["caption"]},
                "tags": {"keywords": row["tags"]},
                "final": {"score": str(row["final_veracity_score"]), "reasoning": row["final_reasoning"]}
            },
            "meta_info": {
                "id": tweet_id,
                "timestamp": row["timestamp"],
                "link": link,
                "model_selection": "Manual"
            }
        }

        # Artifact 1: deep JSON alongside AI-produced labels.
        json_path_direct = Path(f"data/labels/{tweet_id}.json")
        with open(json_path_direct, 'w', encoding='utf-8') as jf:
            json.dump(deep_json, jf, indent=2, ensure_ascii=False)

        # Artifact 2: flat row snapshot of the manual submission.
        with open(Path(f"data/mnl_labeled/{tweet_id}.json"), 'w', encoding='utf-8') as jf:
            json.dump(row, jf, indent=2, ensure_ascii=False)

        # Artifact 3: upsert into the ground-truth CSV (replace-by-id).
        manual_path = Path("data/manual_dataset.csv")
        exists = manual_path.exists()
        ensure_csv_schema(manual_path, GROUND_TRUTH_FIELDS)

        rows = []
        found = False
        if exists:
            for r in common_utils.robust_read_csv(manual_path):
                if str(r.get('id')) == str(tweet_id):
                    # Replace the existing row, restricted to the schema fields.
                    clean_row = {k: row.get(k, "") for k in GROUND_TRUTH_FIELDS}
                    rows.append(clean_row)
                    found = True
                else:
                    rows.append(r)

        if not found:
            clean_row = {k: row.get(k, "") for k in GROUND_TRUTH_FIELDS}
            rows.append(clean_row)

        # Full rewrite keeps the file consistent even after a replacement.
        with open(manual_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=GROUND_TRUTH_FIELDS, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(rows)

        # Labeled items no longer need processing in the batch queue.
        update_queue_status(link, "Processed")
        return {"status": "success", "id": tweet_id}
    except Exception as e:
        logger.error(f"Save Manual Error: {e}")
        return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
628
 
629
@app.get("/community/list_datasets")
async def list_community_datasets():
    """List saved comment CSV datasets with their comment counts."""
    path = Path("data/comments")
    files = []
    if path.exists():
        for f in path.glob("*.csv"):
            try:
                # Use a context manager: the original `open(f)` inside sum()
                # leaked the file handle. Clamp at 0 so an empty file does not
                # report -1 (header subtraction).
                with open(f, encoding='utf-8') as fh:
                    count = max(0, sum(1 for _ in fh) - 1)
            except OSError:
                count = 0  # unreadable file: report empty rather than 500
            files.append({"id": f.stem, "count": count})
    return files
637
+
638
@app.post("/community/analyze")
async def analyze_community(dataset_id: str = Body(..., embed=True)):
    """Heuristic community-trust score from comment keyword counts.

    Skeptical keywords subtract 5 points each, trusting keywords add 2,
    starting from a neutral 50, clamped to [0, 100].
    """
    path = Path(f"data/comments/{dataset_id}.csv")
    if not path.exists(): raise HTTPException(status_code=404)
    comments = list(common_utils.robust_read_csv(path))
    if not comments: return {"score": 0, "verdict": "No Data"}
    s_keys = ["fake", "lie", "staged", "bs", "propaganda", "ai", "deepfake"]
    t_keys = ["true", "real", "confirmed", "fact", "source", "proof"]
    # .get with fallback: a malformed row missing 'text' must not 500 the
    # endpoint (the original indexed c['text'] directly).
    s_count = sum(1 for c in comments if any(k in (c.get('text') or '').lower() for k in s_keys))
    t_count = sum(1 for c in comments if any(k in (c.get('text') or '').lower() for k in t_keys))
    score = max(0, min(100, 50 + (t_count * 2) - (s_count * 5)))
    verdict = "Community Skepticism" if score < 30 else "Community Verification" if score > 70 else "Neutral/Mixed"
    return {"dataset_id": dataset_id, "trust_score": score, "verdict": verdict, "details": {"skeptical_comments": s_count, "trusting_comments": t_count}}
651
+
652
@app.get("/dataset/list")
async def get_dataset_list():
    """Merged dataset view: manual ground truth shadows AI rows with the same id."""
    combined = []
    seen_manual_ids = set()

    # Manual rows take precedence and define which ids are "taken".
    manual_path = Path("data/manual_dataset.csv")
    if manual_path.exists():
        for entry in common_utils.robust_read_csv(manual_path):
            entry['source'] = 'Manual'
            entry_id = entry.get('id')
            if entry_id:
                seen_manual_ids.add(str(entry_id))
            combined.append(entry)

    # AI rows fill in only where no manual label exists.
    ai_path = Path("data/dataset.csv")
    if ai_path.exists():
        for entry in common_utils.robust_read_csv(ai_path):
            if str(entry.get('id', '')) not in seen_manual_ids:
                entry['source'] = 'AI'
                combined.append(entry)

    combined.sort(key=lambda e: e.get('timestamp', ''), reverse=True)
    return combined
671
+
672
@app.get("/analytics/account_integrity")
async def get_account_integrity():
    """Average veracity score per known account, from all labeled posts.

    Maps tweet ids to usernames via the scraped profile histories, then
    aggregates scores from both the AI and manual datasets.
    """
    id_map = {}
    prof_dir = Path("data/profiles")
    if prof_dir.exists():
        for d in prof_dir.iterdir():
            # Skip stray files (e.g. .DS_Store): only profile directories
            # count, matching /profiles/list behaviour.
            if not d.is_dir():
                continue
            for row in common_utils.robust_read_csv(d / "history.csv"):
                tid = common_utils.extract_tweet_id(row.get('link', ''))
                if tid: id_map[tid] = d.name

    scores_map = {}
    for fname in ["data/dataset.csv", "data/manual_dataset.csv"]:
        for row in common_utils.robust_read_csv(Path(fname)):
            tid = row.get('id')
            sc = row.get('final_veracity_score', '0')
            # Strip non-numeric noise (e.g. "85/100") before parsing.
            try:
                val = float(re.sub(r'[^\d.]', '', str(sc)))
            except ValueError:  # only the float() parse can legitimately fail
                val = 0

            auth = id_map.get(tid, "Unknown")
            if auth != "Unknown":
                scores_map.setdefault(auth, []).append(val)

    return sorted(
        [{"username": k, "avg_veracity": round(sum(v) / len(v), 1), "posts_labeled": len(v)} for k, v in scores_map.items()],
        key=lambda x: x['avg_veracity'], reverse=True
    )
696
 
697
@app.post("/queue/add")
async def add_queue_item(link: str = Body(..., embed=True)):
    """Add a single link to the batch queue, deduplicating by normalized URL."""
    q_path = Path("data/batch_queue.csv")
    known = set()
    if q_path.exists():
        for entry in common_utils.robust_read_csv(q_path):
            known.add(common_utils.normalize_link(entry.get('link')))

    normalized = common_utils.normalize_link(link)
    if not normalized:
        raise HTTPException(status_code=400, detail="Invalid link")
    if normalized in known:
        return {"status": "ignored", "message": "Link already in queue"}

    with open(q_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
        # A brand-new (or empty) queue file still needs its header row.
        if not q_path.exists() or q_path.stat().st_size == 0:
            writer.writeheader()
        writer.writerow({
            "link": link.strip(),
            "ingest_timestamp": datetime.datetime.now().isoformat(),
            "status": "Pending",
            "task_type": "Ingest",
        })

    return {"status": "success", "link": link}
713
 
714
@app.post("/queue/upload_csv")
async def upload_csv(file: UploadFile = File(...)):
    """Bulk-import links from an uploaded CSV/text file into the batch queue.

    Any line containing 'http' contributes its first comma-separated field.
    Deduplicates by normalized URL against both the existing queue and the
    uploaded file itself.
    """
    contents = await file.read()
    lines = contents.decode('utf-8').splitlines()
    q_path = Path("data/batch_queue.csv")
    existing = set()
    if q_path.exists():
        for r in common_utils.robust_read_csv(q_path):
            existing.add(common_utils.normalize_link(r.get('link')))

    added = 0
    with open(q_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
        if not q_path.exists() or q_path.stat().st_size == 0:
            writer.writeheader()
        for line in lines:
            if 'http' in line:
                raw = line.split(',')[0].strip()
                norm = common_utils.normalize_link(raw)
                # Reject un-normalizable links (consistent with /queue/add).
                if norm and norm not in existing:
                    writer.writerow({"link": raw, "ingest_timestamp": datetime.datetime.now().isoformat(), "status": "Pending", "task_type": "Ingest"})
                    # Track within this upload too: the original never updated
                    # the set, so a file listing the same link twice enqueued
                    # duplicates.
                    existing.add(norm)
                    added += 1
    return {"status": "success", "added_count": added}
734
+
735
@app.post("/queue/stop")
async def stop_processing():
    # Cooperative cancellation: the queue runner checks this flag between
    # items and exits its streaming loop on the next iteration. This does not
    # interrupt the item currently being processed.
    global STOP_QUEUE_SIGNAL
    STOP_QUEUE_SIGNAL = True
    return {"status": "success", "message": "Stopping queue processing..."}
740
+
741
@app.post("/queue/clear_processed")
async def clear_processed_queue():
    """Drop queue rows that are already processed; keep everything else."""
    q_path = Path("data/batch_queue.csv")
    if not q_path.exists():
        return {"status": "success", "removed_count": 0}

    p_ids, p_links = get_processed_indices()

    def _is_done(entry):
        # A row is finished when explicitly marked, or (for non-Verify tasks)
        # when its link already appears in the processed indices.
        if entry.get("status", "Pending") == "Processed":
            return True
        return entry.get("task_type", "Ingest") != "Verify" and \
            check_if_processed(entry.get("link"), p_ids, p_links)

    survivors = []
    removed_count = 0
    for entry in common_utils.robust_read_csv(q_path):
        if _is_done(entry):
            removed_count += 1
        else:
            survivors.append(entry)

    # Rewrite the queue with only the surviving rows.
    with open(q_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(survivors)
    return {"status": "success", "removed_count": removed_count}
764
 
 
766
async def delete_queue_items(request: Request):
    """Remove the given links (matched after normalization) from the batch queue."""
    try:
        payload = await request.json()
        doomed = {common_utils.normalize_link(l) for l in payload.get("links", [])}
        q_path = Path("data/batch_queue.csv")
        if not q_path.exists():
            return {"status": "success", "count": 0}

        survivors = []
        deleted_count = 0
        for entry in common_utils.robust_read_csv(q_path):
            if common_utils.normalize_link(entry.get('link')) in doomed:
                deleted_count += 1
            else:
                survivors.append(entry)

        # Rewrite the queue file with only the surviving rows.
        with open(q_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(survivors)
        return {"status": "success", "count": deleted_count}
    except Exception as e: return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
 
785
async def requeue_items(request: Request):
    """Reset the given queue links (matched after normalization) to 'Pending'."""
    try:
        payload = await request.json()
        targets = {common_utils.normalize_link(l) for l in payload.get("links", [])}
        q_path = Path("data/batch_queue.csv")
        if not q_path.exists():
            return {"status": "success", "count": 0}

        updated = []
        requeued_count = 0
        for entry in common_utils.robust_read_csv(q_path):
            if common_utils.normalize_link(entry.get('link')) in targets:
                entry['status'] = 'Pending'
                requeued_count += 1
            updated.append(entry)

        # Persist the full queue, including untouched rows.
        with open(q_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=QUEUE_COLUMNS, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(updated)
        return {"status": "success", "count": requeued_count}
    except Exception as e: return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
804
 
805
@app.post("/dataset/delete")
async def delete_dataset_items(request: Request):
    """Delete rows from the AI dataset (data/dataset.csv) by id."""
    try:
        data = await request.json()
        target_ids = data.get("ids", [])
        if not target_ids:
            # Return the 400 directly: raising HTTPException here would be
            # caught by the blanket except below and misreported as a 500.
            return JSONResponse({"status": "error", "message": "No IDs provided"}, status_code=400)
        # Compare as strings: JSON ids may be ints while the CSV stores text.
        target_ids = set(str(t) for t in target_ids)

        path = Path("data/dataset.csv")
        if not path.exists():
            # Include both keys: the success path reports "deleted_count" but
            # this branch previously only reported "count".
            return {"status": "success", "deleted_count": 0, "count": 0}

        rows = []
        deleted_count = 0
        for row in common_utils.robust_read_csv(path):
            if str(row.get('id')) in target_ids:
                deleted_count += 1
            else:
                rows.append(row)

        # Rewrite the dataset with only the surviving rows.
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=DATASET_COLUMNS, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(rows)

        return {"status": "success", "deleted_count": deleted_count}
    except Exception as e:
        return JSONResponse({"status": "error", "message": str(e)}, status_code=500)
832
+
833
@app.post("/analyze/user_context")
async def analyze_user_context(request: Request):
    """Generate a behavioural profile report for the requested username."""
    try:
        payload = await request.json()
        report = await user_analysis_logic.generate_user_profile_report(payload.get("username"))
        return {"status": "success", "report": report}
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
840
+
841
@app.get("/download-dataset")
async def download_dataset():
    """Stream the AI dataset CSV as a file download, or 404 if absent."""
    dataset_file = Path("data/dataset.csv")
    if not dataset_file.exists():
        return Response("Dataset not found.", status_code=404)
    return FileResponse(path=dataset_file, filename="dataset.csv", media_type='text/csv')
847
+
848
@app.get("/model-architecture", response_class=PlainTextResponse)
async def get_model_architecture():
    """Dump the loaded model's architecture, or explain why none is available."""
    if LITE_MODE:
        return "Running in LITE mode."
    model = inference_logic.base_model
    if model:
        return str(model)
    return "Model not loaded."
853
+
854
@app.get("/", response_class=HTMLResponse)
async def read_root(request: Request):
    # Serve the single-page frontend shell; all data is loaded client-side
    # through the JSON API endpoints.
    return templates.TemplateResponse("index.html", {"request": request})
857
+
858
@app.get("/queue/list")
async def get_queue_list():
    """List queue rows with resolved status and any pre-fetched comments."""
    q_path = Path("data/batch_queue.csv")
    items = []
    p_ids, p_links = get_processed_indices()
    for row in common_utils.robust_read_csv(q_path):
        if not row:
            continue
        l = row.get("link")
        if not l:
            # Guard: a malformed row without a link would crash l.encode()
            # (and check_if_processed) below.
            continue
        status = row.get("status", "Pending")
        task_type = row.get("task_type") or "Ingest"

        # Pending non-Verify items already present in the processed indices
        # are shown as Processed without mutating the file.
        if status == "Pending" and task_type != "Verify" and check_if_processed(l, p_ids, p_links):
            status = "Processed"

        # Fetch associated comments to display in the dropdown
        comments = []
        tid = common_utils.extract_tweet_id(l) or hashlib.md5(l.encode()).hexdigest()[:10]
        c_path = Path(f"data/comments/{tid}_ingest.json")
        if c_path.exists():
            try:
                # Explicit encoding: platform-default text mode can garble
                # non-ASCII comments on some systems.
                with open(c_path, 'r', encoding='utf-8') as f:
                    c_data = json.load(f)
                comments = c_data.get('comments', [])
            except Exception:
                pass  # missing/corrupt comment cache is non-fatal

        items.append({
            "link": l,
            "timestamp": row.get("ingest_timestamp", ""),
            "status": status,
            "task_type": task_type,
            "comments": comments
        })
    return items
891
 
892
  @app.post("/queue/run")
893
  async def run_queue_processing(
 
895
  gemini_api_key: str = Form(""), gemini_model_name: str = Form(""),
896
  vertex_project_id: str = Form(""), vertex_location: str = Form(""), vertex_model_name: str = Form(""), vertex_api_key: str = Form(""),
897
  nrp_api_key: str = Form(""), nrp_model_name: str = Form(""), nrp_base_url: str = Form("https://ellm.nrp-nautilus.io/v1"),
898
+ include_comments: bool = Form(False), reasoning_method: str = Form("cot"), prompt_template: str = Form("standard"),
899
+ custom_query: str = Form(""), max_reprompts: int = Form(1)
 
 
 
900
  ):
901
  global STOP_QUEUE_SIGNAL
902
  STOP_QUEUE_SIGNAL = False
903
+
904
  gemini_config = {"api_key": gemini_api_key, "model_name": gemini_model_name, "max_retries": max_reprompts}
905
+ vertex_config = {"project_id": vertex_project_id, "location": vertex_location, "model_name": vertex_model_name, "api_key": vertex_api_key, "max_retries": max_reprompts, "use_search": True}
906
  nrp_config = {"api_key": nrp_api_key, "model_name": nrp_model_name, "base_url": nrp_base_url, "max_retries": max_reprompts}
907
+
908
  sel_p = PROMPT_VARIANTS.get(prompt_template, PROMPT_VARIANTS['standard'])
909
  system_persona_txt = sel_p['instruction']
910
+ if custom_query.strip(): system_persona_txt += f"\n\nSPECIAL INSTRUCTION FOR THIS BATCH: {custom_query}"
911
 
912
+ if model_selection == 'vertex':
913
+ active_config = vertex_config
914
+ active_model_name = vertex_model_name
915
+ elif model_selection == 'nrp':
916
+ active_config = nrp_config
917
+ active_model_name = nrp_model_name
918
+ else:
919
+ active_config = gemini_config
920
+ active_model_name = gemini_model_name
921
+
922
+ config_params_dict = {
923
+ "reprompts": max_reprompts,
924
+ "include_comments": include_comments,
925
+ "agent_active": False
926
+ }
927
+ config_params_str = json.dumps(config_params_dict)
928
+
929
  async def queue_stream():
930
+ q_path = Path("data/batch_queue.csv")
931
+ items =[r for r in common_utils.robust_read_csv(q_path) if r.get("link") and r.get("status", "Pending") == "Pending"]
932
+ p_ids, p_links = get_processed_indices()
933
+ yield f"data:[SYSTEM] Persona: {sel_p['description']}\n\n"
934
 
935
+ for item in items:
936
+ link = item.get("link")
937
+ task_type = item.get("task_type") or "Ingest"
 
 
 
 
 
 
 
938
 
939
+ if STOP_QUEUE_SIGNAL:
940
+ yield f"data:[SYSTEM] Stopping by user request.\n\n"
 
 
 
 
 
 
 
941
  break
942
 
943
+ if task_type != "Verify" and check_if_processed(link, p_ids, p_links):
944
+ update_queue_status(link, "Processed", task_type)
945
+ continue
946
+
947
+ gt_data = None
948
+ if task_type == "Verify":
949
+ manual_path = Path("data/manual_dataset.csv")
950
+ if manual_path.exists():
951
+ for row in common_utils.robust_read_csv(manual_path):
952
+ if common_utils.normalize_link(row.get('link', '')) == common_utils.normalize_link(link):
953
+ gt_data = row
954
+ break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
 
956
+ yield f"data: [START] {link} (Type: {task_type})\n\n"
957
+ tid = common_utils.extract_tweet_id(link) or hashlib.md5(link.encode()).hexdigest()[:10]
958
+ assets = await common_utils.prepare_video_assets(link, tid)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
 
960
+ if not assets or (not assets.get('video') and not assets.get('caption')):
961
+ log_queue_error(link, "Download/Fetch Error", task_type)
962
+ yield f"data: - Download Error.\n\n"
963
+ continue
964
 
965
+ trans = common_utils.parse_vtt(assets['transcript']) if assets.get('transcript') else "No transcript (Audio/Video missing)."
966
+ video_file = assets.get('video')
967
+ if not video_file:
968
+ yield f"data: - No video found. Text-only analysis.\n\n"
969
+ video_file = None
970
+ else: yield f"data: - Video found. Inferencing...\n\n"
 
 
 
 
 
 
 
 
 
971
 
972
+ comments_path = Path(f"data/comments/{tid}_ingest.json")
973
+ current_system_persona = system_persona_txt
974
+ if comments_path.exists():
975
+ try:
976
+ with open(comments_path, 'r') as f:
977
+ c_data = json.load(f)
978
+ comments = c_data.get('comments',[])
979
+ if comments:
980
+ yield f"data: - Found {len(comments)} comments. Generating Community Context...\n\n"
981
+ community_summary = await inference_logic.generate_community_summary(comments, model_selection, active_config)
982
+ current_system_persona += f"\n\n### COMMUNITY NOTES / CONTEXT (from Comments):\n{community_summary}\n\nUse this community context to cross-reference claims but remain objective."
983
+ yield f"data: - Context Generated.\n\n"
984
+ except Exception as e:
985
+ logger.error(f"Error processing comments for context: {e}")
 
 
986
 
987
+ res_data = None
988
+ if model_selection == 'gemini':
989
+ async for chunk in inference_logic.run_gemini_labeling_pipeline(video_file, assets['caption'], trans, gemini_config, include_comments, reasoning_method, current_system_persona, request_id=tid):
990
+ if isinstance(chunk, str): yield f"data: - {chunk}\n\n"
991
+ else: res_data = chunk
992
+ elif model_selection == 'vertex':
993
+ async for chunk in inference_logic.run_vertex_labeling_pipeline(video_file, assets['caption'], trans, vertex_config, include_comments, reasoning_method, current_system_persona, request_id=tid):
994
+ if isinstance(chunk, str): yield f"data: - {chunk}\n\n"
995
+ else: res_data = chunk
996
+ elif model_selection == 'nrp':
997
+ async for chunk in inference_logic.run_nrp_labeling_pipeline(video_file, assets['caption'], trans, nrp_config, include_comments, reasoning_method, current_system_persona, request_id=tid):
998
+ if isinstance(chunk, str): yield f"data: - {chunk}\n\n"
999
+ else: res_data = chunk
1000
+
1001
+ if res_data and "parsed_data" in res_data:
1002
+ parsed = res_data["parsed_data"]
1003
+ d_path = Path("data/dataset.csv")
1004
+ ensure_csv_schema(d_path, DATASET_COLUMNS)
1005
+ exists = d_path.exists()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1006
 
1007
+ ai_score_val = parsed['final_assessment'].get('veracity_score_total', 0)
1008
+ try: ai_score = float(ai_score_val)
1009
+ except: ai_score = 0
 
 
1010
 
1011
+ if task_type == "Verify" and gt_data is not None:
1012
+ gt_final = float(gt_data.get('final_veracity_score', 0))
1013
+ delta = abs(ai_score - gt_final)
1014
+ vec_ai = parsed.get('veracity_vectors', {})
1015
+ mod_ai = parsed.get('modalities', {})
1016
+
1017
+ def s_float(v):
1018
+ try: return float(v)
1019
+ except: return 0.0
1020
+
1021
+ yield f"data: -[VERIFICATION PIPELINE] Configuration Analysis:\n"
1022
+ yield f"data: Model: {active_model_name} | Provider: {model_selection}\n"
1023
+ yield f"data: Reasoning: {reasoning_method} | Prompt: {prompt_template} | Reprompts: {max_reprompts}\n"
1024
+ yield f"data: -[VERIFICATION SCORES COMPARISON (AI vs Ground Truth)]\n"
1025
+ yield f"data: Visual Integrity : AI {s_float(vec_ai.get('visual_integrity_score'))} | GT {s_float(gt_data.get('visual_integrity_score'))}\n"
1026
+ yield f"data: Audio Integrity : AI {s_float(vec_ai.get('audio_integrity_score'))} | GT {s_float(gt_data.get('audio_integrity_score'))}\n"
1027
+ yield f"data: Source Credibility : AI {s_float(vec_ai.get('source_credibility_score'))} | GT {s_float(gt_data.get('source_credibility_score'))}\n"
1028
+ yield f"data: Logical Consistency: AI {s_float(vec_ai.get('logical_consistency_score'))} | GT {s_float(gt_data.get('logical_consistency_score'))}\n"
1029
+ yield f"data: Emotional Manipul. : AI {s_float(vec_ai.get('emotional_manipulation_score'))} | GT {s_float(gt_data.get('emotional_manipulation_score'))}\n"
1030
+ yield f"data: Video-Audio Align : AI {s_float(mod_ai.get('video_audio_score'))} | GT {s_float(gt_data.get('video_audio_score'))}\n"
1031
+ yield f"data: Video-Caption Align: AI {s_float(mod_ai.get('video_caption_score'))} | GT {s_float(gt_data.get('video_caption_score'))}\n"
1032
+ yield f"data: Audio-Caption Align: AI {s_float(mod_ai.get('audio_caption_score'))} | GT {s_float(gt_