Ram-090's picture
Restructure project with FastAPI + React frontend and Railway deployment
1bbe15b
"""
VDHF Results Logger - Stores verification results in categorized output files.
Output files:
- output/passed_results.txt - Claims that passed verification
- output/failed_results.txt - Claims that failed (hallucinations)
- output/refined_prompts.txt - Prompts that were refined/regenerated
- output/combined_report.txt - Complete combined report
"""
import os
from datetime import datetime
# Project root: the directory one level above the folder holding this module.
PROJECT_ROOT = os.path.dirname(
    os.path.dirname(os.path.abspath(__file__))
)

# All report files are written into <project root>/output; the directory is
# created at import time if it does not exist yet.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Absolute paths of the individual report files.
PASSED_FILE, FAILED_FILE, REFINED_FILE, COMBINED_FILE = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in (
        "passed_results.txt",
        "failed_results.txt",
        "refined_prompts.txt",
        "combined_report.txt",
    )
)
class ResultsLogger:
    """Accumulate verification results in memory and write them to text files.

    Entries are added with ``log_passed``, ``log_failed`` and ``log_refined``;
    ``save_all`` then writes one report per category plus a combined report.
    Every report is opened in ``'w'`` mode, so saving overwrites whatever the
    file contained before.

    Attributes:
        passed_results: List of dicts with keys ``query``, ``claim``,
            ``score`` and ``evidence``.
        failed_results: List of dicts with keys ``query``, ``claim``,
            ``score`` and ``reason``.
        refined_prompts: List of dicts with keys ``query``,
            ``original_response``, ``refined_response``, ``original_ratio``,
            ``new_ratio`` and ``removed_claims``.
        timestamp: Local time at which the logger was created, formatted as
            ``YYYY-MM-DD HH:MM:SS``; written on the "Generated:" line of
            every report.
    """

    # Horizontal rules used by every report.
    _HEAVY_RULE = "=" * 70
    _LIGHT_RULE = "-" * 70

    def __init__(self):
        self.passed_results = []
        self.failed_results = []
        self.refined_prompts = []
        self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------

    def log_passed(self, query, claim, score, evidence):
        """Record a claim that passed verification.

        Args:
            query: The query the claim belongs to.
            claim: The text of the supported claim.
            score: Numeric verification score (formatted with ``.3f``).
            evidence: Supporting evidence; reports show its first 100
                characters.
        """
        self.passed_results.append({
            'query': query,
            'claim': claim,
            'score': score,
            'evidence': evidence,
        })

    def log_failed(self, query, claim, score, reason="Insufficient evidence"):
        """Record a claim that failed verification (a hallucination).

        Args:
            query: The query the claim belongs to.
            claim: The text of the unsupported claim.
            score: Numeric verification score (formatted with ``.3f``).
            reason: Why the claim was rejected.
        """
        self.failed_results.append({
            'query': query,
            'claim': claim,
            'score': score,
            'reason': reason,
        })

    def log_refined(self, query, original_response, refined_response,
                    original_ratio, new_ratio, removed_claims):
        """Record a response that was refined/regenerated.

        Args:
            query: The query that was answered.
            original_response: Response text before refinement; reports show
                its first 200 characters.
            refined_response: Response text after refinement.
            original_ratio: Support ratio before refinement (formatted as a
                percentage).
            new_ratio: Support ratio after refinement (formatted as a
                percentage).
            removed_claims: Iterable of claims dropped during refinement;
                ``None`` is treated as "nothing removed".
        """
        # Snapshot into a list: the reports both iterate the claims and take
        # their len(), which would fail for None or a one-shot iterable.
        claims = [] if removed_claims is None else list(removed_claims)
        self.refined_prompts.append({
            'query': query,
            'original_response': original_response,
            'refined_response': refined_response,
            'original_ratio': original_ratio,
            'new_ratio': new_ratio,
            'removed_claims': claims,
        })

    # ------------------------------------------------------------------
    # Saving
    # ------------------------------------------------------------------

    def save_all(self, output_dir=None):
        """Write all four report files.

        Args:
            output_dir: Directory to write into. When ``None`` (the default)
                the module-level ``OUTPUT_DIR`` and ``*_FILE`` paths are
                used. Otherwise the standard file names are created inside
                ``output_dir``.
        """
        if output_dir is None:
            output_dir = OUTPUT_DIR
            paths = (PASSED_FILE, FAILED_FILE, REFINED_FILE, COMBINED_FILE)
        else:
            paths = tuple(
                os.path.join(output_dir, file_name)
                for file_name in (
                    "passed_results.txt",
                    "failed_results.txt",
                    "refined_prompts.txt",
                    "combined_report.txt",
                )
            )
        # The directory may have been removed since import time (or never
        # existed, for a custom output_dir), so make sure it is there.
        os.makedirs(output_dir, exist_ok=True)

        passed_path, failed_path, refined_path, combined_path = paths
        self._save_passed(passed_path)
        self._save_failed(failed_path)
        self._save_refined(refined_path)
        self._save_combined(combined_path)
        print(f"\n[*] Results saved to {output_dir}/")

    @staticmethod
    def _truncate(text, limit):
        """Return ``text`` cut to ``limit`` characters.

        The "..." marker is appended only when something was actually cut
        off. ``None`` becomes an empty string and other non-string values
        are converted with ``str()``.
        """
        text = "" if text is None else str(text)
        if len(text) <= limit:
            return text
        return text[:limit] + "..."

    def _write_banner(self, f, title, *, stamped=True):
        """Write a title framed by heavy rules to the open file ``f``.

        With ``stamped=True`` a "Generated:" line carrying ``self.timestamp``
        is placed under the title (used for file headers); section banners
        inside the combined report pass ``stamped=False``.
        """
        lines = [self._HEAVY_RULE, title]
        if stamped:
            lines.append(f"Generated: {self.timestamp}")
        lines.append(self._HEAVY_RULE)
        f.write("\n".join(lines) + "\n\n")

    def _save_passed(self, path=None):
        """Write the passed-claims report to ``path`` (default ``PASSED_FILE``)."""
        path = PASSED_FILE if path is None else path
        with open(path, 'w', encoding='utf-8') as f:
            self._write_banner(f, "PASSED VERIFICATION RESULTS")
            if not self.passed_results:
                f.write("No passed results recorded.\n")
                return
            f.write(f"Total Passed Claims: {len(self.passed_results)}\n")
            f.write(self._LIGHT_RULE + "\n\n")
            for i, result in enumerate(self.passed_results, 1):
                f.write(f"[{i}] PASSED CLAIM\n")
                f.write(f" Query: {result['query']}\n")
                f.write(f" Claim: {result['claim']}\n")
                f.write(f" Score: {result['score']:.3f}\n")
                f.write(f" Evidence: {self._truncate(result['evidence'], 100)}\n")
                f.write("\n")

    def _save_failed(self, path=None):
        """Write the failed-claims report to ``path`` (default ``FAILED_FILE``)."""
        path = FAILED_FILE if path is None else path
        with open(path, 'w', encoding='utf-8') as f:
            self._write_banner(f, "FAILED VERIFICATION RESULTS (HALLUCINATIONS)")
            if not self.failed_results:
                f.write("No failed results recorded.\n")
                return
            f.write(f"Total Failed Claims: {len(self.failed_results)}\n")
            f.write(self._LIGHT_RULE + "\n\n")
            for i, result in enumerate(self.failed_results, 1):
                f.write(f"[{i}] FAILED CLAIM (HALLUCINATION)\n")
                f.write(f" Query: {result['query']}\n")
                f.write(f" Claim: {result['claim']}\n")
                f.write(f" Score: {result['score']:.3f}\n")
                f.write(f" Reason: {result['reason']}\n")
                f.write("\n")

    def _save_refined(self, path=None):
        """Write the refined-prompts report to ``path`` (default ``REFINED_FILE``)."""
        path = REFINED_FILE if path is None else path
        with open(path, 'w', encoding='utf-8') as f:
            self._write_banner(f, "REFINED PROMPTS (REGENERATED RESPONSES)")
            if not self.refined_prompts:
                f.write("No refined prompts recorded.\n")
                return
            f.write(f"Total Refinements: {len(self.refined_prompts)}\n")
            f.write(self._LIGHT_RULE + "\n\n")
            for i, result in enumerate(self.refined_prompts, 1):
                f.write(f"[{i}] REFINED PROMPT\n")
                f.write(f" Query: {result['query']}\n")
                f.write(f" Original Support Ratio: {result['original_ratio']:.1%}\n")
                f.write(f" New Support Ratio: {result['new_ratio']:.1%}\n")
                f.write("\n ORIGINAL RESPONSE:\n")
                f.write(f" {self._truncate(result['original_response'], 200)}\n")
                f.write("\n REFINED RESPONSE:\n")
                f.write(f" {result['refined_response']}\n")
                f.write("\n REMOVED CLAIMS:\n")
                # `or ()` tolerates entries appended directly with None.
                for claim in result['removed_claims'] or ():
                    f.write(f" - {claim}\n")
                f.write("\n")

    def _save_combined(self, path=None):
        """Write the combined report to ``path`` (default ``COMBINED_FILE``).

        The report holds a summary (counts and, when at least one claim was
        logged, the overall pass rate) followed by condensed passed, failed
        and refined sections.
        """
        path = COMBINED_FILE if path is None else path
        with open(path, 'w', encoding='utf-8') as f:
            self._write_banner(f, "VDHF COMBINED VERIFICATION REPORT")

            # Summary
            passed_count = len(self.passed_results)
            failed_count = len(self.failed_results)
            f.write("SUMMARY\n")
            f.write(self._LIGHT_RULE + "\n")
            f.write(f" Passed Claims: {passed_count}\n")
            f.write(f" Failed Claims: {failed_count}\n")
            f.write(f" Refined Prompts: {len(self.refined_prompts)}\n")
            total = passed_count + failed_count
            if total > 0:  # skip the rate entirely rather than divide by zero
                pass_rate = passed_count / total * 100
                f.write(f" Overall Pass Rate: {pass_rate:.1f}%\n")
            f.write("\n")

            # Passed section
            self._write_banner(f, "SECTION 1: PASSED CLAIMS", stamped=False)
            if not self.passed_results:
                f.write("No passed claims.\n\n")
            for i, result in enumerate(self.passed_results, 1):
                f.write(f" [{i}] {self._truncate(result['claim'], 60)}\n")
                f.write(
                    f" Score: {result['score']:.3f} | "
                    f"Query: {self._truncate(result['query'], 40)}\n\n"
                )

            # Failed section
            self._write_banner(
                f, "SECTION 2: FAILED CLAIMS (HALLUCINATIONS)", stamped=False
            )
            if not self.failed_results:
                f.write("No failed claims.\n\n")
            for i, result in enumerate(self.failed_results, 1):
                f.write(f" [{i}] {self._truncate(result['claim'], 60)}\n")
                f.write(
                    f" Score: {result['score']:.3f} | "
                    f"Reason: {result['reason']}\n\n"
                )

            # Refined section
            self._write_banner(f, "SECTION 3: REFINED PROMPTS", stamped=False)
            if not self.refined_prompts:
                f.write("No refined prompts.\n\n")
            for i, result in enumerate(self.refined_prompts, 1):
                removed = result['removed_claims'] or ()
                f.write(f" [{i}] Query: {result['query']}\n")
                f.write(
                    f" Ratio: {result['original_ratio']:.1%} -> "
                    f"{result['new_ratio']:.1%}\n"
                )
                f.write(f" Removed {len(removed)} unsupported claims\n\n")