Spaces:
Sleeping
Sleeping
| """ | |
| VDHF Results Logger - Stores verification results in categorized output files. | |
| Output files: | |
| - output/passed_results.txt - Claims that passed verification | |
| - output/failed_results.txt - Claims that failed (hallucinations) | |
| - output/refined_prompts.txt - Prompts that were refined/regenerated | |
| - output/combined_report.txt - Complete combined report | |
| """ | |
| import os | |
| from datetime import datetime | |
# Reports are written to <project root>/output, where the project root is
# taken to be the directory one level above the one containing this module.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(_MODULE_DIR)
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "output")

# Ensure the output directory exists as soon as the module is imported.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One text file per result category, plus a combined report.
PASSED_FILE, FAILED_FILE, REFINED_FILE, COMBINED_FILE = (
    os.path.join(OUTPUT_DIR, report_name)
    for report_name in (
        "passed_results.txt",
        "failed_results.txt",
        "refined_prompts.txt",
        "combined_report.txt",
    )
)
class ResultsLogger:
    """Accumulate verification results in memory and write them as text reports.

    Results are collected through ``log_passed``, ``log_failed`` and
    ``log_refined``. Nothing is written to disk until ``save_all`` (or one of
    the ``_save_*`` methods) is called; every save overwrites its target file.

    The ``Generated:`` line of each report shows the time this logger was
    constructed, not the time the report was written.
    """

    _HEAVY_RULE = "=" * 70
    _LIGHT_RULE = "-" * 70

    def __init__(self):
        """Start with empty result lists and record the creation timestamp."""
        self.passed_results = []
        self.failed_results = []
        self.refined_prompts = []
        self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------
    def log_passed(self, query, claim, score, evidence):
        """Record a claim that passed verification.

        Args:
            query: The query the claim belongs to.
            claim: The claim text.
            score: Numeric score; rendered with three decimals in reports.
            evidence: Supporting evidence; reports show at most the first
                100 characters.
        """
        self.passed_results.append({
            'query': query,
            'claim': claim,
            'score': score,
            'evidence': evidence,
        })

    def log_failed(self, query, claim, score, reason="Insufficient evidence"):
        """Record a claim that failed verification.

        Args:
            query: The query the claim belongs to.
            claim: The claim text.
            score: Numeric score; rendered with three decimals in reports.
            reason: Why the claim failed.
        """
        self.failed_results.append({
            'query': query,
            'claim': claim,
            'score': score,
            'reason': reason,
        })

    def log_refined(self, query, original_response, refined_response,
                    original_ratio, new_ratio, removed_claims):
        """Record a response that was refined/regenerated.

        Args:
            query: The query that was answered.
            original_response: Response before refinement; reports show at
                most the first 200 characters.
            refined_response: Response after refinement (shown in full).
            original_ratio: Support ratio before refinement; rendered as a
                percentage.
            new_ratio: Support ratio after refinement; rendered as a
                percentage.
            removed_claims: Iterable of claims dropped by the refinement
                (``None`` is treated as empty when rendering).
        """
        self.refined_prompts.append({
            'query': query,
            'original_response': original_response,
            'refined_response': refined_response,
            'original_ratio': original_ratio,
            'new_ratio': new_ratio,
            'removed_claims': removed_claims,
        })

    # ------------------------------------------------------------------
    # Saving
    # ------------------------------------------------------------------
    def save_all(self):
        """Write the passed, failed, refined and combined reports."""
        self._save_passed()
        self._save_failed()
        self._save_refined()
        self._save_combined()
        print(f"\n[*] Results saved to {OUTPUT_DIR}/")

    def _save_passed(self):
        """Write the passed-claims report to ``PASSED_FILE``."""
        self._write_report(PASSED_FILE, self._render_passed())

    def _save_failed(self):
        """Write the failed-claims report to ``FAILED_FILE``."""
        self._write_report(FAILED_FILE, self._render_failed())

    def _save_refined(self):
        """Write the refined-prompts report to ``REFINED_FILE``."""
        self._write_report(REFINED_FILE, self._render_refined())

    def _save_combined(self):
        """Write the combined report to ``COMBINED_FILE``."""
        self._write_report(COMBINED_FILE, self._render_combined())

    @staticmethod
    def _write_report(path, text):
        """Write ``text`` to ``path`` as UTF-8, replacing any existing file.

        The report text is fully rendered before this is called, so a
        formatting error can no longer leave a half-written file behind.
        """
        directory = os.path.dirname(path)
        if directory:
            # The directory may have been removed since the module was imported.
            os.makedirs(directory, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)

    # ------------------------------------------------------------------
    # Rendering helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _preview(text, limit):
        """Return ``text`` shortened to at most ``limit`` characters.

        ``"..."`` is appended only when something was actually cut off, so a
        short value is shown verbatim. ``None`` becomes an empty string and
        non-string values are converted with ``str()`` first.
        """
        if text is None:
            return ""
        text = str(text)
        if len(text) <= limit:
            return text
        return text[:limit] + "..."

    def _header(self, title):
        """Return the banner lines that open every report file."""
        return [
            self._HEAVY_RULE + "\n",
            title + "\n",
            f"Generated: {self.timestamp}\n",
            self._HEAVY_RULE + "\n\n",
        ]

    def _section(self, title):
        """Return the banner lines that open a section of the combined report."""
        return [
            self._HEAVY_RULE + "\n",
            title + "\n",
            self._HEAVY_RULE + "\n\n",
        ]

    def _render_passed(self):
        """Return the full text of the passed-claims report."""
        parts = self._header("PASSED VERIFICATION RESULTS")
        if not self.passed_results:
            parts.append("No passed results recorded.\n")
            return "".join(parts)

        parts.append(f"Total Passed Claims: {len(self.passed_results)}\n")
        parts.append(self._LIGHT_RULE + "\n\n")
        for i, result in enumerate(self.passed_results, 1):
            parts.append(f"[{i}] PASSED CLAIM\n")
            parts.append(f" Query: {result['query']}\n")
            parts.append(f" Claim: {result['claim']}\n")
            parts.append(f" Score: {result['score']:.3f}\n")
            parts.append(f" Evidence: {self._preview(result['evidence'], 100)}\n")
            parts.append("\n")
        return "".join(parts)

    def _render_failed(self):
        """Return the full text of the failed-claims report."""
        parts = self._header("FAILED VERIFICATION RESULTS (HALLUCINATIONS)")
        if not self.failed_results:
            parts.append("No failed results recorded.\n")
            return "".join(parts)

        parts.append(f"Total Failed Claims: {len(self.failed_results)}\n")
        parts.append(self._LIGHT_RULE + "\n\n")
        for i, result in enumerate(self.failed_results, 1):
            parts.append(f"[{i}] FAILED CLAIM (HALLUCINATION)\n")
            parts.append(f" Query: {result['query']}\n")
            parts.append(f" Claim: {result['claim']}\n")
            parts.append(f" Score: {result['score']:.3f}\n")
            parts.append(f" Reason: {result['reason']}\n")
            parts.append("\n")
        return "".join(parts)

    def _render_refined(self):
        """Return the full text of the refined-prompts report."""
        parts = self._header("REFINED PROMPTS (REGENERATED RESPONSES)")
        if not self.refined_prompts:
            parts.append("No refined prompts recorded.\n")
            return "".join(parts)

        parts.append(f"Total Refinements: {len(self.refined_prompts)}\n")
        parts.append(self._LIGHT_RULE + "\n\n")
        for i, result in enumerate(self.refined_prompts, 1):
            parts.append(f"[{i}] REFINED PROMPT\n")
            parts.append(f" Query: {result['query']}\n")
            parts.append(f" Original Support Ratio: {result['original_ratio']:.1%}\n")
            parts.append(f" New Support Ratio: {result['new_ratio']:.1%}\n")
            parts.append("\n ORIGINAL RESPONSE:\n")
            parts.append(f" {self._preview(result['original_response'], 200)}\n")
            parts.append("\n REFINED RESPONSE:\n")
            parts.append(f" {result['refined_response']}\n")
            parts.append("\n REMOVED CLAIMS:\n")
            # Treat a missing list as "nothing removed" instead of crashing.
            for claim in result['removed_claims'] or []:
                parts.append(f" - {claim}\n")
            parts.append("\n")
        return "".join(parts)

    def _render_combined(self):
        """Return the full text of the combined report (summary + 3 sections)."""
        passed_count = len(self.passed_results)
        failed_count = len(self.failed_results)

        parts = self._header("VDHF COMBINED VERIFICATION REPORT")

        # Summary
        parts.append("SUMMARY\n")
        parts.append(self._LIGHT_RULE + "\n")
        parts.append(f" Passed Claims: {passed_count}\n")
        parts.append(f" Failed Claims: {failed_count}\n")
        parts.append(f" Refined Prompts: {len(self.refined_prompts)}\n")
        total = passed_count + failed_count
        if total > 0:
            # The pass rate is omitted entirely when no claims were logged.
            pass_rate = passed_count / total * 100
            parts.append(f" Overall Pass Rate: {pass_rate:.1f}%\n")
        parts.append("\n")

        # Passed section
        parts.extend(self._section("SECTION 1: PASSED CLAIMS"))
        if not self.passed_results:
            parts.append("No passed claims.\n\n")
        else:
            for i, result in enumerate(self.passed_results, 1):
                parts.append(f" [{i}] {self._preview(result['claim'], 60)}\n")
                parts.append(
                    f" Score: {result['score']:.3f} | "
                    f"Query: {self._preview(result['query'], 40)}\n\n"
                )

        # Failed section
        parts.extend(self._section("SECTION 2: FAILED CLAIMS (HALLUCINATIONS)"))
        if not self.failed_results:
            parts.append("No failed claims.\n\n")
        else:
            for i, result in enumerate(self.failed_results, 1):
                parts.append(f" [{i}] {self._preview(result['claim'], 60)}\n")
                parts.append(
                    f" Score: {result['score']:.3f} | Reason: {result['reason']}\n\n"
                )

        # Refined section
        parts.extend(self._section("SECTION 3: REFINED PROMPTS"))
        if not self.refined_prompts:
            parts.append("No refined prompts.\n\n")
        else:
            for i, result in enumerate(self.refined_prompts, 1):
                removed_count = len(result['removed_claims'] or [])
                parts.append(f" [{i}] Query: {result['query']}\n")
                parts.append(
                    f" Ratio: {result['original_ratio']:.1%} -> "
                    f"{result['new_ratio']:.1%}\n"
                )
                parts.append(f" Removed {removed_count} unsupported claims\n\n")

        return "".join(parts)