File size: 8,393 Bytes
1bbe15b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
VDHF Results Logger - Stores verification results in categorized output files.

Output files:
- output/passed_results.txt      - Claims that passed verification
- output/failed_results.txt      - Claims that failed (hallucinations)
- output/refined_prompts.txt     - Prompts that were refined/regenerated
- output/combined_report.txt     - Complete combined report
"""

import os
from datetime import datetime

# Output directory (relative to project root).
# The project root is taken to be the parent of the directory holding this module.
_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(_MODULE_DIR)
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "output")
# Created as an import-time side effect; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Output file paths: one report per category, all directly inside OUTPUT_DIR.
PASSED_FILE, FAILED_FILE, REFINED_FILE, COMBINED_FILE = (
    os.path.join(OUTPUT_DIR, report_name)
    for report_name in (
        "passed_results.txt",
        "failed_results.txt",
        "refined_prompts.txt",
        "combined_report.txt",
    )
)


class ResultsLogger:
    """Collect verification outcomes in memory and write them as text reports.

    Results are accumulated through the ``log_*`` methods and persisted by
    :meth:`save_all`, which overwrites one file per category plus a combined
    report. Every report carries the timestamp captured when the logger was
    constructed, not the time at which the file was written.

    Each report is rendered to a string first and written in one call, so a
    formatting error (for example a non-numeric score) is raised before the
    target file is opened and the previous report is left untouched.
    """

    # Horizontal rules shared by all reports.
    _RULE = "=" * 70
    _THIN_RULE = "-" * 70
    # Marker appended to previews that were actually shortened.
    _ELLIPSIS = "..."

    def __init__(self):
        # One dict per logged item, in insertion order.
        self.passed_results = []
        self.failed_results = []
        self.refined_prompts = []
        # Fixed at construction; reused as the "Generated:" line of every report.
        self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------

    def log_passed(self, query, claim, score, evidence):
        """Record a claim that passed verification.

        Args:
            query: The query the claim belongs to.
            claim: The claim text.
            score: Numeric verification score (rendered with ``.3f``).
            evidence: Supporting evidence; the per-category report shows at
                most its first 100 characters.
        """
        self.passed_results.append({
            'query': query,
            'claim': claim,
            'score': score,
            'evidence': evidence
        })

    def log_failed(self, query, claim, score, reason="Insufficient evidence"):
        """Record a claim that failed verification.

        Args:
            query: The query the claim belongs to.
            claim: The claim text.
            score: Numeric verification score (rendered with ``.3f``).
            reason: Why the claim was rejected.
        """
        self.failed_results.append({
            'query': query,
            'claim': claim,
            'score': score,
            'reason': reason
        })

    def log_refined(self, query, original_response, refined_response,
                    original_ratio, new_ratio, removed_claims):
        """Record a response that was refined/regenerated.

        Args:
            query: The query that was answered.
            original_response: Response text before refinement.
            refined_response: Response text after refinement.
            original_ratio: Support ratio before refinement (rendered as a
                percentage with ``.1%``).
            new_ratio: Support ratio after refinement (rendered the same way).
            removed_claims: Iterable of claims dropped by the refinement.
                ``None`` is treated as "no claims removed".
        """
        # Snapshot into a list: the value is iterated by one report and
        # len()-ed by another, so a one-shot iterator or None would break,
        # and later caller-side mutation must not alter what was logged.
        claims = [] if removed_claims is None else list(removed_claims)
        self.refined_prompts.append({
            'query': query,
            'original_response': original_response,
            'refined_response': refined_response,
            'original_ratio': original_ratio,
            'new_ratio': new_ratio,
            'removed_claims': claims
        })

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save_all(self):
        """Write all four report files, then print the output directory."""
        self._save_passed()
        self._save_failed()
        self._save_refined()
        self._save_combined()
        print(f"\n[*] Results saved to {OUTPUT_DIR}/")

    def _save_passed(self):
        """Save passed results to file."""
        self._write_report(PASSED_FILE, self._render_passed())

    def _save_failed(self):
        """Save failed results to file."""
        self._write_report(FAILED_FILE, self._render_failed())

    def _save_refined(self):
        """Save refined prompts to file."""
        self._write_report(REFINED_FILE, self._render_refined())

    def _save_combined(self):
        """Save combined report to file."""
        self._write_report(COMBINED_FILE, self._render_combined())

    @staticmethod
    def _write_report(path, text):
        """Overwrite *path* with *text* as UTF-8, creating its directory if needed."""
        # The directory may have been removed after it was first created;
        # recreate it instead of letting open() fail.
        os.makedirs(os.path.dirname(path) or os.curdir, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)

    # ------------------------------------------------------------------
    # Rendering helpers (pure string building, no I/O)
    # ------------------------------------------------------------------

    @classmethod
    def _truncate(cls, text, limit):
        """Return *text* limited to *limit* characters.

        The ellipsis is appended only when characters were actually dropped,
        so short values are shown verbatim. Non-string values are converted
        with ``str()`` first.
        """
        if not isinstance(text, str):
            text = str(text)
        if len(text) <= limit:
            return text
        return text[:limit] + cls._ELLIPSIS

    def _banner(self, title):
        """Return the report header: rule, title, generation time, rule, blank line."""
        return (
            f"{self._RULE}\n"
            f"{title}\n"
            f"Generated: {self.timestamp}\n"
            f"{self._RULE}\n\n"
        )

    def _section_header(self, title):
        """Return a section heading framed by two rules and followed by a blank line."""
        return f"{self._RULE}\n{title}\n{self._RULE}\n\n"

    def _render_passed(self):
        """Return the full text of the passed-claims report."""
        parts = [self._banner("PASSED VERIFICATION RESULTS")]
        if not self.passed_results:
            parts.append("No passed results recorded.\n")
            return "".join(parts)

        parts.append(f"Total Passed Claims: {len(self.passed_results)}\n")
        parts.append(self._THIN_RULE + "\n\n")
        for i, result in enumerate(self.passed_results, 1):
            parts.append(
                f"[{i}] PASSED CLAIM\n"
                f"    Query: {result['query']}\n"
                f"    Claim: {result['claim']}\n"
                f"    Score: {result['score']:.3f}\n"
                f"    Evidence: {self._truncate(result['evidence'], 100)}\n"
                "\n"
            )
        return "".join(parts)

    def _render_failed(self):
        """Return the full text of the failed-claims report."""
        parts = [self._banner("FAILED VERIFICATION RESULTS (HALLUCINATIONS)")]
        if not self.failed_results:
            parts.append("No failed results recorded.\n")
            return "".join(parts)

        parts.append(f"Total Failed Claims: {len(self.failed_results)}\n")
        parts.append(self._THIN_RULE + "\n\n")
        for i, result in enumerate(self.failed_results, 1):
            parts.append(
                f"[{i}] FAILED CLAIM (HALLUCINATION)\n"
                f"    Query: {result['query']}\n"
                f"    Claim: {result['claim']}\n"
                f"    Score: {result['score']:.3f}\n"
                f"    Reason: {result['reason']}\n"
                "\n"
            )
        return "".join(parts)

    def _render_refined(self):
        """Return the full text of the refined-prompts report."""
        parts = [self._banner("REFINED PROMPTS (REGENERATED RESPONSES)")]
        if not self.refined_prompts:
            parts.append("No refined prompts recorded.\n")
            return "".join(parts)

        parts.append(f"Total Refinements: {len(self.refined_prompts)}\n")
        parts.append(self._THIN_RULE + "\n\n")
        for i, result in enumerate(self.refined_prompts, 1):
            parts.append(
                f"[{i}] REFINED PROMPT\n"
                f"    Query: {result['query']}\n"
                f"    Original Support Ratio: {result['original_ratio']:.1%}\n"
                f"    New Support Ratio: {result['new_ratio']:.1%}\n"
                "\n    ORIGINAL RESPONSE:\n"
                f"    {self._truncate(result['original_response'], 200)}\n"
                "\n    REFINED RESPONSE:\n"
                f"    {result['refined_response']}\n"
                "\n    REMOVED CLAIMS:\n"
            )
            # 'or ()' tolerates entries appended directly with None.
            parts.extend(
                f"      - {claim}\n" for claim in (result['removed_claims'] or ())
            )
            parts.append("\n")
        return "".join(parts)

    def _render_combined(self):
        """Return the full text of the combined report (summary + three sections)."""
        passed_count = len(self.passed_results)
        failed_count = len(self.failed_results)

        parts = [
            self._banner("VDHF COMBINED VERIFICATION REPORT"),
            "SUMMARY\n",
            self._THIN_RULE + "\n",
            f"  Passed Claims:     {passed_count}\n",
            f"  Failed Claims:     {failed_count}\n",
            f"  Refined Prompts:   {len(self.refined_prompts)}\n",
        ]
        total = passed_count + failed_count
        if total > 0:
            # The rate line is omitted when nothing was verified (avoids 0/0).
            pass_rate = passed_count / total * 100
            parts.append(f"  Overall Pass Rate: {pass_rate:.1f}%\n")
        parts.append("\n")

        # Passed section
        parts.append(self._section_header("SECTION 1: PASSED CLAIMS"))
        if not self.passed_results:
            parts.append("No passed claims.\n\n")
        else:
            for i, result in enumerate(self.passed_results, 1):
                parts.append(
                    f"  [{i}] {self._truncate(result['claim'], 60)}\n"
                    f"      Score: {result['score']:.3f} | "
                    f"Query: {self._truncate(result['query'], 40)}\n\n"
                )

        # Failed section
        parts.append(self._section_header("SECTION 2: FAILED CLAIMS (HALLUCINATIONS)"))
        if not self.failed_results:
            parts.append("No failed claims.\n\n")
        else:
            for i, result in enumerate(self.failed_results, 1):
                parts.append(
                    f"  [{i}] {self._truncate(result['claim'], 60)}\n"
                    f"      Score: {result['score']:.3f} | "
                    f"Reason: {result['reason']}\n\n"
                )

        # Refined section
        parts.append(self._section_header("SECTION 3: REFINED PROMPTS"))
        if not self.refined_prompts:
            parts.append("No refined prompts.\n\n")
        else:
            for i, result in enumerate(self.refined_prompts, 1):
                removed_count = len(result['removed_claims'] or ())
                parts.append(
                    f"  [{i}] Query: {result['query']}\n"
                    f"      Ratio: {result['original_ratio']:.1%} -> "
                    f"{result['new_ratio']:.1%}\n"
                    f"      Removed {removed_count} unsupported claims\n\n"
                )

        return "".join(parts)