File size: 12,012 Bytes
669d6a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
"""
Data Access Tracker for preventing test set contamination.
Logs every dataset access with temporal boundaries to detect data snooping.
"""

import inspect
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
from loguru import logger


class DataAccessTracker:
    """
    Track every data access to detect test set contamination.

    Critical for detecting data snooping bias in ML development where
    developers unknowingly test on the same data repeatedly during
    iterative optimization.
    """

    def __init__(self, log_file: Optional[Path] = None):
        """
        Initialize data access tracker.

        Args:
            log_file: Path to access log CSV (None = use default in cache dir)
        """
        # Import at runtime to avoid circular imports
        from . import CACHE_DIRS

        self.log_file = log_file or CACHE_DIRS["base"] / "data_access_log.csv"
        self.log_file.parent.mkdir(parents=True, exist_ok=True)

        self.access_log: List[Dict] = []
        self._load_existing_log()

    def _load_existing_log(self):
        """Load existing access log if it exists."""
        if self.log_file.exists():
            try:
                df = pd.read_csv(self.log_file)
                self.access_log = df.to_dict("records")
                logger.debug(f"Loaded {len(self.access_log)} existing access records")
            except Exception as e:
                logger.warning(f"Failed to load existing access log: {e}")
                self.access_log = []

    def log_access(
        self,
        dataset_name: str,
        start_date: pd.Timestamp,
        end_date: pd.Timestamp,
        purpose: str,
        data_shape: Optional[Tuple[int, int]] = None,
    ):
        """
        Log a dataset access with metadata.

        Args:
            dataset_name: Unique identifier for the dataset
            start_date: Start of temporal range
            end_date: End of temporal range
            purpose: One of 'train', 'test', 'validate', 'optimize', 'analyze'
            data_shape: Optional shape tuple (rows, cols)
        """
        entry = {
            "timestamp": datetime.now().isoformat(),
            "dataset": dataset_name,
            "start_date": str(start_date),
            "end_date": str(end_date),
            "purpose": purpose,
            "data_shape": str(data_shape) if data_shape else None,
            "caller": self._get_caller_info(),
        }

        self.access_log.append(entry)

        # AUTO-SAVE: Persist immediately for reliability
        try:
            self._save_log_immediate()  # New helper method
            logger.debug(
                f"Logged & saved access: {dataset_name} "
                f"[{start_date} to {end_date}] for {purpose}"
            )
        except Exception as e:
            logger.error(f"Failed to save access log: {e}")

    def _save_log_immediate(self):
        """Save log immediately (append mode for efficiency)."""
        try:
            # Use append mode to avoid rewriting entire file
            df_entry = pd.DataFrame([self.access_log[-1]])  # Just the last entry

            if self.log_file.exists():
                # Append to existing file
                df_entry.to_csv(self.log_file, mode="a", header=False, index=False)
            else:
                # Create new file with header
                df_entry.to_csv(self.log_file, mode="w", header=True, index=False)

        except Exception as e:
            logger.error(f"Failed to save access log: {e}")
            # Fall back to full save
            try:
                self.save_log()  # Existing method as fallback
            except Exception as e2:
                logger.error(f"Fallback save also failed: {e2}")

    def save_log(self):
        """Persist access log to disk."""
        try:
            df = pd.DataFrame(self.access_log)
            df.to_csv(self.log_file, index=False)
            logger.info(
                f"Saved {len(self.access_log)} access records to {self.log_file}"
            )
        except Exception as e:
            logger.error(f"Failed to save access log: {e}")

    def analyze_contamination(
        self, dataset_name: str, exclude_purposes: Optional[List[str]] = None
    ) -> Tuple[int, str, List[Dict]]:
        """
        Analyze how many times a dataset was accessed.

        Args:
            dataset_name: Dataset to analyze
            exclude_purposes: Purposes to exclude (e.g., ['validate'] for final check)

        Returns:
            Tuple of (access_count, warning_level, access_details)

        Warning Levels:
            - CLEAN: 0 accesses
            - ACCEPTABLE: 1-2 accesses (normal development)
            - WARNING: 3-10 accesses (risky, be careful)
            - CONTAMINATED: >10 accesses (severely compromised)
        """
        exclude_purposes = exclude_purposes or []

        matching_accesses = [
            entry
            for entry in self.access_log
            if entry["dataset"] == dataset_name
            and entry["purpose"] not in exclude_purposes
        ]

        access_count = len(matching_accesses)

        # Determine warning level
        if access_count == 0:
            warning_level = "CLEAN"
        elif access_count <= 2:
            warning_level = "ACCEPTABLE"
        elif access_count <= 10:
            warning_level = "WARNING"
        else:
            warning_level = "CONTAMINATED"

        return access_count, warning_level, matching_accesses

    def get_contamination_report(self) -> pd.DataFrame:
        """
        Generate comprehensive contamination report for all datasets.

        Returns:
            DataFrame with contamination analysis for each dataset
        """
        if not self.access_log:
            return pd.DataFrame()

        df = pd.DataFrame(self.access_log)

        report_data = []
        for dataset_name in df["dataset"].unique():
            access_count, warning_level, accesses = self.analyze_contamination(
                dataset_name
            )

            # Extract purposes from the already-filtered accesses
            purposes = (
                pd.Series([a["purpose"] for a in accesses]).value_counts().to_dict()
            )

            # Extract timestamps from the already-filtered accesses
            timestamps = [a["timestamp"] for a in accesses]

            report_data.append(
                {
                    "dataset": dataset_name,
                    "total_accesses": access_count,
                    "warning_level": warning_level,
                    "train_accesses": purposes.get("train", 0),
                    "test_accesses": purposes.get("test", 0),
                    "validate_accesses": purposes.get("validate", 0),
                    "optimize_accesses": purposes.get("optimize", 0),
                    "first_access": min(timestamps) if timestamps else None,
                    "last_access": max(timestamps) if timestamps else None,
                }
            )

        return pd.DataFrame(report_data)

    def print_contamination_report(self):
        """Print formatted contamination report to console."""
        report = self.get_contamination_report()

        if report.empty:
            print("No data accesses logged yet.")
            return

        print("\n" + "=" * 80)
        print("DATA CONTAMINATION REPORT")
        print("=" * 80)

        for _, row in report.iterrows():
            print(f"\nDataset: {row['dataset']}")
            print(f"  Warning Level: {row['warning_level']}")
            print(f"  Total Accesses: {row['total_accesses']}")
            print(f"  Breakdown:")
            print(f"    - Train: {row['train_accesses']}")
            print(
                f"    - Test: {row['test_accesses']} {'⚠️' if row['test_accesses'] > 2 else ''}"
            )
            print(
                f"    - Validate: {row['validate_accesses']} {'⚠️' if row['validate_accesses'] > 2 else ''}"
            )
            print(f"    - Optimize: {row['optimize_accesses']}")
            print(f"  First Access: {row['first_access']}")
            print(f"  Last Access: {row['last_access']}")

            if row["warning_level"] in ["WARNING", "CONTAMINATED"]:
                print(f"\n  ⚠️  WARNING: This dataset may be contaminated!")
                print(f"  ⚠️  Test/validation results on this data are unreliable.")

        print("\n" + "=" * 80)

        # Summary recommendations
        contaminated = report[report["warning_level"].isin(["WARNING", "CONTAMINATED"])]
        if len(contaminated) > 0:
            print("\nRECOMMENDATIONS:")
            print("  1. Use truly held-out validation set for honest assessment")
            print("  2. Document all contaminated datasets in your results")
            print("  3. Consider collecting fresh validation data")
            print("  4. Apply multiple testing corrections (e.g., Bonferroni)")
        else:
            print("\n✓ No contamination detected - data hygiene looks good!")

        print("=" * 80 + "\n")

    def export_detailed_log(self, output_file: Path):
        """
        Export detailed access log with analysis.

        Args:
            output_file: Path for output JSON file
        """
        report_data = {
            "summary": self.get_contamination_report().to_dict("records"),
            "detailed_log": self.access_log,
            "generated_at": datetime.now().isoformat(),
            "total_accesses": len(self.access_log),
        }

        with open(output_file, "w") as f:
            json.dump(report_data, f, indent=2)

        logger.info(f"Exported detailed access log to {output_file}")

    def _get_caller_info(self) -> str:
        """Get caller function/file for audit trail."""
        try:
            # Walk up the stack to find the calling function (skip tracker internals)
            frame = inspect.currentframe()
            for _ in range(3):  # Skip this method and decorator layers
                if frame is not None:
                    frame = frame.f_back

            if frame is not None:
                filename = Path(frame.f_code.co_filename).name
                lineno = frame.f_lineno
                funcname = frame.f_code.co_name
                return f"{filename}:{funcname}:{lineno}"
        except Exception:
            pass

        return "unknown"

    def clear_log(self):
        """Clear all access records (use with caution!)."""
        self.access_log = []
        if self.log_file.exists():
            self.log_file.unlink()
        logger.warning("Cleared all data access logs")


# =============================================================================
# Global instance and convenience functions
# =============================================================================

_global_tracker: Optional[DataAccessTracker] = None


def get_data_tracker() -> DataAccessTracker:
    """Get global data access tracker instance."""
    global _global_tracker
    if _global_tracker is None:
        _global_tracker = DataAccessTracker()
    return _global_tracker


def log_data_access(
    dataset_name: str,
    start_date: pd.Timestamp,
    end_date: pd.Timestamp,
    purpose: str,
    data_shape: Optional[Tuple[int, int]] = None,
):
    """Convenience function to log data access."""
    tracker = get_data_tracker()
    tracker.log_access(dataset_name, start_date, end_date, purpose, data_shape)


def print_contamination_report():
    """Convenience function to print contamination report."""
    tracker = get_data_tracker()
    tracker.print_contamination_report()


def clear_data_access_log():
    """Convenience function to clear access log."""
    tracker = get_data_tracker()
    tracker.clear_log()


__all__ = [
    "DataAccessTracker",
    "get_data_tracker",
    "log_data_access",
    "print_contamination_report",
    "clear_data_access_log",
]