File size: 5,342 Bytes
a657e9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Data loading and processing module for RAG Analytics
"""
import os
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

from config import COLUMN_MAP, DATA_FOLDER, DEBUG, METRIC_COLUMNS, NUMERIC_CONFIG_COLUMNS, REQUIRED_COLUMNS


def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    1. Renames columns by stripping special chars (spaces, =, -).
    2. Forces metric columns to numeric (floats).
    3. Retains all data without schema validation dropping rows.

    Args:
        df: Raw dataframe loaded from CSV

    Returns:
        Normalized dataframe with standardized column names and types
    """
    renames = {}
    for original in df.columns:
        # Collapse to lowercase alphanumerics so that e.g.
        # "RMSE=trace relevance" becomes "rmsetracerelevance" before lookup.
        key = "".join(filter(str.isalnum, str(original).lower()))
        if key in COLUMN_MAP:
            renames[original] = COLUMN_MAP[key]

    df = df.rename(columns=renames)

    # Metric columns: coerce invalid values (e.g. "Empty") to NaN, then to
    # 0.0, and force float dtype so graphs never receive non-numeric data.
    for name in METRIC_COLUMNS:
        if name in df.columns:
            df[name] = (
                pd.to_numeric(df[name], errors='coerce')
                .fillna(0.0)
                .astype(float)
            )

    # Numeric configuration columns: force float dtype (so "256" renders as
    # a float in graphs) but keep N/A as NaN rather than filling it.
    for name in NUMERIC_CONFIG_COLUMNS:
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce').astype(float)

    return df


def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Validates that the dataframe has required columns.

    Args:
        df: Dataframe to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    absent = REQUIRED_COLUMNS - set(df.columns)
    if absent:
        return False, f"Missing required columns: {', '.join(absent)}"

    if df.empty:
        return False, "Dataframe is empty"

    return True, "Valid"


def load_csv_from_folder(folder_path: Optional[str] = None) -> Tuple[pd.DataFrame, str]:
    """
    Loads all CSV files from the specified folder and combines them.

    Files that fail to parse or fail schema validation are skipped and
    reported in the status message rather than aborting the whole load.

    Args:
        folder_path: Path to folder containing CSV files. If None, uses
            DATA_FOLDER from config. (Annotation fixed: the default is
            None, so the type is Optional[str], per PEP 484.)

    Returns:
        Tuple of (combined_dataframe, status_message). The dataframe is
        empty when the folder is missing, not a directory, contains no
        CSVs, or no file passed validation.
    """
    if folder_path is None:
        folder_path = DATA_FOLDER

    folder = Path(folder_path)

    if not folder.exists():
        return pd.DataFrame(), f"Error: Data folder '{folder_path}' does not exist."

    if not folder.is_dir():
        return pd.DataFrame(), f"Error: '{folder_path}' is not a directory."

    # Find all CSV files
    csv_files = list(folder.glob("*.csv"))

    if not csv_files:
        return pd.DataFrame(), f"Error: No CSV files found in '{folder_path}'."

    all_dfs = []
    loaded_files = []
    errors = []

    for csv_file in csv_files:
        try:
            # utf-8-sig strips a BOM if present (common in Excel exports).
            df_raw = pd.read_csv(csv_file, encoding='utf-8-sig')

            # Normalize column names and types
            df_clean = normalize_dataframe(df_raw)

            # Skip (and report) files missing required columns or empty.
            is_valid, error_msg = validate_dataframe(df_clean)
            if not is_valid:
                errors.append(f"{csv_file.name}: {error_msg}")
                continue

            all_dfs.append(df_clean)
            loaded_files.append(csv_file.name)

        except Exception as e:
            # Best-effort load: one bad file must not block the others.
            errors.append(f"{csv_file.name}: {str(e)}")

    if not all_dfs:
        error_summary = "\n".join(errors) if errors else "Unknown error"
        return pd.DataFrame(), f"Error: Failed to load any valid CSV files.\n{error_summary}"

    # Combine all dataframes
    final_df = pd.concat(all_dfs, ignore_index=True)

    # Build status message
    status_parts = [f"Successfully loaded {len(final_df)} test runs from {len(loaded_files)} file(s):"]
    status_parts.extend([f"  • {fname}" for fname in loaded_files])

    if errors:
        status_parts.append(f"\n{len(errors)} file(s) skipped due to errors:")
        status_parts.extend([f"  • {err}" for err in errors])

    # Add debug info if enabled
    if DEBUG and not final_df.empty:
        sample = final_df.iloc[0]
        debug_info = f"\nDEBUG (Row 1): Relevance={sample.get('rmse_relevance', 'N/A')}, F1={sample.get('f1_score', 'N/A')}, AUCROC={sample.get('aucroc', 'N/A')}"
        status_parts.append(debug_info)

    return final_df, "\n".join(status_parts)


def get_available_datasets(df: pd.DataFrame) -> List[str]:
    """
    Extracts unique dataset names from the dataframe.

    Args:
        df: Dataframe containing dataset_name column

    Returns:
        Sorted list of unique dataset names. Empty list when the dataframe
        is empty or lacks the column.
    """
    if df.empty or 'dataset_name' not in df.columns:
        return []

    # dropna(): a NaN among string names would make sorted() raise
    # TypeError (str vs float comparison); missing names carry no signal.
    return sorted(df['dataset_name'].dropna().unique().tolist())