File size: 5,342 Bytes
a657e9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Data loading and processing module for RAG Analytics
"""
import os
from pathlib import Path
from typing import List, Optional, Tuple

import pandas as pd

from config import COLUMN_MAP, DATA_FOLDER, DEBUG, METRIC_COLUMNS, NUMERIC_CONFIG_COLUMNS, REQUIRED_COLUMNS


def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    1. Renames columns by stripping special chars (spaces, =, -).
    2. Forces metric columns to numeric (floats).
    3. Retains all data without schema validation dropping rows.

    Args:
        df: Raw dataframe loaded from CSV

    Returns:
        Normalized dataframe with standardized column names and types
    """
    renames = {}
    for original in df.columns:
        # Collapse to lowercase alphanumerics so that e.g.
        # "RMSE=trace relevance" becomes "rmsetracerelevance" before lookup.
        key = "".join(filter(str.isalnum, str(original).lower()))
        if key in COLUMN_MAP:
            renames[original] = COLUMN_MAP[key]

    df = df.rename(columns=renames)

    # Metric columns: coerce invalid values (e.g. "Empty") to NaN, then to
    # 0.0, and force float dtype so graphs never receive non-numeric data.
    for name in METRIC_COLUMNS:
        if name in df.columns:
            df[name] = (
                pd.to_numeric(df[name], errors='coerce')
                .fillna(0.0)
                .astype(float)
            )

    # Numeric configuration columns: force float dtype (so "256" renders as
    # a float in graphs) but keep N/A as NaN rather than filling it.
    for name in NUMERIC_CONFIG_COLUMNS:
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce').astype(float)

    return df


def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Validates that the dataframe has required columns.

    Args:
        df: Dataframe to validate

    Returns:
        Tuple of (is_valid, error_message)
    """
    absent = REQUIRED_COLUMNS - set(df.columns)
    if absent:
        return False, f"Missing required columns: {', '.join(absent)}"

    if df.empty:
        return False, "Dataframe is empty"

    return True, "Valid"


def load_csv_from_folder(folder_path: Optional[str] = None) -> Tuple[pd.DataFrame, str]:
    """
    Loads all CSV files from the specified folder and combines them.

    Files that fail to parse or fail schema validation are skipped and
    reported in the status message rather than aborting the whole load.

    Args:
        folder_path: Path to folder containing CSV files. If None, uses
            DATA_FOLDER from config. (Annotation fixed: the default is
            None, so the type is Optional[str], per PEP 484.)

    Returns:
        Tuple of (combined_dataframe, status_message). The dataframe is
        empty when the folder is missing, not a directory, contains no
        CSVs, or no file passed validation.
    """
    if folder_path is None:
        folder_path = DATA_FOLDER

    folder = Path(folder_path)

    if not folder.exists():
        return pd.DataFrame(), f"Error: Data folder '{folder_path}' does not exist."

    if not folder.is_dir():
        return pd.DataFrame(), f"Error: '{folder_path}' is not a directory."

    # Find all CSV files
    csv_files = list(folder.glob("*.csv"))

    if not csv_files:
        return pd.DataFrame(), f"Error: No CSV files found in '{folder_path}'."

    all_dfs = []
    loaded_files = []
    errors = []

    for csv_file in csv_files:
        try:
            # utf-8-sig strips a BOM if present (common in Excel exports).
            df_raw = pd.read_csv(csv_file, encoding='utf-8-sig')

            # Normalize column names and types
            df_clean = normalize_dataframe(df_raw)

            # Skip (and report) files missing required columns or empty.
            is_valid, error_msg = validate_dataframe(df_clean)
            if not is_valid:
                errors.append(f"{csv_file.name}: {error_msg}")
                continue

            all_dfs.append(df_clean)
            loaded_files.append(csv_file.name)

        except Exception as e:
            # Best-effort load: one bad file must not block the others.
            errors.append(f"{csv_file.name}: {str(e)}")

    if not all_dfs:
        error_summary = "\n".join(errors) if errors else "Unknown error"
        return pd.DataFrame(), f"Error: Failed to load any valid CSV files.\n{error_summary}"

    # Combine all dataframes
    final_df = pd.concat(all_dfs, ignore_index=True)

    # Build status message
    status_parts = [f"Successfully loaded {len(final_df)} test runs from {len(loaded_files)} file(s):"]
    status_parts.extend([f"  • {fname}" for fname in loaded_files])

    if errors:
        status_parts.append(f"\n{len(errors)} file(s) skipped due to errors:")
        status_parts.extend([f"  • {err}" for err in errors])

    # Add debug info if enabled
    if DEBUG and not final_df.empty:
        sample = final_df.iloc[0]
        debug_info = f"\nDEBUG (Row 1): Relevance={sample.get('rmse_relevance', 'N/A')}, F1={sample.get('f1_score', 'N/A')}, AUCROC={sample.get('aucroc', 'N/A')}"
        status_parts.append(debug_info)

    return final_df, "\n".join(status_parts)


def get_available_datasets(df: pd.DataFrame) -> List[str]:
    """
    Extracts unique dataset names from the dataframe.

    Args:
        df: Dataframe containing dataset_name column

    Returns:
        Sorted list of unique dataset names. Empty list when the dataframe
        is empty or lacks the column.
    """
    if df.empty or 'dataset_name' not in df.columns:
        return []

    # dropna(): a NaN among string names would make sorted() raise
    # TypeError (str vs float comparison); missing names carry no signal.
    return sorted(df['dataset_name'].dropna().unique().tolist())