# Source: Hugging Face Space "SamsungResearch/TRUEBench" (status: Running; file size: 4,858 bytes)

import pandas as pd
from pathlib import Path
from typing import Optional

# Global cache variables
# Each dict maps a ``data_prefix`` string (as passed to the getters in this
# module) to the raw dataframe loaded for it. A missing key and a ``None``
# value are treated the same way by the getters: "not loaded yet".
# NOTE(review): the seeded key is "open", but the getters default to
# data_prefix="open/", so the seeded entry is never the one looked up on the
# default call path -- confirm whether the seed is still needed.
_category_df_cache: dict[str, Optional[pd.DataFrame]] = {"open": None}
_language_df_cache: dict[str, Optional[pd.DataFrame]] = {"open": None}

def _load_category_csv(data_prefix: str = "") -> pd.DataFrame:
    """Read ``data/<data_prefix>/stats.csv`` next to this module.

    The file is parsed as UTF-8, tab-separated text.

    Args:
        data_prefix: Sub-directory of ``data/`` that holds ``stats.csv``.
                     An empty string reads ``data/stats.csv`` directly.

    Returns:
        pd.DataFrame: A copy of the parsed table.
    """
    csv_path = Path(__file__).parent.joinpath("data", data_prefix, "stats.csv")
    frame = pd.read_csv(csv_path, delimiter="\t", encoding="utf-8")
    return frame.copy()

def _load_language_csv(data_prefix: str = "open/") -> pd.DataFrame:
    """Read ``data/<data_prefix>/stats_lang.csv`` next to this module.

    The file is parsed as UTF-8, tab-separated text.

    Args:
        data_prefix: Sub-directory of ``data/`` that holds ``stats_lang.csv``.

    Returns:
        pd.DataFrame: A copy of the parsed table.
    """
    csv_path = Path(__file__).parent.joinpath("data", data_prefix, "stats_lang.csv")
    frame = pd.read_csv(csv_path, delimiter="\t", encoding="utf-8")
    return frame.copy()

def get_category_dataframe(processed: bool = True, data_prefix: str = "open/") -> pd.DataFrame:
    """
    Get the category dataframe.

    The raw CSV is loaded once per ``data_prefix`` and kept in the module
    cache; every call works on a copy of the cached frame, so the returned
    dataframe can be modified without affecting later calls.

    Args:
        processed: If True, returns processed dataframe (for vis_utils.py compatibility):
                  missing expected columns are added with defaults, the columns
                  listed in ``constants.NUMERIC_COLS_CATEGORY`` are coerced to
                  numbers and rounded, and remaining NaN values become ''.
                  If False, returns raw dataframe sorted by Overall (for data_utils.py compatibility)
        data_prefix: Sub-directory of ``data/`` holding ``stats.csv``; also
                  used as the cache key.

    Returns:
        pd.DataFrame: The category dataframe

    Raises:
        KeyError: If ``processed`` is False and the data has no "Overall" column.
    """
    if _category_df_cache.get(data_prefix) is None:
        _category_df_cache[data_prefix] = _load_category_csv(data_prefix)

    df = _category_df_cache[data_prefix].copy()

    if not processed:
        # Apply data_utils.py processing
        return df.sort_values("Overall", ascending=False)

    # Apply vis_utils.py processing
    required_cols = ['Model Name', 'Link', "Group", "Overall", "Med. Len.", "Med. Resp. Len.", "Time to First Answer Token", "End-to-End Response Time", "Speed", "Parameter Size (B)", "Type", "Model Type", "Think", 'Content Generation', 'Editing', 'Data Analysis',
                    'Reasoning', 'Hallucination', 'Safety', 'Repetition',
                    'Summarization', 'Translation', 'Multi-Turn']

    # Columns whose default is not 0 when missing from the CSV. "Think" has to
    # be handled here: it is part of required_cols, so a separate
    # "Think not in df.columns" check placed after this loop could never fire
    # and the column would silently default to 0 instead of "Off".
    special_defaults = {"Link": "", "Group": "", "Think": "Off"}

    for col in required_cols:
        if col not in df.columns:
            df[col] = special_defaults.get(col, 0)

    # Imported lazily, only on the processed path.
    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY
    for col in NUMERIC_COLS_CATEGORY:
        if col in df.columns:
            # Unparseable cells become NaN here and '' after fillna below.
            digits = 0 if col in NUMERIC_INT_COLS_CATEGORY else 3
            df[col] = pd.to_numeric(df[col], errors='coerce').round(digits)
        else:
            df[col] = 0

    return df.fillna('')

def get_language_dataframe(processed: bool = True, data_prefix: str = "open/") -> pd.DataFrame:
    """
    Return the per-language results table.

    The raw CSV is read once per ``data_prefix`` and cached at module level;
    each call operates on a copy of the cached frame.

    Args:
        processed: When True (vis_utils.py usage), missing expected columns
                  are added, columns named in ``constants.NUMERIC_COLS_LANGUAGE``
                  are coerced to numbers and rounded, and NaN becomes ''.
                  When False (data_utils.py usage), the raw frame is returned
                  sorted by "Overall" in descending order.
        data_prefix: Sub-directory of ``data/`` holding ``stats_lang.csv``;
                  also the cache key.

    Returns:
        pd.DataFrame: The language dataframe
    """
    cached = _language_df_cache.get(data_prefix)
    if cached is None:
        cached = _load_language_csv(data_prefix)
        _language_df_cache[data_prefix] = cached

    frame = cached.copy()

    if not processed:
        # data_utils.py path: raw data, best "Overall" first.
        return frame.sort_values("Overall", ascending=False)

    # vis_utils.py path: guarantee every expected column exists.
    expected_columns = [
        'Model Name', 'Link', "Group", "Overall", "Med. Len.", "Med. Resp. Len.",
        "Time to First Answer Token", "End-to-End Response Time", "Speed",
        "Parameter Size (B)", "Type", "Model Type", "Think",
        'KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR', 'IT', 'RU', 'VI',
    ]
    blank_when_missing = ("Link", "Group")
    for name in expected_columns:
        if name in frame.columns:
            continue
        frame[name] = "" if name in blank_when_missing else 0

    from constants import NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE
    for name in NUMERIC_COLS_LANGUAGE:
        if name not in frame.columns:
            frame[name] = 0
            continue
        as_number = pd.to_numeric(frame[name], errors='coerce')
        if name in NUMERIC_INT_COLS_LANGUAGE:
            frame[name] = as_number.round(0)
        else:
            frame[name] = as_number.round(3)

    frame = frame.fillna('')
    return frame

def clear_cache():
    """Reset both module-level caches so the next getter call reloads from disk.

    Both cache names are rebound to fresh, independent dicts seeded with
    ``{"open": None}``.
    """
    global _category_df_cache, _language_df_cache
    _category_df_cache, _language_df_cache = {"open": None}, {"open": None}