"""Data loading and validation utilities for QualiVec.""" import os import pandas as pd from typing import List, Optional, Dict, Any, Union, Tuple class DataLoader: """Handles data loading and validation for QualiVec.""" def __init__(self, verbose: bool = True): """Initialize the DataLoader. Args: verbose: Whether to print status messages. """ self.verbose = verbose def load_corpus(self, filepath: str, sentence_column: str = "sentence") -> pd.DataFrame: """Load a corpus from a CSV file. Args: filepath: Path to the CSV file. sentence_column: Name of the column containing sentences. Returns: DataFrame containing the corpus. Raises: FileNotFoundError: If the file does not exist. ValueError: If the sentence column is missing. """ if not os.path.exists(filepath): raise FileNotFoundError(f"File not found: {filepath}") # Load the data if self.verbose: print(f"Loading corpus from {filepath}...") df = pd.read_csv(filepath) # Validate schema if sentence_column not in df.columns: raise ValueError(f"Required column '{sentence_column}' not found in the CSV file.") # Basic validation if df[sentence_column].isna().any(): if self.verbose: print(f"Warning: {df[sentence_column].isna().sum()} null values found in '{sentence_column}' column.") if self.verbose: print(f"Loaded {len(df)} rows from {filepath}") return df def load_reference_vectors(self, filepath: str, class_column: str = "class", node_column: str = "matching_node") -> pd.DataFrame: """Load reference vectors from a CSV file. Args: filepath: Path to the CSV file. class_column: Name of the column containing class labels. node_column: Name of the column containing matching nodes. Returns: DataFrame containing the reference vectors. Raises: FileNotFoundError: If the file does not exist. ValueError: If required columns are missing. """ if not os.path.exists(filepath): raise FileNotFoundError(f"File not found: {filepath}") if self.verbose: print(f"Loading reference vectors from {filepath}...") df = pd.read_csv(filepath) # Validate schema required_columns = [class_column, node_column] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: raise ValueError(f"Required columns {missing_columns} not found in the CSV file.") # Basic validation if df[class_column].isna().any() or df[node_column].isna().any(): if self.verbose: print(f"Warning: Null values found in reference vectors.") if self.verbose: print(f"Loaded {len(df)} reference vectors from {filepath}") print(f"Unique classes: {df[class_column].nunique()}") return df def load_labeled_data(self, filepath: str, label_column: str = "label") -> pd.DataFrame: """Load manually labeled data from a CSV file. Args: filepath: Path to the CSV file. label_column: Name of the column containing labels. Returns: DataFrame containing the labeled data. Raises: FileNotFoundError: If the file does not exist. ValueError: If the label column is missing. """ if not os.path.exists(filepath): raise FileNotFoundError(f"File not found: {filepath}") if self.verbose: print(f"Loading labeled data from {filepath}...") df = pd.read_csv(filepath) # Validate schema if label_column not in df.columns: raise ValueError(f"Required column '{label_column}' not found in the CSV file.") # Basic validation if df[label_column].isna().any(): if self.verbose: print(f"Warning: {df[label_column].isna().sum()} null values found in '{label_column}' column.") if self.verbose: print(f"Loaded {len(df)} labeled samples from {filepath}") print(f"Label distribution:\n{df[label_column].value_counts()}") return df def save_dataframe(self, df: pd.DataFrame, filepath: str) -> None: """Save a DataFrame to a CSV file. Args: df: DataFrame to save. filepath: Path to save the CSV file. """ df.to_csv(filepath, index=False) if self.verbose: print(f"Saved {len(df)} rows to {filepath}") def validate_labels(self, labeled_df: pd.DataFrame, reference_df: pd.DataFrame, label_column: str = "label", class_column: str = "class") -> bool: """Validate that labels in the labeled data are a subset of those in the reference data. Args: labeled_df: DataFrame containing labeled data. reference_df: DataFrame containing reference vectors. label_column: Name of the column containing labels in labeled_df. class_column: Name of the column containing classes in reference_df. Returns: True if validation passes, False otherwise. """ labeled_classes = set(labeled_df[label_column].unique()) reference_classes = set(reference_df[class_column].unique()) unknown_classes = labeled_classes - reference_classes if unknown_classes: if self.verbose: print(f"Warning: Found {len(unknown_classes)} labels in labeled data that are not in reference vectors:") print(unknown_classes) return False if self.verbose: print("Label validation passed: All labels in labeled data are in reference vectors.") return True