Spaces:
Sleeping
Sleeping
File size: 6,605 Bytes
f133a92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
"""Data loading and validation utilities for QualiVec."""
import os
import pandas as pd
from typing import List, Optional, Dict, Any, Union, Tuple
class DataLoader:
    """Handles data loading and validation for QualiVec.

    All loaders read CSV files with ``pandas.read_csv`` and check that the
    columns they need are present. Status and warning messages are printed
    to stdout only when ``verbose`` is true.
    """

    def __init__(self, verbose: bool = True):
        """Initialize the DataLoader.

        Args:
            verbose: Whether to print status messages.
        """
        self.verbose = verbose

    def _read_csv(self, filepath: str, description: str) -> pd.DataFrame:
        """Read ``filepath`` as CSV after checking that the path exists.

        Args:
            filepath: Path to the CSV file.
            description: Short noun phrase used in the status message
                (e.g. ``"corpus"``).

        Returns:
            DataFrame with the file's contents.

        Raises:
            FileNotFoundError: If the path does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"File not found: {filepath}")
        if self.verbose:
            print(f"Loading {description} from {filepath}...")
        return pd.read_csv(filepath)

    def _warn_nulls(self, df: pd.DataFrame, column: str) -> None:
        """Print a warning with the number of null values in ``column``.

        Nothing is printed when the column has no nulls or when the loader
        is not verbose.
        """
        if not self.verbose:
            return
        null_count = int(df[column].isna().sum())
        if null_count:
            print(f"Warning: {null_count} null values found in '{column}' column.")

    def load_corpus(self, filepath: str, sentence_column: str = "sentence") -> pd.DataFrame:
        """Load a corpus from a CSV file.

        Args:
            filepath: Path to the CSV file.
            sentence_column: Name of the column containing sentences.

        Returns:
            DataFrame containing the corpus.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the sentence column is missing.
        """
        df = self._read_csv(filepath, "corpus")

        # Validate schema
        if sentence_column not in df.columns:
            raise ValueError(f"Required column '{sentence_column}' not found in the CSV file.")

        # Null sentences are reported but deliberately kept in the result.
        self._warn_nulls(df, sentence_column)

        if self.verbose:
            print(f"Loaded {len(df)} rows from {filepath}")
        return df

    def load_reference_vectors(self, filepath: str, class_column: str = "class",
                               node_column: str = "matching_node") -> pd.DataFrame:
        """Load reference vectors from a CSV file.

        Args:
            filepath: Path to the CSV file.
            class_column: Name of the column containing class labels.
            node_column: Name of the column containing matching nodes.

        Returns:
            DataFrame containing the reference vectors.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If required columns are missing.
        """
        df = self._read_csv(filepath, "reference vectors")

        # Validate schema: report every missing column at once.
        required_columns = [class_column, node_column]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Required columns {missing_columns} not found in the CSV file.")

        if self.verbose:
            # Nulls in either required column are reported but kept.
            if df[required_columns].isna().any().any():
                print("Warning: Null values found in reference vectors.")
            print(f"Loaded {len(df)} reference vectors from {filepath}")
            print(f"Unique classes: {df[class_column].nunique()}")
        return df

    def load_labeled_data(self, filepath: str, label_column: str = "label") -> pd.DataFrame:
        """Load manually labeled data from a CSV file.

        Args:
            filepath: Path to the CSV file.
            label_column: Name of the column containing labels.

        Returns:
            DataFrame containing the labeled data.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the label column is missing.
        """
        df = self._read_csv(filepath, "labeled data")

        # Validate schema
        if label_column not in df.columns:
            raise ValueError(f"Required column '{label_column}' not found in the CSV file.")

        # Null labels are reported but deliberately kept in the result.
        self._warn_nulls(df, label_column)

        if self.verbose:
            print(f"Loaded {len(df)} labeled samples from {filepath}")
            print(f"Label distribution:\n{df[label_column].value_counts()}")
        return df

    def save_dataframe(self, df: pd.DataFrame, filepath: str) -> None:
        """Save a DataFrame to a CSV file without the index column.

        Args:
            df: DataFrame to save.
            filepath: Path to save the CSV file.
        """
        df.to_csv(filepath, index=False)
        if self.verbose:
            print(f"Saved {len(df)} rows to {filepath}")

    def validate_labels(self, labeled_df: pd.DataFrame, reference_df: pd.DataFrame,
                        label_column: str = "label", class_column: str = "class") -> bool:
        """Validate that labels in the labeled data are a subset of those in the reference data.

        Missing values (NaN/None) are ignored on both sides: an unlabeled row
        is not a class, so it must not be reported as an unknown label.

        Args:
            labeled_df: DataFrame containing labeled data.
            reference_df: DataFrame containing reference vectors.
            label_column: Name of the column containing labels in labeled_df.
            class_column: Name of the column containing classes in reference_df.

        Returns:
            True if validation passes, False otherwise.
        """
        # dropna() first: NaN never compares equal to itself, so leaving it
        # in would make the set difference unreliable.
        labeled_classes = set(labeled_df[label_column].dropna().unique())
        reference_classes = set(reference_df[class_column].dropna().unique())

        unknown_classes = labeled_classes - reference_classes
        if unknown_classes:
            if self.verbose:
                print(f"Warning: Found {len(unknown_classes)} labels in labeled data that are not in reference vectors:")
                print(unknown_classes)
            return False

        if self.verbose:
            print("Label validation passed: All labels in labeled data are in reference vectors.")
        return True
|