bootstrap / src /qualivec /sampling.py
akhil-vaidya's picture
Upload 26 files
f133a92 verified
"""Sampling utilities for QualiVec."""
import pandas as pd
import numpy as np
from typing import Optional, Union, Literal
class Sampler:
"""Handles sampling mechanisms for QualiVec."""
def __init__(self, verbose: bool = True):
"""Initialize the Sampler.
Args:
verbose: Whether to print status messages.
"""
self.verbose = verbose
def sample(self,
df: pd.DataFrame,
sampling_type: Literal["random", "stratified"] = "random",
sample_size: Union[int, float] = 0.1,
stratify_column: Optional[str] = None,
seed: Optional[int] = None,
label_column: str = "Label") -> pd.DataFrame:
"""Sample data from a DataFrame.
Args:
df: DataFrame to sample from.
sampling_type: Type of sampling ("random" or "stratified").
sample_size: Size of the sample. If float, interpreted as a fraction.
stratify_column: Column to stratify by (required for stratified sampling).
seed: Random seed for reproducibility.
label_column: Name of the label column to add to the output.
Returns:
DataFrame containing the sampled data.
Raises:
ValueError: If parameters are invalid.
"""
# Set random seed if provided
if seed is not None:
np.random.seed(seed)
# Calculate sample size if given as a fraction
if isinstance(sample_size, float):
if not 0 < sample_size <= 1:
raise ValueError("Sample size as fraction must be between 0 and 1.")
n_samples = int(len(df) * sample_size)
else:
if not 0 < sample_size <= len(df):
raise ValueError(f"Sample size must be between 1 and {len(df)}.")
n_samples = sample_size
if self.verbose:
print(f"Sampling {n_samples} rows ({n_samples/len(df):.1%} of data)...")
# Perform sampling
if sampling_type == "random":
sample = df.sample(n=n_samples, random_state=seed)
elif sampling_type == "stratified":
if stratify_column is None:
raise ValueError("stratify_column must be provided for stratified sampling.")
if stratify_column not in df.columns:
raise ValueError(f"Stratification column '{stratify_column}' not found in DataFrame.")
# Check for NaN values in stratification column
if df[stratify_column].isna().any():
raise ValueError(f"NaN values found in stratification column '{stratify_column}'.")
# Calculate the proportion for each stratum
strata = df[stratify_column].value_counts(normalize=True)
# Create empty sample DataFrame
sample = pd.DataFrame(columns=df.columns)
# Sample from each stratum
for stratum, proportion in strata.items():
stratum_df = df[df[stratify_column] == stratum]
stratum_samples = max(1, int(n_samples * proportion))
stratum_sample = stratum_df.sample(n=min(stratum_samples, len(stratum_df)),
random_state=seed)
sample = pd.concat([sample, stratum_sample])
if self.verbose:
print(f"Stratified sampling based on '{stratify_column}':")
for stratum, count in sample[stratify_column].value_counts().items():
print(f" - {stratum}: {count} samples ({count/n_samples:.1%})")
else:
raise ValueError(f"Unknown sampling type: {sampling_type}")
# Add empty label column for manual annotation
if label_column not in sample.columns:
sample[label_column] = None
if self.verbose:
print(f"Created sample with {len(sample)} rows.")
return sample