File size: 6,605 Bytes
f133a92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""Data loading and validation utilities for QualiVec."""

import os
import pandas as pd
from typing import List, Optional, Dict, Any, Union, Tuple


class DataLoader:
    """Handles data loading and validation for QualiVec.

    Every loader reads a CSV file with ``pandas.read_csv`` and checks that the
    required columns are present. Null values are only reported (when
    ``verbose`` is true); they are never removed from the returned DataFrame.
    """

    def __init__(self, verbose: bool = True):
        """Initialize the DataLoader.

        Args:
            verbose: Whether to print status messages.
        """
        self.verbose = verbose

    def _log(self, message: str) -> None:
        """Print ``message`` only when verbose output is enabled."""
        if self.verbose:
            print(message)

    def _read_csv(self, filepath: str, description: str) -> pd.DataFrame:
        """Check that ``filepath`` exists, announce the load, and read the CSV.

        Args:
            filepath: Path to the CSV file.
            description: Short noun phrase used in the "Loading ..." message.

        Returns:
            DataFrame with the raw file contents.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"File not found: {filepath}")

        self._log(f"Loading {description} from {filepath}...")
        return pd.read_csv(filepath)

    def load_corpus(self, filepath: str, sentence_column: str = "sentence") -> pd.DataFrame:
        """Load a corpus from a CSV file.

        Args:
            filepath: Path to the CSV file.
            sentence_column: Name of the column containing sentences.

        Returns:
            DataFrame containing the corpus.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the sentence column is missing.
        """
        df = self._read_csv(filepath, "corpus")

        # Validate schema
        if sentence_column not in df.columns:
            raise ValueError(f"Required column '{sentence_column}' not found in the CSV file.")

        # Basic validation: nulls are reported, not dropped.
        null_count = df[sentence_column].isna().sum()
        if null_count:
            self._log(f"Warning: {null_count} null values found in '{sentence_column}' column.")

        self._log(f"Loaded {len(df)} rows from {filepath}")
        return df

    def load_reference_vectors(self, filepath: str, class_column: str = "class",
                               node_column: str = "matching_node") -> pd.DataFrame:
        """Load reference vectors from a CSV file.

        Args:
            filepath: Path to the CSV file.
            class_column: Name of the column containing class labels.
            node_column: Name of the column containing matching nodes.

        Returns:
            DataFrame containing the reference vectors.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If required columns are missing.
        """
        df = self._read_csv(filepath, "reference vectors")

        # Validate schema
        required_columns = [class_column, node_column]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Required columns {missing_columns} not found in the CSV file.")

        # Basic validation: nulls are reported, not dropped.
        if df[class_column].isna().any() or df[node_column].isna().any():
            self._log("Warning: Null values found in reference vectors.")

        self._log(f"Loaded {len(df)} reference vectors from {filepath}")
        self._log(f"Unique classes: {df[class_column].nunique()}")
        return df

    def load_labeled_data(self, filepath: str, label_column: str = "label") -> pd.DataFrame:
        """Load manually labeled data from a CSV file.

        Args:
            filepath: Path to the CSV file.
            label_column: Name of the column containing labels.

        Returns:
            DataFrame containing the labeled data.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the label column is missing.
        """
        df = self._read_csv(filepath, "labeled data")

        # Validate schema
        if label_column not in df.columns:
            raise ValueError(f"Required column '{label_column}' not found in the CSV file.")

        # Basic validation: nulls are reported, not dropped.
        null_count = df[label_column].isna().sum()
        if null_count:
            self._log(f"Warning: {null_count} null values found in '{label_column}' column.")

        if self.verbose:
            # Guarded so value_counts() is only computed when it will be shown.
            print(f"Loaded {len(df)} labeled samples from {filepath}")
            print(f"Label distribution:\n{df[label_column].value_counts()}")

        return df

    def save_dataframe(self, df: pd.DataFrame, filepath: str) -> None:
        """Save a DataFrame to a CSV file, without the index column.

        Args:
            df: DataFrame to save.
            filepath: Path to save the CSV file.
        """
        df.to_csv(filepath, index=False)
        self._log(f"Saved {len(df)} rows to {filepath}")

    def validate_labels(self, labeled_df: pd.DataFrame, reference_df: pd.DataFrame,
                        label_column: str = "label", class_column: str = "class") -> bool:
        """Validate that labels in the labeled data are a subset of those in the reference data.

        Missing (null) values in either column are ignored: an unlabeled row
        is not an "unknown class", and NaN never compares equal to itself, so
        keeping it would make the validation fail spuriously.

        Args:
            labeled_df: DataFrame containing labeled data.
            reference_df: DataFrame containing reference vectors.
            label_column: Name of the column containing labels in labeled_df.
            class_column: Name of the column containing classes in reference_df.

        Returns:
            True if validation passes, False otherwise.

        Raises:
            KeyError: If ``label_column`` or ``class_column`` is not a column
                of the respective DataFrame.
        """
        labeled_classes = set(labeled_df[label_column].dropna().unique())
        reference_classes = set(reference_df[class_column].dropna().unique())

        unknown_classes = labeled_classes - reference_classes

        if unknown_classes:
            if self.verbose:
                print(f"Warning: Found {len(unknown_classes)} labels in labeled data that are not in reference vectors:")
                print(unknown_classes)
            return False

        self._log("Label validation passed: All labels in labeled data are in reference vectors.")
        return True