File size: 5,352 Bytes
5a3fcad
 
 
 
 
 
 
 
 
 
 
 
 
3ac4e2d
5a3fcad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
"""
File Operations Utilities

This module provides helper functions for file handling operations,
particularly for CSV file processing in the NexDatawork agents.

Functions:
    load_csv_files: Load multiple CSV files into DataFrames
    validate_csv: Validate CSV file structure and content
    get_file_info: Get metadata about uploaded files
"""

import os
import warnings
from typing import List, Dict, Any, Tuple

import pandas as pd


def load_csv_files(files: List[Any]) -> Tuple[pd.DataFrame, List[str]]:
    """
    Load multiple CSV files and concatenate them into a single DataFrame.
    
    Every entry is read with ``pd.read_csv`` and the resulting frames are
    stacked vertically (row-wise) with a fresh 0..n-1 index. Entries that
    cannot be read are skipped; if at least one file loads, the failures
    are reported through a ``RuntimeWarning`` instead of being dropped
    silently.
    
    Args:
        files: List of file objects with a .name attribute pointing to CSV
               paths. Entries without a .name attribute are converted with
               str() and used as the path.
               
    Returns:
        Tuple containing:
            - pd.DataFrame: The concatenated DataFrame
            - List[str]: Base names of the files that loaded successfully,
              in input order
            
    Raises:
        ValueError: If no files are provided or all files fail to load.
        
    Example:
        >>> df, names = load_csv_files(uploaded_files)
        >>> print(f"Loaded {len(names)} files with {len(df)} total rows")
    """
    if not files:
        raise ValueError("No files provided")
    
    dataframes = []
    loaded_names = []
    errors = []
    
    for f in files:
        # Label used in error messages. It starts as a placeholder that can
        # never raise, so the except branch below is safe even when
        # resolving the path itself is what failed.
        label = f"<{type(f).__name__}>"
        try:
            # Get file path from file object
            file_path = f.name if hasattr(f, 'name') else str(f)
            label = file_path
            
            # Load CSV
            df = pd.read_csv(file_path)
            base_name = os.path.basename(file_path)
        except Exception as e:
            # Best effort: remember the failure and keep going
            errors.append(f"{label}: {e}")
            continue
        
        dataframes.append(df)
        loaded_names.append(base_name)
    
    if not dataframes:
        raise ValueError(f"Failed to load any files. Errors: {errors}")
    
    if errors:
        # Partial success: surface the skipped files without failing the call
        warnings.warn(
            f"Skipped {len(errors)} file(s) that failed to load: {errors}",
            RuntimeWarning,
            stacklevel=2,
        )
    
    # Concatenate all DataFrames
    combined = pd.concat(dataframes, ignore_index=True)
    
    return combined, loaded_names


def validate_csv(file_path: str) -> Dict[str, Any]:
    """
    Validate a CSV file and return information about its structure.
    
    The file is parsed with ``pd.read_csv``. On success the result holds
    the frame's shape, column names, dtypes and per-column missing counts,
    plus a list of detected data-quality issues. This function does not
    raise: any exception is recorded in ``issues`` instead.
    
    Args:
        file_path: Path to the CSV file to validate.
        
    Returns:
        Dict with keys:
            - valid (bool): Whether the file could be parsed as CSV
            - rows (int): Number of rows
            - columns (int): Number of columns
            - column_names (List[str]): Column names
            - dtypes (Dict): Column data types, as strings
            - missing_values (Dict): Count of missing values per column
            - issues (List[str]): Any detected issues
            
    Example:
        >>> info = validate_csv("sales.csv")
        >>> if info["valid"]:
        ...     print(f"Valid CSV with {info['rows']} rows")
    """
    result: Dict[str, Any] = {
        "valid": False,
        "rows": 0,
        "columns": 0,
        "column_names": [],
        "dtypes": {},
        "missing_values": {},
        "issues": []
    }
    
    try:
        # Attempt to load the CSV
        df = pd.read_csv(file_path)
        row_count = len(df)
        
        # Compute the null mask once and reuse it for counts and emptiness
        null_mask = df.isnull()
        
        result["valid"] = True
        result["rows"] = row_count
        result["columns"] = len(df.columns)
        result["column_names"] = list(df.columns)
        result["dtypes"] = df.dtypes.astype(str).to_dict()
        result["missing_values"] = null_mask.sum().to_dict()
        
        # Check for common issues
        # Duplicate column names
        # NOTE(review): pandas renames repeated headers by default
        # (e.g. "a", "a.1"), so this check may rarely fire - confirm.
        if len(df.columns) != len(set(df.columns)):
            result["issues"].append("Duplicate column names detected")
        
        # Entirely empty columns
        empty_cols = df.columns[null_mask.all()].tolist()
        if empty_cols:
            result["issues"].append(f"Empty columns: {empty_cols}")
        
        # High missing value ratio. Skipped for files with no data rows:
        # the ratio is undefined there and dividing by zero would be
        # misreported as a read failure by the handler below.
        if row_count:
            high_missing = [
                col for col, count in result["missing_values"].items()
                if count / row_count > 0.5
            ]
            if high_missing:
                result["issues"].append(f"Columns with >50% missing: {high_missing}")
            
    except Exception as e:
        result["issues"].append(f"Failed to read file: {e}")
    
    return result


def get_file_info(files: List[Any]) -> List[Dict[str, Any]]:
    """
    Collect metadata for each uploaded file.
    
    Each entry is resolved to a path (its .name attribute when present,
    otherwise its str() form) and then described by base name, size on
    disk and the result of validate_csv.
    
    Args:
        files: List of file objects with .name attribute.
        
    Returns:
        One dictionary per input entry, in input order, with keys:
            - name: Base name of the file path
            - size_kb: File size in KB (bytes / 1024), or 0 when the
              path does not exist
            - validation: Output from validate_csv for that path
            
    Example:
        >>> info = get_file_info(uploaded_files)
        >>> for f in info:
        ...     print(f"{f['name']}: {f['validation']['rows']} rows")
    """
    def _describe(entry: Any) -> Dict[str, Any]:
        # Resolve the path the same way for upload objects and plain strings
        if hasattr(entry, 'name'):
            path = entry.name
        else:
            path = str(entry)
        
        base_name = os.path.basename(path)
        
        # Missing paths report a size of 0 instead of raising
        if os.path.exists(path):
            size_kb = os.path.getsize(path) / 1024
        else:
            size_kb = 0
        
        return {
            "name": base_name,
            "size_kb": size_kb,
            "validation": validate_csv(path),
        }
    
    return [_describe(entry) for entry in files]