"""
File Operations Utilities
This module provides helper functions for file handling operations,
particularly for CSV file processing in the NexDatawork agents.
Functions:
load_csv_files: Load multiple CSV files into DataFrames
validate_csv: Validate CSV file structure and content
get_file_info: Get metadata about uploaded files
"""
import os
from typing import List, Dict, Any, Tuple
import pandas as pd
def load_csv_files(files: List[Any]) -> Tuple[pd.DataFrame, List[str]]:
    """
    Load multiple CSV files and concatenate them into a single DataFrame.

    This function reads all provided CSV files and combines them into
    a single DataFrame. Files are concatenated vertically (row-wise).
    Files that fail to parse are skipped; loading only fails outright
    when no file could be read.

    Args:
        files: List of file objects with a .name attribute pointing to CSV
            paths (e.g. from Gradio or similar upload components), or plain
            path strings.

    Returns:
        Tuple containing:
            - pd.DataFrame: The concatenated DataFrame
            - List[str]: List of loaded file names

    Raises:
        ValueError: If no files are provided or all files fail to load.

    Example:
        >>> df, names = load_csv_files(uploaded_files)
        >>> print(f"Loaded {len(names)} files with {len(df)} total rows")
    """
    if not files:
        raise ValueError("No files provided")

    dataframes = []
    loaded_names = []
    errors = []

    for f in files:
        # Resolve the path BEFORE the try block: the original resolved it
        # inside, so a failure there left `file_path` unbound and the
        # except clause raised NameError instead of reporting the real error.
        file_path = f.name if hasattr(f, 'name') else str(f)
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            errors.append(f"{file_path}: {e}")
        else:
            dataframes.append(df)
            loaded_names.append(os.path.basename(file_path))

    if not dataframes:
        raise ValueError(f"Failed to load any files. Errors: {errors}")

    # Concatenate all DataFrames row-wise with a fresh index
    combined = pd.concat(dataframes, ignore_index=True)
    return combined, loaded_names
def validate_csv(file_path: str) -> Dict[str, Any]:
    """
    Validate a CSV file and return information about its structure.

    This function checks if a CSV file is valid and returns metadata
    including column types, row count, and any detected issues.

    Args:
        file_path: Path to the CSV file to validate.

    Returns:
        Dict with keys:
            - valid (bool): Whether the file is a valid CSV
            - rows (int): Number of rows
            - columns (int): Number of columns
            - column_names (List[str]): Column names
            - dtypes (Dict): Column data types (as strings)
            - missing_values (Dict): Count of missing values per column
            - issues (List[str]): Any detected issues

    Example:
        >>> info = validate_csv("sales.csv")
        >>> if info["valid"]:
        ...     print(f"Valid CSV with {info['rows']} rows")
    """
    result = {
        "valid": False,
        "rows": 0,
        "columns": 0,
        "column_names": [],
        "dtypes": {},
        "missing_values": {},
        "issues": []
    }
    try:
        # Attempt to load the CSV; any parse failure marks the file invalid
        df = pd.read_csv(file_path)
        result["valid"] = True
        result["rows"] = len(df)
        result["columns"] = len(df.columns)
        result["column_names"] = list(df.columns)
        result["dtypes"] = df.dtypes.astype(str).to_dict()
        result["missing_values"] = df.isnull().sum().to_dict()

        # Check for common issues
        # Duplicate column names
        if len(df.columns) != len(set(df.columns)):
            result["issues"].append("Duplicate column names detected")
        # Entirely empty columns
        empty_cols = df.columns[df.isnull().all()].tolist()
        if empty_cols:
            result["issues"].append(f"Empty columns: {empty_cols}")
        # High missing value ratio. Guard against len(df) == 0: a valid
        # header-only CSV previously triggered ZeroDivisionError here,
        # which was swallowed by the except below and misreported as
        # "Failed to read file".
        if len(df) > 0:
            high_missing = [
                col for col, count in result["missing_values"].items()
                if count / len(df) > 0.5
            ]
            if high_missing:
                result["issues"].append(f"Columns with >50% missing: {high_missing}")
    except Exception as e:
        result["issues"].append(f"Failed to read file: {e}")
    return result
def get_file_info(files: List[Any]) -> List[Dict[str, Any]]:
    """
    Get metadata about multiple uploaded files.

    Args:
        files: List of file objects with .name attribute.

    Returns:
        List of dictionaries containing file metadata:
            - name: File name
            - size_kb: File size in KB
            - validation: Output from validate_csv

    Example:
        >>> info = get_file_info(uploaded_files)
        >>> for f in info:
        ...     print(f"{f['name']}: {f['validation']['rows']} rows")
    """
    def describe(entry: Any) -> Dict[str, Any]:
        # Upload components expose the path via .name; fall back to str().
        path = entry.name if hasattr(entry, 'name') else str(entry)
        # Missing files report a size of 0 rather than raising.
        kb = os.path.getsize(path) / 1024 if os.path.exists(path) else 0
        return {
            "name": os.path.basename(path),
            "size_kb": kb,
            "validation": validate_csv(path),
        }

    return [describe(entry) for entry in files]
|