""" File Operations Utilities This module provides helper functions for file handling operations, particularly for CSV file processing in the NexDatawork agents. Functions: load_csv_files: Load multiple CSV files into DataFrames validate_csv: Validate CSV file structure and content get_file_info: Get metadata about uploaded files """ import os from typing import List, Dict, Any, Tuple import pandas as pd def load_csv_files(files: List[Any]) -> Tuple[pd.DataFrame, List[str]]: """ Load multiple CSV files and concatenate them into a single DataFrame. This function reads all provided CSV files and combines them into a single DataFrame. Files are concatenated vertically (row-wise). Args: files: List of file objects with a .name attribute pointing to CSV paths. Typically comes from Gradio or similar file upload components. Returns: Tuple containing: - pd.DataFrame: The concatenated DataFrame - List[str]: List of loaded file names Raises: ValueError: If no files are provided or all files fail to load. Example: >>> df, names = load_csv_files(uploaded_files) >>> print(f"Loaded {len(names)} files with {len(df)} total rows") """ if not files: raise ValueError("No files provided") dataframes = [] loaded_names = [] errors = [] for f in files: try: # Get file path from file object file_path = f.name if hasattr(f, 'name') else str(f) # Load CSV df = pd.read_csv(file_path) dataframes.append(df) loaded_names.append(os.path.basename(file_path)) except Exception as e: errors.append(f"{file_path}: {e}") if not dataframes: raise ValueError(f"Failed to load any files. Errors: {errors}") # Concatenate all DataFrames combined = pd.concat(dataframes, ignore_index=True) return combined, loaded_names def validate_csv(file_path: str) -> Dict[str, Any]: """ Validate a CSV file and return information about its structure. This function checks if a CSV file is valid and returns metadata including column types, row count, and any detected issues. Args: file_path: Path to the CSV file to validate. Returns: Dict with keys: - valid (bool): Whether the file is a valid CSV - rows (int): Number of rows - columns (int): Number of columns - column_names (List[str]): Column names - dtypes (Dict): Column data types - missing_values (Dict): Count of missing values per column - issues (List[str]): Any detected issues Example: >>> info = validate_csv("sales.csv") >>> if info["valid"]: ... print(f"Valid CSV with {info['rows']} rows") """ result = { "valid": False, "rows": 0, "columns": 0, "column_names": [], "dtypes": {}, "missing_values": {}, "issues": [] } try: # Attempt to load the CSV df = pd.read_csv(file_path) result["valid"] = True result["rows"] = len(df) result["columns"] = len(df.columns) result["column_names"] = list(df.columns) result["dtypes"] = df.dtypes.astype(str).to_dict() result["missing_values"] = df.isnull().sum().to_dict() # Check for common issues # Duplicate column names if len(df.columns) != len(set(df.columns)): result["issues"].append("Duplicate column names detected") # Entirely empty columns empty_cols = df.columns[df.isnull().all()].tolist() if empty_cols: result["issues"].append(f"Empty columns: {empty_cols}") # High missing value ratio high_missing = [ col for col, count in result["missing_values"].items() if count / len(df) > 0.5 ] if high_missing: result["issues"].append(f"Columns with >50% missing: {high_missing}") except Exception as e: result["issues"].append(f"Failed to read file: {e}") return result def get_file_info(files: List[Any]) -> List[Dict[str, Any]]: """ Get metadata about multiple uploaded files. Args: files: List of file objects with .name attribute. Returns: List of dictionaries containing file metadata: - name: File name - size_kb: File size in KB - validation: Output from validate_csv Example: >>> info = get_file_info(uploaded_files) >>> for f in info: ... print(f"{f['name']}: {f['validation']['rows']} rows") """ results = [] for f in files: file_path = f.name if hasattr(f, 'name') else str(f) file_info = { "name": os.path.basename(file_path), "size_kb": os.path.getsize(file_path) / 1024 if os.path.exists(file_path) else 0, "validation": validate_csv(file_path) } results.append(file_info) return results