# cascade/system/folder_processor.py
# tostido — Initial commit - cascade-lattice 0.5.4 (77bcbf1)
"""
CASCADE Folder Processor
Handle batch processing of multiple files in folders
"""
import os
import zipfile
import tempfile
from pathlib import Path
from typing import List, Dict, Any, Tuple
import pandas as pd
def process_folder_upload(files: List[Any]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process multiple uploaded files and combine them into one DataFrame.

    Each file is read according to its extension (.csv, .json, .jsonl,
    .parquet, .xlsx/.xls); anything else falls back to plain-text line
    extraction via file_extractors. A "source_file" column is added to
    every row so each record's origin survives the final concat.

    Args:
        files: List of uploaded file objects from Gradio; each must expose
            a ``name`` attribute holding its path on disk.

    Returns:
        Tuple of (combined_dataframe, processing_summary). The DataFrame is
        None when nothing could be read; the summary then carries an
        "error" key plus per-file details instead of the usual counters.
    """
    if not files:
        return None, {"error": "No files provided"}

    all_data: List[pd.DataFrame] = []
    file_summary: List[Dict[str, Any]] = []
    total_rows = 0

    # Dispatch table for the structured formats supported natively.
    readers = {
        ".csv": pd.read_csv,
        ".json": pd.read_json,
        ".jsonl": lambda p: pd.read_json(p, lines=True),
        ".parquet": pd.read_parquet,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
    }

    for file_obj in files:
        # Resolve the display name BEFORE the try-block so the except
        # handler can always report which file failed. (Previously
        # file_name was bound inside the try, so an early failure raised
        # NameError in the handler itself.)
        file_path = getattr(file_obj, "name", "")
        file_name = Path(file_path).name if file_path else "<unknown>"
        try:
            file_ext = Path(file_path).suffix.lower()

            df = None
            reader = readers.get(file_ext)
            if reader is not None:
                df = reader(file_path)
            else:
                # Unknown extension: try to extract raw text lines.
                from .file_extractors import extract_from_file
                result = extract_from_file(file_path)
                if result.lines:
                    df = pd.DataFrame([{"text": line, "source_file": file_name}
                                       for line in result.lines])
                else:
                    file_summary.append({
                        "file": file_name,
                        "status": "skipped",
                        "reason": "Unsupported format"
                    })
                    continue

            if df is not None and len(df) > 0:
                # Tag every row with its origin and record the outcome.
                df["source_file"] = file_name
                all_data.append(df)
                file_summary.append({
                    "file": file_name,
                    "status": "success",
                    "rows": len(df),
                    "columns": len(df.columns)
                })
                total_rows += len(df)
            else:
                # Previously, readable-but-empty files vanished from the
                # summary entirely; record them as skipped instead.
                file_summary.append({
                    "file": file_name,
                    "status": "skipped",
                    "reason": "Empty file"
                })
        except Exception as e:
            file_summary.append({
                "file": file_name,
                "status": "error",
                "error": str(e)
            })

    # Combine everything that parsed successfully.
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        summary = {
            "total_files": len(files),
            "processed_files": len([s for s in file_summary if s["status"] == "success"]),
            "total_rows": total_rows,
            "file_details": file_summary
        }
        return combined_df, summary
    else:
        return None, {"error": "No files could be processed", "details": file_summary}
def process_zip_file(zip_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process a zip file containing multiple data files.

    Extracts the archive into a temporary directory, wraps every extracted
    file in a lightweight object exposing a ``name`` attribute (the only
    interface process_folder_upload needs from a Gradio upload), and
    delegates to process_folder_upload.

    Args:
        zip_path: Path to the zip file.

    Returns:
        Tuple of (combined_dataframe, processing_summary) — see
        process_folder_upload for the exact shape.
    """
    # Minimal stand-in for a Gradio upload object; declared once here
    # instead of being re-defined on every loop iteration as before.
    class _ExtractedFile:
        def __init__(self, path: str):
            self.name = path

    with tempfile.TemporaryDirectory() as temp_dir:
        # NOTE(review): extractall on untrusted archives relies on
        # zipfile's member-name sanitization — confirm inputs are trusted.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Collect every extracted file, at any nesting depth.
        extracted_files = [
            _ExtractedFile(os.path.join(root, fname))
            for root, _dirs, filenames in os.walk(temp_dir)
            for fname in filenames
        ]
        # Must run inside the with-block: the extracted files are deleted
        # as soon as the TemporaryDirectory context exits.
        return process_folder_upload(extracted_files)