# cascade/system/folder_processor.py
# tostido — Initial commit - cascade-lattice 0.5.4 (77bcbf1)
"""
CASCADE Folder Processor
Handle batch processing of multiple files in folders
"""
import os
import zipfile
import tempfile
from pathlib import Path
from typing import List, Dict, Any, Tuple
import pandas as pd
def process_folder_upload(files: List[Any]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process multiple uploaded files and combine them into one DataFrame.

    Each file is read according to its extension (.csv, .json, .jsonl,
    .parquet, .xlsx/.xls); anything else falls back to plain-text line
    extraction via file_extractors. A "source_file" column is added to
    every row so each record's origin survives the final concat.

    Args:
        files: List of uploaded file objects from Gradio; each must expose
            a ``name`` attribute holding its path on disk.

    Returns:
        Tuple of (combined_dataframe, processing_summary). The DataFrame is
        None when nothing could be read; the summary then carries an
        "error" key plus per-file details instead of the usual counters.
    """
    if not files:
        return None, {"error": "No files provided"}

    all_data: List[pd.DataFrame] = []
    file_summary: List[Dict[str, Any]] = []
    total_rows = 0

    # Dispatch table for the structured formats supported natively.
    readers = {
        ".csv": pd.read_csv,
        ".json": pd.read_json,
        ".jsonl": lambda p: pd.read_json(p, lines=True),
        ".parquet": pd.read_parquet,
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
    }

    for file_obj in files:
        # Resolve the display name BEFORE the try-block so the except
        # handler can always report which file failed. (Previously
        # file_name was bound inside the try, so an early failure raised
        # NameError in the handler itself.)
        file_path = getattr(file_obj, "name", "")
        file_name = Path(file_path).name if file_path else "<unknown>"
        try:
            file_ext = Path(file_path).suffix.lower()

            df = None
            reader = readers.get(file_ext)
            if reader is not None:
                df = reader(file_path)
            else:
                # Unknown extension: try to extract raw text lines.
                from .file_extractors import extract_from_file
                result = extract_from_file(file_path)
                if result.lines:
                    df = pd.DataFrame([{"text": line, "source_file": file_name}
                                       for line in result.lines])
                else:
                    file_summary.append({
                        "file": file_name,
                        "status": "skipped",
                        "reason": "Unsupported format"
                    })
                    continue

            if df is not None and len(df) > 0:
                # Tag every row with its origin and record the outcome.
                df["source_file"] = file_name
                all_data.append(df)
                file_summary.append({
                    "file": file_name,
                    "status": "success",
                    "rows": len(df),
                    "columns": len(df.columns)
                })
                total_rows += len(df)
            else:
                # Previously, readable-but-empty files vanished from the
                # summary entirely; record them as skipped instead.
                file_summary.append({
                    "file": file_name,
                    "status": "skipped",
                    "reason": "Empty file"
                })
        except Exception as e:
            file_summary.append({
                "file": file_name,
                "status": "error",
                "error": str(e)
            })

    # Combine everything that parsed successfully.
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        summary = {
            "total_files": len(files),
            "processed_files": len([s for s in file_summary if s["status"] == "success"]),
            "total_rows": total_rows,
            "file_details": file_summary
        }
        return combined_df, summary
    else:
        return None, {"error": "No files could be processed", "details": file_summary}
def process_zip_file(zip_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process a zip file containing multiple data files.

    Extracts the archive into a temporary directory, wraps every extracted
    file in a lightweight object exposing a ``name`` attribute (the only
    interface process_folder_upload needs from a Gradio upload), and
    delegates to process_folder_upload.

    Args:
        zip_path: Path to the zip file.

    Returns:
        Tuple of (combined_dataframe, processing_summary) — see
        process_folder_upload for the exact shape.
    """
    # Minimal stand-in for a Gradio upload object; declared once here
    # instead of being re-defined on every loop iteration as before.
    class _ExtractedFile:
        def __init__(self, path: str):
            self.name = path

    with tempfile.TemporaryDirectory() as temp_dir:
        # NOTE(review): extractall on untrusted archives relies on
        # zipfile's member-name sanitization — confirm inputs are trusted.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Collect every extracted file, at any nesting depth.
        extracted_files = [
            _ExtractedFile(os.path.join(root, fname))
            for root, _dirs, filenames in os.walk(temp_dir)
            for fname in filenames
        ]
        # Must run inside the with-block: the extracted files are deleted
        # as soon as the TemporaryDirectory context exits.
        return process_folder_upload(extracted_files)