File size: 4,147 Bytes
77bcbf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
CASCADE Folder Processor
Handle batch processing of multiple files in folders
"""

import os
import zipfile
import tempfile
from pathlib import Path
from typing import List, Dict, Any, Tuple
import pandas as pd

def _resolve_upload_path(file_obj: Any) -> str:
    """Return the filesystem path of one uploaded item.

    Accepts either a plain path (``str`` / ``os.PathLike``) or an object that
    exposes its path through a ``name`` attribute.
    """
    if isinstance(file_obj, (str, os.PathLike)):
        return os.fspath(file_obj)
    return file_obj.name


def _read_as_dataframe(file_path: str, file_ext: str):
    """Load one file into a DataFrame based on its lower-cased extension.

    Returns ``None`` when the extension is not a known tabular format and the
    text extractor yields no lines. Reader exceptions propagate to the caller.
    """
    if file_ext == ".csv":
        return pd.read_csv(file_path)
    if file_ext == ".json":
        return pd.read_json(file_path)
    if file_ext == ".jsonl":
        return pd.read_json(file_path, lines=True)
    if file_ext == ".parquet":
        return pd.read_parquet(file_path)
    if file_ext in (".xlsx", ".xls"):
        return pd.read_excel(file_path)

    # For other formats, try to extract text (one row per extracted line).
    # Imported lazily so the tabular formats work without the extractor module.
    from .file_extractors import extract_from_file
    result = extract_from_file(file_path)
    if result.lines:
        return pd.DataFrame([{"text": line} for line in result.lines])
    return None


def process_folder_upload(files: List[Any]) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process multiple uploaded files and combine them

    Each file is read independently; a failure on one file is recorded in the
    summary and does not stop the remaining files from being processed.

    Args:
        files: List of uploaded files. Each item is either an object with a
            ``name`` attribute holding the file path, or a plain path
            (``str`` / ``os.PathLike``).

    Returns:
        Tuple of (combined_dataframe, processing_summary). The dataframe has a
        ``source_file`` column holding the originating file name. When no file
        yields any rows, the dataframe is ``None`` and the summary is a dict
        with an ``"error"`` key (plus ``"details"`` when files were attempted).
    """
    if not files:
        return None, {"error": "No files provided"}

    all_data = []
    file_summary = []

    for file_obj in files:
        # Bound before the try so the error entry can always be written, even
        # when resolving the path itself fails; reset per file so a failure is
        # never attributed to the previous file's name.
        file_name = "<unknown>"
        try:
            file_path = _resolve_upload_path(file_obj)
            file_name = Path(file_path).name
            file_ext = Path(file_path).suffix.lower()

            df = _read_as_dataframe(file_path, file_ext)

            if df is None:
                file_summary.append({
                    "file": file_name,
                    "status": "skipped",
                    "reason": "Unsupported format"
                })
                continue

            if len(df) == 0:
                # Record empty files instead of dropping them silently.
                file_summary.append({
                    "file": file_name,
                    "status": "skipped",
                    "reason": "No data rows"
                })
                continue

            # Add source file column
            df["source_file"] = file_name
            all_data.append(df)
            file_summary.append({
                "file": file_name,
                "status": "success",
                "rows": len(df),
                "columns": len(df.columns)
            })

        except Exception as e:
            # Per-file boundary: deliberately broad so one unreadable file
            # cannot abort the batch; the failure is reported in the summary.
            file_summary.append({
                "file": file_name,
                "status": "error",
                "error": str(e)
            })

    if not all_data:
        return None, {"error": "No files could be processed", "details": file_summary}

    # Combine all data
    combined_df = pd.concat(all_data, ignore_index=True)
    summary = {
        "total_files": len(files),
        "processed_files": sum(1 for s in file_summary if s["status"] == "success"),
        "total_rows": sum(len(df) for df in all_data),
        "file_details": file_summary
    }
    return combined_df, summary

def process_zip_file(zip_path: str) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """
    Process a zip file containing multiple files

    The archive is extracted into a temporary directory, every extracted file
    (at any depth) is handed to ``process_folder_upload``, and the temporary
    directory is removed before returning.

    Args:
        zip_path: Path to the zip file

    Returns:
        Tuple of (combined_dataframe, processing_summary), exactly as returned
        by ``process_folder_upload``.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    # Defined once per call instead of once per extracted file.
    class MockFile:
        """Minimal stand-in for an uploaded file: exposes only ``name``."""

        def __init__(self, path):
            self.name = path

    with tempfile.TemporaryDirectory() as temp_dir:
        # Extract zip
        # NOTE(review): the extracted size is not bounded here; if archives can
        # come from untrusted users, inspect `infolist()` sizes before extracting.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Find all extracted files. Directory listings come back in arbitrary
        # order, so sort both levels to make the row order of the combined
        # dataframe reproducible across runs and filesystems.
        extracted_files = []
        for root, dirs, files in os.walk(temp_dir):
            dirs.sort()  # in-place: controls the order os.walk descends in
            for file in sorted(files):
                extracted_files.append(MockFile(os.path.join(root, file)))

        # Must run inside the `with`: the files vanish when temp_dir is cleaned up.
        return process_folder_upload(extracted_files)