# Hugging Face Space (page-status residue from extraction: "Sleeping")
| import os | |
| import io | |
| import json | |
| import logging | |
| import pandas as pd | |
| from flask import Flask, render_template, request, jsonify, send_file, session | |
| from werkzeug.utils import secure_filename | |
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Flask application setup.
app = Flask(__name__)
# Random per-process secret: any session data will not survive a restart --
# presumably acceptable since the handlers below are stateless; confirm if
# `session` is ever actually used.
app.secret_key = os.urandom(24)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 # 50MB limit
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
# File extensions the upload endpoint accepts.
ALLOWED_EXTENSIONS = {'csv', 'json', 'xlsx'}
def allowed_file(filename):
    """Return True if *filename* has an extension in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
def check_robustness(file_stream, ext=None):
    """Check an uploaded stream for basic content-safety constraints.

    Reads the first 4 KiB and rewinds the stream. Null bytes indicate binary
    content, which is invalid for text formats (csv/json) but expected for
    xlsx (a zip archive), so the optional *ext* controls whether null bytes
    are treated as an error.

    Args:
        file_stream: seekable binary stream positioned at the start.
        ext: optional lowercase file extension; when 'xlsx', null bytes are
            permitted. Defaults to None (treated as a text format).

    Returns:
        Tuple (is_safe, message); message is "" when the stream is safe.
    """
    try:
        chunk = file_stream.read(4096)
        file_stream.seek(0)
        # xlsx files are zip archives, so binary content is legitimate there;
        # for text formats a null byte means this is not a real CSV/JSON file.
        if ext != 'xlsx' and b'\0' in chunk:
            return False, "Binary content detected"
        return True, ""
    except Exception as e:
        return False, f"Error checking file robustness: {str(e)}"
def load_df(filepath, ext):
    """Load *filepath* into a DataFrame using the reader for *ext*.

    Returns None when the extension is not one of csv/json/xlsx.
    """
    readers = {
        'csv': pd.read_csv,
        'json': pd.read_json,
        'xlsx': pd.read_excel,
    }
    reader = readers.get(ext)
    return reader(filepath) if reader is not None else None
def df_to_json_preview(df, rows=50):
    """Build a JSON-serializable preview of the first *rows* rows of *df*.

    Returns {"data": row dicts, "columns": column names, "stats": summary},
    where stats counts total rows/columns, missing cells, and duplicate rows.
    NaNs in the preview are replaced with "" so the payload serializes cleanly.
    """
    head = df.head(rows).fillna("")
    summary = {
        "rows": len(df),
        "columns": len(df.columns),
        "missing_values": int(df.isnull().sum().sum()),
        "duplicates": int(df.duplicated().sum()),
    }
    return {
        "data": head.to_dict(orient='records'),
        "columns": list(df.columns),
        "stats": summary,
    }
def index():
    """Render the single-page UI.

    NOTE(review): a route decorator (e.g. @app.route('/')) appears to be
    missing here -- possibly lost in extraction; confirm against the repo.
    """
    template_name = 'index.html'
    return render_template(template_name)
def health():
    """Liveness probe: always reports healthy with HTTP 200.

    NOTE(review): route decorator (e.g. @app.route('/health')) appears to be
    missing -- confirm against the repo.
    """
    payload = {"status": "healthy"}
    return jsonify(payload), 200
def load_demo():
    """Generate a 100-row demo dataset, save it as CSV, and return its preview.

    The dataset deliberately contains missing values so the cleaning
    operations (dropna/fillna/...) have something to act on.
    """
    try:
        import numpy as np
        offsets = pd.Series([1, 2, 5] * 33 + [1])
        demo = pd.DataFrame({
            "Date": pd.date_range(start='2024-01-01', periods=100),
            "Category": ['A', 'B', 'C', 'A', 'B'] * 20,
            "Value": pd.Series(range(100)) + offsets,
            "Status": ['Active', 'Inactive', 'Pending', 'Active'] * 25,
        })
        # Punch a few holes into the data for the cleaning ops to find.
        demo.loc[5:10, 'Value'] = np.nan
        demo.loc[15:20, 'Status'] = np.nan
        filename = "demo_data.csv"
        destination = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        demo.to_csv(destination, index=False)
        return jsonify({
            "message": "Demo data loaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(demo)
        })
    except Exception as e:
        logger.error(f"Demo load error: {e}")
        return jsonify({"error": str(e)}), 500
def upload_file():
    """Handle a file upload: validate, save to UPLOAD_FOLDER, parse, preview.

    Returns the saved filename so the stateless client can echo it back on
    subsequent /process and /export requests.

    NOTE(review): a route decorator (e.g. @app.route('/upload',
    methods=['POST'])) appears to be missing -- confirm against the repo.

    Fix: the previous version called check_robustness() and discarded its
    result, then repeated the same null-byte check inline alongside a block of
    work-in-progress comments; the dead call and comments are removed and the
    single inline check is kept.
    """
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not allowed_file(file.filename):
            return jsonify({"error": "File type not allowed. Use CSV, JSON, or XLSX."}), 400
        filename = secure_filename(file.filename)
        ext = filename.rsplit('.', 1)[1].lower()
        # Robustness check: text formats must not contain null bytes.
        # (xlsx is a zip archive, so binary content is expected there.)
        if ext in ['csv', 'json']:
            chunk = file.stream.read(4096)
            file.stream.seek(0)
            if b'\0' in chunk:
                return jsonify({"error": "File contains null bytes (binary suspected). Please upload a valid text file for CSV/JSON."}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)
        # Parse immediately so a malformed file is rejected with a 400 now
        # rather than failing later in /process.
        try:
            df = load_df(filepath, ext)
        except Exception as e:
            return jsonify({"error": f"Failed to parse file: {str(e)}"}), 400
        return jsonify({
            "message": "File uploaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Upload error: {e}")
        return jsonify({"error": str(e)}), 500
def process_data():
    """Apply an operations pipeline to an uploaded file and return a preview.

    Expects JSON: {"filename": "...", "operations": [{"type": ..., "params":
    {...}}, ...]}. The file is reloaded and the full pipeline re-applied on
    every call, so the server stays stateless.

    NOTE(review): route decorator (e.g. @app.route('/process',
    methods=['POST'])) appears to be missing -- confirm against the repo.

    Fix: the previous version carried a ~60-line inline copy of the pipeline
    that had drifted from apply_operations() (its '==' filter compared raw
    values while the export path string-compares). It now delegates to
    apply_operations() so preview and export share one code path.
    """
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        df = apply_operations(df, operations)
        return jsonify({
            "message": "Processed successfully",
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Processing error: {e}")
        return jsonify({"error": str(e)}), 500
def export_data():
    """Re-apply the operations pipeline and stream the result as a download.

    Expects JSON: {"filename": ..., "operations": [...], "format":
    "csv"|"json"|"xlsx" (default "csv")}.

    NOTE(review): route decorator (e.g. @app.route('/export',
    methods=['POST'])) appears to be missing -- confirm against the repo.

    Fixes: removed a dead `for op in operations: pass` loop and its
    work-in-progress comments (apply_operations already does the work);
    a missing filename now returns 400 instead of crashing into a 500
    inside secure_filename; a missing file returns 404, matching
    process_data.
    """
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        format_type = data.get('format', 'csv')
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        # Re-apply operations from scratch (stateless server).
        df = apply_operations(df, operations)
        output = io.BytesIO()
        if format_type == 'csv':
            df.to_csv(output, index=False)
            mimetype = 'text/csv'
            download_name = 'processed_data.csv'
        elif format_type == 'json':
            df.to_json(output, orient='records')
            mimetype = 'application/json'
            download_name = 'processed_data.json'
        elif format_type == 'xlsx':
            df.to_excel(output, index=False)
            mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            download_name = 'processed_data.xlsx'
        else:
            return jsonify({"error": "Invalid format"}), 400
        output.seek(0)
        return send_file(
            output,
            mimetype=mimetype,
            as_attachment=True,
            download_name=download_name
        )
    except Exception as e:
        logger.error(f"Export error: {e}")
        return jsonify({"error": str(e)}), 500
def apply_operations(df, operations):
    """Apply a list of cleaning/transform operations to *df* and return it.

    Each operation is {"type": <op>, "params": {...}}. Unknown op types and
    references to missing columns are skipped silently so a stale client
    pipeline cannot crash the request.

    Supported ops: drop_duplicates, dropna, fillna, filter, sort, rename,
    select_columns.

    Fixes: fillna(method=...) is deprecated (removed in pandas 3.0) and is
    replaced by the equivalent .ffill()/.bfill(); the bare `except:` clauses
    around numeric filters now catch only (TypeError, ValueError).
    """
    for op in operations:
        op_type = op.get('type')
        params = op.get('params', {})
        if op_type == 'drop_duplicates':
            subset = params.get('subset')
            if subset:
                df = df.drop_duplicates(subset=subset)
            else:
                df = df.drop_duplicates()
        elif op_type == 'dropna':
            how = params.get('how', 'any')
            subset = params.get('subset')
            if subset:
                df = df.dropna(how=how, subset=subset)
            else:
                df = df.dropna(how=how)
        elif op_type == 'fillna':
            value = params.get('value')
            method = params.get('method')  # 'ffill'/'pad' or 'bfill'/'backfill'
            subset = params.get('subset')
            if subset:
                # Accept a single column name or a list of columns.
                if isinstance(subset, str):
                    subset = [subset]
                valid_subset = [c for c in subset if c in df.columns]
                if method:
                    # .ffill()/.bfill() replace deprecated fillna(method=...).
                    if method in ('bfill', 'backfill'):
                        df[valid_subset] = df[valid_subset].bfill()
                    else:
                        df[valid_subset] = df[valid_subset].ffill()
                else:
                    df[valid_subset] = df[valid_subset].fillna(value)
            else:
                if method:
                    df = df.bfill() if method in ('bfill', 'backfill') else df.ffill()
                else:
                    df = df.fillna(value)
        elif op_type == 'filter':
            col = params.get('column')
            operator = params.get('operator')  # ==, !=, >, <, contains
            value = params.get('value')
            if col in df.columns:
                if operator == '==':
                    df = df[df[col].astype(str) == str(value)]
                elif operator == '!=':
                    df = df[df[col].astype(str) != str(value)]
                elif operator == '>':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') > float(value)]
                    except (TypeError, ValueError):
                        pass  # non-numeric threshold: leave df unfiltered
                elif operator == '<':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') < float(value)]
                    except (TypeError, ValueError):
                        pass
                elif operator == 'contains':
                    # NOTE(review): str.contains treats *value* as a regex by
                    # default; a malformed pattern from the client surfaces as
                    # a 500 in the route. Consider regex=False if literal
                    # matching is the intent -- confirm with the UI.
                    df = df[df[col].astype(str).str.contains(str(value), na=False)]
        elif op_type == 'sort':
            col = params.get('column')
            ascending = params.get('ascending', True)
            if col in df.columns:
                df = df.sort_values(by=col, ascending=ascending)
        elif op_type == 'rename':
            mapping = params.get('mapping')  # {old_name: new_name}
            if mapping:
                df = df.rename(columns=mapping)
        elif op_type == 'select_columns':
            cols = params.get('columns')
            if cols:
                valid_cols = [c for c in cols if c in df.columns]
                df = df[valid_cols]
    return df
# Entry point: bind on all interfaces; port 7860 suggests a Hugging Face
# Spaces deployment -- presumably intentional, confirm against the Space config.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)