"""Flask service for uploading, cleaning, previewing, and exporting tabular data.

Supported formats: CSV, JSON, XLSX.  The service is stateless: the client
holds the sanitized filename returned by upload and re-sends it (plus the
operation pipeline) on each /api/process and /api/export call.
"""

import io
import logging
import os

import numpy as np
import pandas as pd
from flask import Flask, jsonify, render_template, request, send_file
from werkzeug.utils import secure_filename

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

app = Flask(__name__)
app.secret_key = os.urandom(24)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50MB limit
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'

# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

ALLOWED_EXTENSIONS = {'csv', 'json', 'xlsx'}

# Formats that must be plain text.  xlsx is legitimately binary (a zip
# archive), so the null-byte check below must not be applied to it.
TEXT_EXTENSIONS = {'csv', 'json'}


def allowed_file(filename):
    """Return True if *filename* has one of the permitted extensions."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def check_robustness(file_stream):
    """Inspect a text upload for binary content.

    Reads the first 4 KiB and looks for null bytes, which indicate a binary
    file masquerading as CSV/JSON.  The stream is rewound before returning
    so the caller can still save it.

    Returns:
        tuple[bool, str]: (is_safe, message).  is_safe is False when binary
        content is detected or the stream could not be inspected.
    """
    try:
        chunk = file_stream.read(4096)
        file_stream.seek(0)
        # Text files should never contain null bytes; only call this for
        # CSV/JSON streams (see TEXT_EXTENSIONS).
        if b'\0' in chunk:
            return False, "Binary content detected"
        return True, ""
    except Exception as e:
        return False, f"Error checking file robustness: {str(e)}"


def load_df(filepath, ext):
    """Load *filepath* into a DataFrame according to *ext*, or None if unknown."""
    if ext == 'csv':
        return pd.read_csv(filepath)
    elif ext == 'json':
        return pd.read_json(filepath)
    elif ext == 'xlsx':
        return pd.read_excel(filepath)
    return None


def df_to_json_preview(df, rows=50):
    """Convert first N rows of DF to JSON for preview, plus summary stats."""
    preview = df.head(rows).fillna("").to_dict(orient='records')
    columns = list(df.columns)
    stats = {
        "rows": len(df),
        "columns": len(columns),
        "missing_values": int(df.isnull().sum().sum()),
        "duplicates": int(df.duplicated().sum()),
    }
    return {"data": preview, "columns": columns, "stats": stats}


def _fill_missing(frame, method, value):
    """Fill NaNs via ffill/bfill or a constant.

    Uses .ffill()/.bfill() instead of the deprecated fillna(method=...).
    """
    if method == 'ffill':
        return frame.ffill()
    if method == 'bfill':
        return frame.bfill()
    return frame.fillna(value)


def _apply_filter(df, params):
    """Filter rows by a single (column, operator, value) triple.

    Unknown columns, unknown operators, or non-numeric thresholds for
    '>'/'<' make the filter a no-op rather than an error.
    """
    col = params.get('column')
    operator = params.get('operator')  # ==, !=, >, <, contains
    value = params.get('value')
    if col not in df.columns:
        return df
    if operator == '==':
        return df[df[col].astype(str) == str(value)]
    if operator == '!=':
        return df[df[col].astype(str) != str(value)]
    if operator in ('>', '<'):
        try:
            threshold = float(value)
        except (TypeError, ValueError):
            return df
        numeric = pd.to_numeric(df[col], errors='coerce')
        return df[numeric > threshold] if operator == '>' else df[numeric < threshold]
    if operator == 'contains':
        return df[df[col].astype(str).str.contains(str(value), na=False)]
    return df


def apply_operations(df, operations):
    """Apply a pipeline of cleaning operations to *df* and return the result.

    Each operation is a dict: {"type": <name>, "params": {...}}.  Unknown
    operation types and missing columns are skipped silently so a partially
    invalid pipeline still yields a usable result.  Shared by /api/process
    and /api/export so both endpoints behave identically.
    """
    for op in operations:
        op_type = op.get('type')
        params = op.get('params', {})

        if op_type == 'drop_duplicates':
            # subset=None means "all columns", matching the no-subset branch.
            df = df.drop_duplicates(subset=params.get('subset') or None)

        elif op_type == 'dropna':
            how = params.get('how', 'any')
            subset = params.get('subset')
            if subset:
                df = df.dropna(how=how, subset=subset)
            else:
                df = df.dropna(how=how)

        elif op_type == 'fillna':
            value = params.get('value')
            method = params.get('method')  # 'ffill' or 'bfill'
            subset = params.get('subset')  # column(s) to apply to
            if subset:
                if isinstance(subset, str):
                    subset = [subset]
                valid_subset = [c for c in subset if c in df.columns]
                if valid_subset:
                    df[valid_subset] = _fill_missing(df[valid_subset], method, value)
            else:
                df = _fill_missing(df, method, value)

        elif op_type == 'filter':
            df = _apply_filter(df, params)

        elif op_type == 'sort':
            col = params.get('column')
            ascending = params.get('ascending', True)
            if col in df.columns:
                df = df.sort_values(by=col, ascending=ascending)

        elif op_type == 'rename':
            mapping = params.get('mapping')  # {old: new}
            if mapping:
                df = df.rename(columns=mapping)

        elif op_type == 'select_columns':
            cols = params.get('columns')
            if cols:
                df = df[[c for c in cols if c in df.columns]]

    return df


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/health')
def health():
    return jsonify({"status": "healthy"}), 200


@app.route('/api/load_demo', methods=['POST'])
def load_demo():
    """Generate a 100-row demo dataset (with injected NaNs) and save it as CSV."""
    try:
        data = {
            "Date": pd.date_range(start='2024-01-01', periods=100),
            "Category": ['A', 'B', 'C', 'A', 'B'] * 20,
            "Value": pd.Series(range(100)) + pd.Series([1, 2, 5] * 33 + [1]),
            "Status": ['Active', 'Inactive', 'Pending', 'Active'] * 25,
        }
        df = pd.DataFrame(data)

        # Inject some missing values so the cleaning operations have work to do.
        df.loc[5:10, 'Value'] = np.nan
        df.loc[15:20, 'Status'] = np.nan

        filename = "demo_data.csv"
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        df.to_csv(filepath, index=False)

        return jsonify({
            "message": "Demo data loaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df),
        })
    except Exception as e:
        logger.error(f"Demo load error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Validate, save, parse, and preview an uploaded CSV/JSON/XLSX file."""
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not allowed_file(file.filename):
            return jsonify({"error": "File type not allowed. Use CSV, JSON, or XLSX."}), 400

        filename = secure_filename(file.filename)
        ext = filename.rsplit('.', 1)[1].lower()

        # Null-byte check applies only to text formats; xlsx is binary by design.
        if ext in TEXT_EXTENSIONS:
            is_safe, _msg = check_robustness(file.stream)
            if not is_safe:
                return jsonify({"error": "File contains null bytes (binary suspected). Please upload a valid text file for CSV/JSON."}), 400

        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)

        # Load and preview
        try:
            df = load_df(filepath, ext)
        except Exception as e:
            return jsonify({"error": f"Failed to parse file: {str(e)}"}), 400

        # Stateless design: the sanitized filename acts as the client's token.
        return jsonify({
            "message": "File uploaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df),
        })
    except Exception as e:
        logger.error(f"Upload error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/process', methods=['POST'])
def process_data():
    """Re-load the uploaded file and apply the requested operation pipeline."""
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])

        if not filename:
            return jsonify({"error": "Filename missing"}), 400

        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404

        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)

        # Single shared pipeline implementation (same as /api/export).
        df = apply_operations(df, operations)

        return jsonify({
            "message": "Processed successfully",
            "preview": df_to_json_preview(df),
        })
    except Exception as e:
        logger.error(f"Processing error: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/api/export', methods=['POST'])
def export_data():
    """Re-apply the pipeline (stateless) and stream the result in the chosen format."""
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        format_type = data.get('format', 'csv')

        if not filename:
            return jsonify({"error": "Filename missing"}), 400

        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404

        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        df = apply_operations(df, operations)

        output = io.BytesIO()
        if format_type == 'csv':
            df.to_csv(output, index=False)
            mimetype = 'text/csv'
            download_name = 'processed_data.csv'
        elif format_type == 'json':
            df.to_json(output, orient='records')
            mimetype = 'application/json'
            download_name = 'processed_data.json'
        elif format_type == 'xlsx':
            df.to_excel(output, index=False)
            mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            download_name = 'processed_data.xlsx'
        else:
            return jsonify({"error": "Invalid format"}), 400

        output.seek(0)
        return send_file(
            output,
            mimetype=mimetype,
            as_attachment=True,
            download_name=download_name,
        )
    except Exception as e:
        logger.error(f"Export error: {e}")
        return jsonify({"error": str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)