Spaces:

duqing26
/

smart-data-refinery

Sleeping

File size: 14,490 Bytes

e15a3ce

import os
import io
import json
import logging
import pandas as pd
from flask import Flask, render_template, request, jsonify, send_file, session
from werkzeug.utils import secure_filename

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

app = Flask(__name__)
app.secret_key = os.urandom(24)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50MB limit
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'

# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

ALLOWED_EXTENSIONS = {'csv', 'json', 'xlsx'}

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def check_robustness(file_stream):
    """Check for null bytes and other safety constraints."""
    try:
        # Read a chunk to check for binary content
        chunk = file_stream.read(4096)
        file_stream.seek(0)
        
        # Text files shouldn't have null bytes usually, unless it's some specific encoding.
        # However, Excel files (xlsx) ARE binary (zip archives).
        # We should only check for null bytes if it claims to be CSV or JSON.
        # But we don't know the extension here reliably yet if we just pass the stream.
        # So we should probably pass the filename or extension to this function.
        if b'\0' in chunk:
             return True, "Binary content detected (warning)" # Changed to warning or handle in route
        return True, ""
    except Exception as e:
        return False, f"Error checking file robustness: {str(e)}"

def load_df(filepath, ext):
    if ext == 'csv':
        return pd.read_csv(filepath)
    elif ext == 'json':
        return pd.read_json(filepath)
    elif ext == 'xlsx':
        return pd.read_excel(filepath)
    return None

def df_to_json_preview(df, rows=50):
    """Convert first N rows of DF to JSON for preview."""
    preview = df.head(rows).fillna("").to_dict(orient='records')
    columns = list(df.columns)
    stats = {
        "rows": len(df),
        "columns": len(columns),
        "missing_values": int(df.isnull().sum().sum()),
        "duplicates": int(df.duplicated().sum())
    }
    return {"data": preview, "columns": columns, "stats": stats}

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/health')
def health():
    return jsonify({"status": "healthy"}), 200

@app.route('/api/load_demo', methods=['POST'])
def load_demo():
    try:
        # Create a simple demo dataframe
        data = {
            "Date": pd.date_range(start='2024-01-01', periods=100),
            "Category": ['A', 'B', 'C', 'A', 'B'] * 20,
            "Value": pd.Series(range(100)) + pd.Series([1, 2, 5] * 33 + [1]),
            "Status": ['Active', 'Inactive', 'Pending', 'Active'] * 25
        }
        df = pd.DataFrame(data)
        # Add some random missing values
        import numpy as np
        df.loc[5:10, 'Value'] = np.nan
        df.loc[15:20, 'Status'] = np.nan
        
        filename = "demo_data.csv"
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        df.to_csv(filepath, index=False)
        
        return jsonify({
            "message": "Demo data loaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Demo load error: {e}")
        return jsonify({"error": str(e)}), 500

@app.route('/api/upload', methods=['POST'])
def upload_file():
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        
        if not allowed_file(file.filename):
            return jsonify({"error": "File type not allowed. Use CSV, JSON, or XLSX."}), 400

        filename = secure_filename(file.filename)
        ext = filename.rsplit('.', 1)[1].lower()

        # Robustness check
        # Only check for null bytes if it is a text format (csv, json)
        if ext in ['csv', 'json']:
            is_safe, msg = check_robustness(file.stream)
            # If it returns True (safe) but with a message, it might be a warning, but for text files, binary content is usually bad.
            # However, my previous edit made it return True even if binary.
            # Let's fix that logic inline or revert/adjust check_robustness.
            # Actually, let's just do the check here properly.
            chunk = file.stream.read(4096)
            file.stream.seek(0)
            if b'\0' in chunk:
                 return jsonify({"error": "File contains null bytes (binary suspected). Please upload a valid text file for CSV/JSON."}), 400

        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)

        # Load and Preview
        try:
            df = load_df(filepath, ext)
        except Exception as e:
            return jsonify({"error": f"Failed to parse file: {str(e)}"}), 400
        
        # Store file info in session (stateless ideally, but for simplicity storing path)
        # For a more robust solution, we'd return a token. Let's return a token/filename.
        
        return jsonify({
            "message": "File uploaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df)
        })

    except Exception as e:
        logger.error(f"Upload error: {e}")
        return jsonify({"error": str(e)}), 500

@app.route('/api/process', methods=['POST'])
def process_data():
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
            
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
            
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        
        # Apply Operations Pipeline
        for op in operations:
            op_type = op.get('type')
            params = op.get('params', {})
            
            if op_type == 'drop_duplicates':
                subset = params.get('subset')
                if subset:
                    df = df.drop_duplicates(subset=subset)
                else:
                    df = df.drop_duplicates()
                    
            elif op_type == 'dropna':
                how = params.get('how', 'any')
                subset = params.get('subset')
                if subset:
                    df = df.dropna(how=how, subset=subset)
                else:
                    df = df.dropna(how=how)
                    
            elif op_type == 'fillna':
                value = params.get('value')
                method = params.get('method') # ffill, bfill
                subset = params.get('subset') # columns to apply
                
                if subset:
                    if method:
                        df[subset] = df[subset].fillna(method=method)
                    else:
                        df[subset] = df[subset].fillna(value)
                else:
                    if method:
                        df = df.fillna(method=method)
                    else:
                        df = df.fillna(value)
                        
            elif op_type == 'filter':
                # Simple filtering: col operator value
                col = params.get('column')
                operator = params.get('operator') # ==, !=, >, <, contains
                value = params.get('value')
                
                if col in df.columns:
                    if operator == '==':
                        df = df[df[col] == value]
                    elif operator == '!=':
                        df = df[df[col] != value]
                    elif operator == '>':
                        df = df[pd.to_numeric(df[col], errors='coerce') > float(value)]
                    elif operator == '<':
                        df = df[pd.to_numeric(df[col], errors='coerce') < float(value)]
                    elif operator == 'contains':
                        df = df[df[col].astype(str).str.contains(value, na=False)]
                        
            elif op_type == 'sort':
                col = params.get('column')
                ascending = params.get('ascending', True)
                if col in df.columns:
                    df = df.sort_values(by=col, ascending=ascending)
            
            elif op_type == 'rename':
                mapping = params.get('mapping') # {old: new}
                if mapping:
                    df = df.rename(columns=mapping)
                    
            elif op_type == 'select_columns':
                cols = params.get('columns')
                if cols:
                    valid_cols = [c for c in cols if c in df.columns]
                    df = df[valid_cols]

        return jsonify({
            "message": "Processed successfully",
            "preview": df_to_json_preview(df)
        })

    except Exception as e:
        logger.error(f"Processing error: {e}")
        return jsonify({"error": str(e)}), 500

@app.route('/api/export', methods=['POST'])
def export_data():
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        format_type = data.get('format', 'csv')
        
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        
        # Re-apply operations (stateless)
        for op in operations:
            # ... (Duplicate logic, ideally refactor to function)
            # For simplicity, assuming same logic.
            # Let's refactor 'apply_operations'
            pass 
        
        # Actually, let's just copy-paste the logic for now to ensure it works, 
        # or better: refactor.
        df = apply_operations(df, operations)

        output = io.BytesIO()
        if format_type == 'csv':
            df.to_csv(output, index=False)
            mimetype = 'text/csv'
            download_name = 'processed_data.csv'
        elif format_type == 'json':
            df.to_json(output, orient='records')
            mimetype = 'application/json'
            download_name = 'processed_data.json'
        elif format_type == 'xlsx':
            df.to_excel(output, index=False)
            mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            download_name = 'processed_data.xlsx'
        else:
            return jsonify({"error": "Invalid format"}), 400
            
        output.seek(0)
        return send_file(
            output,
            mimetype=mimetype,
            as_attachment=True,
            download_name=download_name
        )

    except Exception as e:
        logger.error(f"Export error: {e}")
        return jsonify({"error": str(e)}), 500

def apply_operations(df, operations):
    """Helper to apply operations to DF."""
    for op in operations:
        op_type = op.get('type')
        params = op.get('params', {})
        
        if op_type == 'drop_duplicates':
            subset = params.get('subset')
            if subset:
                df = df.drop_duplicates(subset=subset)
            else:
                df = df.drop_duplicates()
                
        elif op_type == 'dropna':
            how = params.get('how', 'any')
            subset = params.get('subset')
            if subset:
                df = df.dropna(how=how, subset=subset)
            else:
                df = df.dropna(how=how)
                
        elif op_type == 'fillna':
            value = params.get('value')
            method = params.get('method')
            subset = params.get('subset')
            
            if subset:
                # Handle list of columns
                if isinstance(subset, str):
                    subset = [subset]
                
                # Check if columns exist
                valid_subset = [c for c in subset if c in df.columns]
                
                if method:
                    df[valid_subset] = df[valid_subset].fillna(method=method)
                else:
                    df[valid_subset] = df[valid_subset].fillna(value)
            else:
                if method:
                    df = df.fillna(method=method)
                else:
                    df = df.fillna(value)
                    
        elif op_type == 'filter':
            col = params.get('column')
            operator = params.get('operator')
            value = params.get('value')
            
            if col in df.columns:
                if operator == '==':
                    df = df[df[col].astype(str) == str(value)]
                elif operator == '!=':
                    df = df[df[col].astype(str) != str(value)]
                elif operator == '>':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') > float(value)]
                    except: pass
                elif operator == '<':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') < float(value)]
                    except: pass
                elif operator == 'contains':
                    df = df[df[col].astype(str).str.contains(str(value), na=False)]
                    
        elif op_type == 'sort':
            col = params.get('column')
            ascending = params.get('ascending', True)
            if col in df.columns:
                df = df.sort_values(by=col, ascending=ascending)
        
        elif op_type == 'rename':
            mapping = params.get('mapping')
            if mapping:
                df = df.rename(columns=mapping)
                
        elif op_type == 'select_columns':
            cols = params.get('columns')
            if cols:
                valid_cols = [c for c in cols if c in df.columns]
                df = df[valid_cols]
                
    return df

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)