# 3v324v23's picture
# Initial commit with robust upload and demo data
# e15a3ce
import os
import io
import json
import logging
import pandas as pd
from flask import Flask, render_template, request, jsonify, send_file, session
from werkzeug.utils import secure_filename
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
app = Flask(__name__)
# Random per-process key: any signed session cookies are invalidated on restart.
app.secret_key = os.urandom(24)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 # 50MB limit
# Uploads land in /tmp, so they do not survive a container restart by design.
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
# Extensions accepted by allowed_file(); keep in sync with load_df()'s readers.
ALLOWED_EXTENSIONS = {'csv', 'json', 'xlsx'}
def allowed_file(filename):
    """Return True iff *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    ext = filename.rsplit('.', 1)[1].lower()
    return ext in ALLOWED_EXTENSIONS
def check_robustness(file_stream, ext=None):
    """Sniff the first 4 KiB of *file_stream* for signs of a binary payload.

    Args:
        file_stream: a seekable binary stream; it is rewound to offset 0
            before returning so the caller can still read it from the start.
        ext: optional lowercase file extension. 'xlsx' files are zip
            archives (legitimately binary), so the null-byte check is
            skipped for them.

    Returns:
        (True, "") when the content looks safe, otherwise
        (False, <reason>). Previously this function returned True even
        when null bytes were found, which made the check useless.
    """
    try:
        chunk = file_stream.read(4096)
        file_stream.seek(0)
        # Null bytes in a supposedly-text file (CSV/JSON) almost always
        # mean a mislabelled binary upload.
        if ext != 'xlsx' and b'\0' in chunk:
            return False, "Binary content detected"
        return True, ""
    except Exception as e:
        return False, f"Error checking file robustness: {str(e)}"
def load_df(filepath, ext):
    """Load *filepath* into a DataFrame using the reader matching *ext*.

    Returns None when *ext* is not one of csv/json/xlsx.
    """
    readers = {
        'csv': pd.read_csv,
        'json': pd.read_json,
        'xlsx': pd.read_excel,
    }
    reader = readers.get(ext)
    if reader is None:
        return None
    return reader(filepath)
def df_to_json_preview(df, rows=50):
    """Build a JSON-serializable preview: first *rows* records plus summary stats.

    NaNs in the previewed rows are replaced with "" so the payload is
    valid JSON; stats are computed over the FULL frame, not just the head.
    """
    head = df.head(rows).fillna("")
    column_names = list(df.columns)
    summary = {
        "rows": len(df),
        "columns": len(column_names),
        "missing_values": int(df.isnull().sum().sum()),
        "duplicates": int(df.duplicated().sum()),
    }
    return {
        "data": head.to_dict(orient='records'),
        "columns": column_names,
        "stats": summary,
    }
@app.route('/')
def index():
    """Serve the single-page front-end shell."""
    template_name = 'index.html'
    return render_template(template_name)
@app.route('/health')
def health():
    """Liveness probe: always reports healthy with HTTP 200."""
    payload = {"status": "healthy"}
    return jsonify(payload), 200
@app.route('/api/load_demo', methods=['POST'])
def load_demo():
    """Generate a 100-row demo dataset, persist it as CSV, and return a preview."""
    try:
        import numpy as np
        demo = pd.DataFrame({
            "Date": pd.date_range(start='2024-01-01', periods=100),
            "Category": ['A', 'B', 'C', 'A', 'B'] * 20,
            "Value": pd.Series(range(100)) + pd.Series([1, 2, 5] * 33 + [1]),
            "Status": ['Active', 'Inactive', 'Pending', 'Active'] * 25,
        })
        # Punch a few holes so the cleaning operations have something to do.
        demo.loc[5:10, 'Value'] = np.nan
        demo.loc[15:20, 'Status'] = np.nan
        filename = "demo_data.csv"
        destination = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        demo.to_csv(destination, index=False)
        return jsonify({
            "message": "Demo data loaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(demo)
        })
    except Exception as e:
        logger.error(f"Demo load error: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept a CSV/JSON/XLSX upload, validate it, save it, and return a preview.

    Responses: 400 for missing/disallowed/unparseable files, 500 for
    unexpected errors. The returned "filename" is the token the client
    must send back to /api/process and /api/export.
    """
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not allowed_file(file.filename):
            return jsonify({"error": "File type not allowed. Use CSV, JSON, or XLSX."}), 400
        filename = secure_filename(file.filename)
        ext = filename.rsplit('.', 1)[1].lower()
        # CSV/JSON are text formats, so embedded null bytes almost certainly
        # mean a mislabelled binary file. XLSX is a zip archive and is
        # legitimately binary, so it is exempt from this check.
        if ext in ('csv', 'json'):
            chunk = file.stream.read(4096)
            file.stream.seek(0)  # rewind so file.save() writes from the start
            if b'\0' in chunk:
                return jsonify({"error": "File contains null bytes (binary suspected). Please upload a valid text file for CSV/JSON."}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)
        # Parse eagerly so the client gets an immediate, specific error for
        # malformed content rather than a failure on the first /api/process.
        try:
            df = load_df(filepath, ext)
        except Exception as e:
            return jsonify({"error": f"Failed to parse file: {str(e)}"}), 400
        # Stateless API: no session state; the filename acts as the token.
        return jsonify({
            "message": "File uploaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Upload error: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/process', methods=['POST'])
def process_data():
    """Re-load the uploaded file, apply the operation pipeline, return a preview.

    Expects JSON: {"filename": str, "operations": [{"type": ..., "params": {...}}, ...]}.
    Delegates to apply_operations() so /api/process and /api/export always
    transform the data identically (previously this route carried its own
    diverging copy of the pipeline logic).
    """
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        if df is None:
            # load_df returns None for extensions it has no reader for.
            return jsonify({"error": "Unsupported file type."}), 400
        df = apply_operations(df, operations)
        return jsonify({
            "message": "Processed successfully",
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Processing error: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/export', methods=['POST'])
def export_data():
    """Re-load the uploaded file, re-apply the pipeline, and stream the result.

    Stateless by design: the client re-sends the full operation list and the
    server rebuilds the processed frame from the stored upload each time.
    Supported output formats: csv, json, xlsx (default csv).
    """
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        format_type = data.get('format', 'csv')
        # Validate up front: previously a missing filename or deleted upload
        # surfaced as an unhelpful 500 from secure_filename/load_df.
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        if df is None:
            return jsonify({"error": "Unsupported file type."}), 400
        df = apply_operations(df, operations)
        output = io.BytesIO()
        if format_type == 'csv':
            df.to_csv(output, index=False)
            mimetype = 'text/csv'
            download_name = 'processed_data.csv'
        elif format_type == 'json':
            df.to_json(output, orient='records')
            mimetype = 'application/json'
            download_name = 'processed_data.json'
        elif format_type == 'xlsx':
            df.to_excel(output, index=False)
            mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            download_name = 'processed_data.xlsx'
        else:
            return jsonify({"error": "Invalid format"}), 400
        output.seek(0)
        return send_file(
            output,
            mimetype=mimetype,
            as_attachment=True,
            download_name=download_name
        )
    except Exception as e:
        logger.error(f"Export error: {e}")
        return jsonify({"error": str(e)}), 500
def apply_operations(df, operations):
    """Apply an ordered pipeline of cleaning/transform operations to *df*.

    Each operation is a dict: {"type": <op>, "params": {...}}. Unknown
    operation types and references to missing columns are silently skipped
    so a stale client-side pipeline never crashes the request.

    Supported ops: drop_duplicates, dropna, fillna, filter, sort, rename,
    select_columns. Note: fillna with a "subset" assigns columns on the
    current frame rather than returning a fresh copy.

    Returns the transformed DataFrame.
    """
    for op in operations:
        op_type = op.get('type')
        params = op.get('params', {})
        if op_type == 'drop_duplicates':
            subset = params.get('subset')
            if subset:
                df = df.drop_duplicates(subset=subset)
            else:
                df = df.drop_duplicates()
        elif op_type == 'dropna':
            how = params.get('how', 'any')
            subset = params.get('subset')
            if subset:
                df = df.dropna(how=how, subset=subset)
            else:
                df = df.dropna(how=how)
        elif op_type == 'fillna':
            value = params.get('value')
            method = params.get('method')  # 'ffill'/'pad' or 'bfill'/'backfill'
            subset = params.get('subset')
            if subset:
                if isinstance(subset, str):
                    subset = [subset]
                # Ignore columns that no longer exist (e.g. after select_columns).
                valid_subset = [c for c in subset if c in df.columns]
                if method in ('ffill', 'pad'):
                    # fillna(method=...) is deprecated (removed in pandas 3.0);
                    # use the dedicated ffill/bfill methods instead.
                    df[valid_subset] = df[valid_subset].ffill()
                elif method in ('bfill', 'backfill'):
                    df[valid_subset] = df[valid_subset].bfill()
                elif not method:
                    df[valid_subset] = df[valid_subset].fillna(value)
            else:
                if method in ('ffill', 'pad'):
                    df = df.ffill()
                elif method in ('bfill', 'backfill'):
                    df = df.bfill()
                elif not method:
                    df = df.fillna(value)
        elif op_type == 'filter':
            col = params.get('column')
            operator = params.get('operator')
            value = params.get('value')
            if col in df.columns:
                # Equality/contains compare as strings because values arrive
                # as JSON strings regardless of the column dtype.
                if operator == '==':
                    df = df[df[col].astype(str) == str(value)]
                elif operator == '!=':
                    df = df[df[col].astype(str) != str(value)]
                elif operator == '>':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') > float(value)]
                    except (TypeError, ValueError):
                        pass  # non-numeric threshold: skip the filter
                elif operator == '<':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') < float(value)]
                    except (TypeError, ValueError):
                        pass  # non-numeric threshold: skip the filter
                elif operator == 'contains':
                    df = df[df[col].astype(str).str.contains(str(value), na=False)]
        elif op_type == 'sort':
            col = params.get('column')
            ascending = params.get('ascending', True)
            if col in df.columns:
                df = df.sort_values(by=col, ascending=ascending)
        elif op_type == 'rename':
            mapping = params.get('mapping')  # {old_name: new_name}
            if mapping:
                df = df.rename(columns=mapping)
        elif op_type == 'select_columns':
            cols = params.get('columns')
            if cols:
                valid_cols = [c for c in cols if c in df.columns]
                df = df[valid_cols]
    return df
# Bind to all interfaces so the app is reachable from inside a container;
# port 7860 is presumably the Hugging Face Spaces convention — confirm
# before deploying elsewhere. Debug stays off for production safety.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)