# 3v324v23's picture
# Initial commit with robust upload and demo data
# e15a3ce
import os
import io
import json
import logging
import pandas as pd
from flask import Flask, render_template, request, jsonify, send_file, session
from werkzeug.utils import secure_filename
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
app = Flask(__name__)
# Random per-process key: any signed session cookies are invalidated on restart.
app.secret_key = os.urandom(24)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 # 50MB limit
# Uploads land in /tmp, so they do not survive a container restart by design.
app.config['UPLOAD_FOLDER'] = '/tmp/uploads'
# Ensure upload directory exists
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
# Extensions accepted by allowed_file(); keep in sync with load_df()'s readers.
ALLOWED_EXTENSIONS = {'csv', 'json', 'xlsx'}
def allowed_file(filename):
    """Return True iff *filename* has an extension listed in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    ext = filename.rsplit('.', 1)[1].lower()
    return ext in ALLOWED_EXTENSIONS
def check_robustness(file_stream, ext=None):
    """Sniff the first 4 KiB of *file_stream* for signs of a binary payload.

    Args:
        file_stream: a seekable binary stream; it is rewound to offset 0
            before returning so the caller can still read it from the start.
        ext: optional lowercase file extension. 'xlsx' files are zip
            archives (legitimately binary), so the null-byte check is
            skipped for them.

    Returns:
        (True, "") when the content looks safe, otherwise
        (False, <reason>). Previously this function returned True even
        when null bytes were found, which made the check useless.
    """
    try:
        chunk = file_stream.read(4096)
        file_stream.seek(0)
        # Null bytes in a supposedly-text file (CSV/JSON) almost always
        # mean a mislabelled binary upload.
        if ext != 'xlsx' and b'\0' in chunk:
            return False, "Binary content detected"
        return True, ""
    except Exception as e:
        return False, f"Error checking file robustness: {str(e)}"
def load_df(filepath, ext):
    """Load *filepath* into a DataFrame using the reader matching *ext*.

    Returns None when *ext* is not one of csv/json/xlsx.
    """
    readers = {
        'csv': pd.read_csv,
        'json': pd.read_json,
        'xlsx': pd.read_excel,
    }
    reader = readers.get(ext)
    if reader is None:
        return None
    return reader(filepath)
def df_to_json_preview(df, rows=50):
    """Build a JSON-serializable preview: first *rows* records plus summary stats.

    NaNs in the previewed rows are replaced with "" so the payload is
    valid JSON; stats are computed over the FULL frame, not just the head.
    """
    head = df.head(rows).fillna("")
    column_names = list(df.columns)
    summary = {
        "rows": len(df),
        "columns": len(column_names),
        "missing_values": int(df.isnull().sum().sum()),
        "duplicates": int(df.duplicated().sum()),
    }
    return {
        "data": head.to_dict(orient='records'),
        "columns": column_names,
        "stats": summary,
    }
@app.route('/')
def index():
    """Serve the single-page front-end shell."""
    template_name = 'index.html'
    return render_template(template_name)
@app.route('/health')
def health():
    """Liveness probe: always reports healthy with HTTP 200."""
    payload = {"status": "healthy"}
    return jsonify(payload), 200
@app.route('/api/load_demo', methods=['POST'])
def load_demo():
    """Generate a 100-row demo dataset, persist it as CSV, and return a preview."""
    try:
        import numpy as np
        demo = pd.DataFrame({
            "Date": pd.date_range(start='2024-01-01', periods=100),
            "Category": ['A', 'B', 'C', 'A', 'B'] * 20,
            "Value": pd.Series(range(100)) + pd.Series([1, 2, 5] * 33 + [1]),
            "Status": ['Active', 'Inactive', 'Pending', 'Active'] * 25,
        })
        # Punch a few holes so the cleaning operations have something to do.
        demo.loc[5:10, 'Value'] = np.nan
        demo.loc[15:20, 'Status'] = np.nan
        filename = "demo_data.csv"
        destination = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        demo.to_csv(destination, index=False)
        return jsonify({
            "message": "Demo data loaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(demo)
        })
    except Exception as e:
        logger.error(f"Demo load error: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept a CSV/JSON/XLSX upload, validate it, save it, and return a preview.

    Responses: 400 for missing/disallowed/unparseable files, 500 for
    unexpected errors. The returned "filename" is the token the client
    must send back to /api/process and /api/export.
    """
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not allowed_file(file.filename):
            return jsonify({"error": "File type not allowed. Use CSV, JSON, or XLSX."}), 400
        filename = secure_filename(file.filename)
        ext = filename.rsplit('.', 1)[1].lower()
        # CSV/JSON are text formats, so embedded null bytes almost certainly
        # mean a mislabelled binary file. XLSX is a zip archive and is
        # legitimately binary, so it is exempt from this check.
        if ext in ('csv', 'json'):
            chunk = file.stream.read(4096)
            file.stream.seek(0)  # rewind so file.save() writes from the start
            if b'\0' in chunk:
                return jsonify({"error": "File contains null bytes (binary suspected). Please upload a valid text file for CSV/JSON."}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(filepath)
        # Parse eagerly so the client gets an immediate, specific error for
        # malformed content rather than a failure on the first /api/process.
        try:
            df = load_df(filepath, ext)
        except Exception as e:
            return jsonify({"error": f"Failed to parse file: {str(e)}"}), 400
        # Stateless API: no session state; the filename acts as the token.
        return jsonify({
            "message": "File uploaded successfully",
            "filename": filename,
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Upload error: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/process', methods=['POST'])
def process_data():
    """Re-load the uploaded file, apply the operation pipeline, return a preview.

    Expects JSON: {"filename": str, "operations": [{"type": ..., "params": {...}}, ...]}.
    Delegates to apply_operations() so /api/process and /api/export always
    transform the data identically (previously this route carried its own
    diverging copy of the pipeline logic).
    """
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        if df is None:
            # load_df returns None for extensions it has no reader for.
            return jsonify({"error": "Unsupported file type."}), 400
        df = apply_operations(df, operations)
        return jsonify({
            "message": "Processed successfully",
            "preview": df_to_json_preview(df)
        })
    except Exception as e:
        logger.error(f"Processing error: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/api/export', methods=['POST'])
def export_data():
    """Re-load the uploaded file, re-apply the pipeline, and stream the result.

    Stateless by design: the client re-sends the full operation list and the
    server rebuilds the processed frame from the stored upload each time.
    Supported output formats: csv, json, xlsx (default csv).
    """
    try:
        data = request.json
        filename = data.get('filename')
        operations = data.get('operations', [])
        format_type = data.get('format', 'csv')
        # Validate up front: previously a missing filename or deleted upload
        # surfaced as an unhelpful 500 from secure_filename/load_df.
        if not filename:
            return jsonify({"error": "Filename missing"}), 400
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(filename))
        if not os.path.exists(filepath):
            return jsonify({"error": "File not found. Please upload again."}), 404
        ext = filename.rsplit('.', 1)[1].lower()
        df = load_df(filepath, ext)
        if df is None:
            return jsonify({"error": "Unsupported file type."}), 400
        df = apply_operations(df, operations)
        output = io.BytesIO()
        if format_type == 'csv':
            df.to_csv(output, index=False)
            mimetype = 'text/csv'
            download_name = 'processed_data.csv'
        elif format_type == 'json':
            df.to_json(output, orient='records')
            mimetype = 'application/json'
            download_name = 'processed_data.json'
        elif format_type == 'xlsx':
            df.to_excel(output, index=False)
            mimetype = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            download_name = 'processed_data.xlsx'
        else:
            return jsonify({"error": "Invalid format"}), 400
        output.seek(0)
        return send_file(
            output,
            mimetype=mimetype,
            as_attachment=True,
            download_name=download_name
        )
    except Exception as e:
        logger.error(f"Export error: {e}")
        return jsonify({"error": str(e)}), 500
def apply_operations(df, operations):
    """Apply an ordered pipeline of cleaning/transform operations to *df*.

    Each operation is a dict: {"type": <op>, "params": {...}}. Unknown
    operation types and references to missing columns are silently skipped
    so a stale client-side pipeline never crashes the request.

    Supported ops: drop_duplicates, dropna, fillna, filter, sort, rename,
    select_columns. Note: fillna with a "subset" assigns columns on the
    current frame rather than returning a fresh copy.

    Returns the transformed DataFrame.
    """
    for op in operations:
        op_type = op.get('type')
        params = op.get('params', {})
        if op_type == 'drop_duplicates':
            subset = params.get('subset')
            if subset:
                df = df.drop_duplicates(subset=subset)
            else:
                df = df.drop_duplicates()
        elif op_type == 'dropna':
            how = params.get('how', 'any')
            subset = params.get('subset')
            if subset:
                df = df.dropna(how=how, subset=subset)
            else:
                df = df.dropna(how=how)
        elif op_type == 'fillna':
            value = params.get('value')
            method = params.get('method')  # 'ffill'/'pad' or 'bfill'/'backfill'
            subset = params.get('subset')
            if subset:
                if isinstance(subset, str):
                    subset = [subset]
                # Ignore columns that no longer exist (e.g. after select_columns).
                valid_subset = [c for c in subset if c in df.columns]
                if method in ('ffill', 'pad'):
                    # fillna(method=...) is deprecated (removed in pandas 3.0);
                    # use the dedicated ffill/bfill methods instead.
                    df[valid_subset] = df[valid_subset].ffill()
                elif method in ('bfill', 'backfill'):
                    df[valid_subset] = df[valid_subset].bfill()
                elif not method:
                    df[valid_subset] = df[valid_subset].fillna(value)
            else:
                if method in ('ffill', 'pad'):
                    df = df.ffill()
                elif method in ('bfill', 'backfill'):
                    df = df.bfill()
                elif not method:
                    df = df.fillna(value)
        elif op_type == 'filter':
            col = params.get('column')
            operator = params.get('operator')
            value = params.get('value')
            if col in df.columns:
                # Equality/contains compare as strings because values arrive
                # as JSON strings regardless of the column dtype.
                if operator == '==':
                    df = df[df[col].astype(str) == str(value)]
                elif operator == '!=':
                    df = df[df[col].astype(str) != str(value)]
                elif operator == '>':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') > float(value)]
                    except (TypeError, ValueError):
                        pass  # non-numeric threshold: skip the filter
                elif operator == '<':
                    try:
                        df = df[pd.to_numeric(df[col], errors='coerce') < float(value)]
                    except (TypeError, ValueError):
                        pass  # non-numeric threshold: skip the filter
                elif operator == 'contains':
                    df = df[df[col].astype(str).str.contains(str(value), na=False)]
        elif op_type == 'sort':
            col = params.get('column')
            ascending = params.get('ascending', True)
            if col in df.columns:
                df = df.sort_values(by=col, ascending=ascending)
        elif op_type == 'rename':
            mapping = params.get('mapping')  # {old_name: new_name}
            if mapping:
                df = df.rename(columns=mapping)
        elif op_type == 'select_columns':
            cols = params.get('columns')
            if cols:
                valid_cols = [c for c in cols if c in df.columns]
                df = df[valid_cols]
    return df
# Bind to all interfaces so the app is reachable from inside a container;
# port 7860 is presumably the Hugging Face Spaces convention — confirm
# before deploying elsewhere. Debug stays off for production safety.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)