# Spaces: duqing026 / process-xray-pro | Sleeping | File size: 10,356 Bytes | 3e57f30

import os
import json
import random
import csv
import io
from datetime import datetime, timedelta
from flask import Flask, request, jsonify, render_template

# pandas is optional: when it cannot be imported (e.g. Python 3.14 env) the
# request handlers fall back to a standard-library csv implementation, and
# HAS_PANDAS records which path is active.
try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
    print("Warning: Pandas not found. Running in fallback mode.")

# The Flask application object that the @app.route handlers register on.
app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024  # 50MB max upload size

# Configuration
# NOTE(review): UPLOAD_FOLDER is not referenced anywhere else in this file;
# confirm whether it is still needed.
UPLOAD_FOLDER = '/tmp'
# File extensions accepted by allowed_file(); compared against the lowercased extension.
ALLOWED_EXTENSIONS = {'csv'}

def allowed_file(filename):
    """Tell whether *filename* carries an extension listed in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared in lowercase.
    A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS

def generate_demo_data_raw(num_cases=50, seed=None):
    """Generate a synthetic purchase-to-pay event log as a list of dicts.

    Each case walks the standard activity path. With probability 0.2 the
    "Approve Requisition" step is skipped, and with probability 0.1 an
    invoice rejection loop ("Reject Invoice" -> "Receive Invoice") is added
    right after "Receive Invoice". Consecutive events of a case are spaced
    60 to 2880 minutes apart, starting from a point 1 to 30 days before now.

    Args:
        num_cases: Number of cases to generate; case ids run from
            ``CASE-001`` upward. Defaults to 50 (the original fixed size).
        seed: Optional seed for a private random generator, making the
            generated paths and gaps reproducible. When ``None`` the shared
            ``random`` module state is used, as before.

    Returns:
        A list of ``{"case_id", "activity", "timestamp"}`` dicts, with the
        timestamp formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    # A private generator keeps seeded runs from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random

    activities = (
        "Create Purchase Requisition",
        "Approve Requisition",
        "Create Purchase Order",
        "Receive Goods",
        "Receive Invoice",
        "Match Invoice",
        "Pay Invoice",
        "Close Case",
    )

    data = []
    for i in range(1, num_cases + 1):
        case_id = f"CASE-{i:03d}"
        current_time = datetime.now() - timedelta(days=rng.randint(1, 30))

        path = list(activities)
        # Variant 1: requisition approval skipped.
        if rng.random() < 0.2:
            path.remove("Approve Requisition")

        # Variant 2: invoice rejected once, then received again.
        if rng.random() < 0.1:
            idx = path.index("Receive Invoice")
            path[idx + 1:idx + 1] = ["Reject Invoice", "Receive Invoice"]

        for activity in path:
            current_time += timedelta(minutes=rng.randint(60, 2880))
            data.append({
                "case_id": case_id,
                "activity": activity,
                "timestamp": current_time.strftime("%Y-%m-%d %H:%M:%S"),
            })
    return data

@app.route('/')
def index():
    """Serve the root URL by rendering the ``index.html`` template."""
    return render_template('index.html')

@app.route('/api/demo', methods=['GET'])
def get_demo_data():
    """Return a freshly generated demo event log as CSV text.

    The rows come from ``generate_demo_data_raw()``. They are serialized with
    pandas when it is available and with the standard-library ``csv`` module
    otherwise.

    Returns:
        A ``(body, status, headers)`` tuple. The body is the CSV text and the
        response is explicitly labelled ``text/csv``; a bare string would be
        sent with Flask's default ``text/html`` content type.
    """
    rows = generate_demo_data_raw()

    if HAS_PANDAS:
        csv_text = pd.DataFrame(rows).to_csv(index=False)
    else:
        # Manual CSV generation
        buffer = io.StringIO()
        if rows:
            writer = csv.DictWriter(buffer, fieldnames=rows[0].keys())
            writer.writeheader()
            writer.writerows(rows)
        csv_text = buffer.getvalue()

    return csv_text, 200, {'Content-Type': 'text/csv; charset=utf-8'}

def _parse_event_timestamp(raw):
    """Parse one timestamp string on the standard-library (non-pandas) path.

    ISO 8601 is tried first, with a trailing ``Z`` rewritten to ``+00:00``,
    then the ``%Y-%m-%d %H:%M:%S`` format.

    Raises:
        ValueError: If *raw* is missing or matches neither format.
    """
    if raw is None:
        raise ValueError("missing timestamp")
    text = raw.strip()
    try:
        return datetime.fromisoformat(text.replace('Z', '+00:00'))
    except ValueError:
        return datetime.strptime(text, "%Y-%m-%d %H:%M:%S")


def _build_graph_payload(nodes, links, total_cases, total_events):
    """Turn the node/link tallies into the JSON-ready ECharts graph payload.

    Args:
        nodes: Mapping of activity name to ``{"count", "in_degree", "out_degree"}``.
        links: Mapping of ``(source, target)`` to ``{"count", "total_duration"}``,
            with durations in hours.
        total_cases: Number of distinct cases in the log.
        total_events: Number of event rows in the log.

    Returns:
        A dict with ``nodes``, ``links`` and ``stats`` keys.
    """
    max_count = max((info["count"] for info in nodes.values()), default=0)

    echarts_nodes = []
    for name, info in nodes.items():
        no_incoming = info["in_degree"] == 0
        no_outgoing = info["out_degree"] == 0

        # Blue: has both incoming and outgoing edges; green: no incoming; red: the rest.
        if info["in_degree"] > 0 and info["out_degree"] > 0:
            color = "#5470c6"
        elif no_incoming:
            color = "#91cc75"
        else:
            color = "#ee6666"

        if no_incoming:
            category = "Start"
        elif no_outgoing:
            category = "End"
        else:
            category = "Activity"

        # Scale the symbol between 20 and 60 by relative frequency.
        symbol_size = 20 + (info["count"] / max_count) * 40 if max_count > 0 else 30

        echarts_nodes.append({
            "name": name,
            "value": info["count"],
            "symbolSize": symbol_size,
            "itemStyle": {"color": color},
            "category": category
        })

    echarts_links = []
    for (source, target), info in links.items():
        avg_duration = info["total_duration"] / info["count"]
        echarts_links.append({
            "source": source,
            "target": target,
            "value": info["count"],
            "label": {
                "show": True,
                "formatter": f"{info['count']} ({avg_duration:.1f}h)"
            },
            "lineStyle": {
                # A link only exists if at least one case was processed,
                # so total_cases is non-zero here.
                "width": 1 + (info["count"] / total_cases) * 5,
                "curveness": 0.2
            }
        })

    return {
        "nodes": echarts_nodes,
        "links": echarts_links,
        "stats": {
            "total_cases": total_cases,
            "total_events": total_events,
            "avg_events_per_case": round(total_events / total_cases, 1) if total_cases else 0
        }
    }


@app.route('/api/analyze', methods=['POST'])
def analyze():
    """Build a directly-follows graph from an uploaded CSV event log.

    Expects a multipart upload with a ``file`` field holding a CSV. The case,
    activity and timestamp columns are detected by substring match on the
    header names ('case'/'id', 'activity'/'event'/'name', 'time'/'date').
    Events are grouped per case, ordered by timestamp and folded into node
    and link tallies by ``process_case_events``.

    Returns:
        On success, a JSON object with ``nodes``, ``links`` and ``stats``.
        On invalid input (no file, unsupported extension, unreadable CSV,
        missing columns, invalid timestamps), a JSON ``{"error": ...}`` with
        status 400. Any unexpected exception is logged and reported as 500.
    """
    try:
        if 'file' not in request.files:
            return jsonify({"error": "没有上传文件"}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "未选择文件"}), 400

        # Reject unsupported uploads explicitly; falling through would make
        # the view return None, which Flask reports as a server error.
        if not (file and allowed_file(file.filename)):
            return jsonify({"error": "不支持的文件类型，仅支持 CSV 文件"}), 400

        nodes = {}
        links = {}

        # Use Pandas if available, otherwise fallback
        if HAS_PANDAS:
            try:
                df = pd.read_csv(file)
            except Exception as e:
                return jsonify({"error": f"CSV读取失败: {str(e)}"}), 400

            cols = [c.lower() for c in df.columns]
            df.columns = cols

            case_col = next((c for c in cols if 'case' in c or 'id' in c), None)
            act_col = next((c for c in cols if 'activity' in c or 'event' in c or 'name' in c), None)
            time_col = next((c for c in cols if 'time' in c or 'date' in c), None)

            if not (case_col and act_col and time_col):
                return jsonify({"error": "缺少必要列: 需包含 CaseID, Activity, Timestamp"}), 400

            # NOTE(review): blank timestamp cells presumably become NaT here
            # rather than raising; confirm how such rows should be handled.
            try:
                df[time_col] = pd.to_datetime(df[time_col])
            except Exception:
                return jsonify({"error": "时间戳格式无效"}), 400

            df = df.sort_values(by=[case_col, time_col])

            cases = df.groupby(case_col)
            total_cases = len(cases)
            total_events = len(df)

            for _, group in cases:
                process_case_events(group.to_dict('records'), nodes, links, act_col, time_col)

        else:
            # --- FALLBACK IMPLEMENTATION (Standard Lib) ---
            try:
                # utf-8-sig also strips a leading BOM (common in spreadsheet exports).
                text = file.stream.read().decode("utf-8-sig")
                reader = csv.DictReader(io.StringIO(text, newline=None))
                rows = list(reader)
            except (UnicodeDecodeError, csv.Error) as e:
                return jsonify({"error": f"CSV读取失败: {str(e)}"}), 400

            if not rows:
                return jsonify({"error": "空文件"}), 400

            # Detect columns
            fieldnames = reader.fieldnames
            case_key = next((h for h in fieldnames if 'case' in h.lower() or 'id' in h.lower()), None)
            act_key = next((h for h in fieldnames if 'activity' in h.lower() or 'event' in h.lower() or 'name' in h.lower()), None)
            time_key = next((h for h in fieldnames if 'time' in h.lower() or 'date' in h.lower()), None)

            if not (case_key and act_key and time_key):
                return jsonify({"error": "缺少必要列: 需包含 CaseID, Activity, Timestamp"}), 400

            # Parse timestamps up front. An unparseable value is reported to
            # the client (as on the pandas path) instead of being replaced by
            # the current time, which would produce meaningless durations.
            try:
                for row in rows:
                    row['_dt'] = _parse_event_timestamp(row[time_key])
            except (ValueError, TypeError, AttributeError):
                return jsonify({"error": "时间戳格式无效"}), 400

            # Group by Case
            case_map = {}
            for row in rows:
                case_map.setdefault(row[case_key], []).append(row)

            total_cases = len(case_map)
            total_events = len(rows)

            # Sort and Process
            for events in case_map.values():
                events.sort(key=lambda x: x['_dt'])
                process_case_events(events, nodes, links, act_key, '_dt')

        # --- COMMON FORMATTING ---
        return jsonify(_build_graph_payload(nodes, links, total_cases, total_events))

    except Exception as e:
        # Last-resort boundary: log the traceback and report a 500.
        import traceback
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500

def process_case_events(events, nodes, links, act_key, time_key):
    """Fold one case's time-ordered events into the shared graph tallies.

    Args:
        events: Sequence of event mappings for a single case, already sorted
            by time.
        nodes: Dict updated in place; maps an activity name to its
            ``count``, ``in_degree`` and ``out_degree``.
        links: Dict updated in place; maps ``(activity, next_activity)`` to
            its ``count`` and ``total_duration`` in hours.
        act_key: Key of the activity name within each event.
        time_key: Key of the timestamp within each event.
    """
    def _node_entry(label):
        # Create the tally record the first time an activity is seen.
        if label not in nodes:
            nodes[label] = {"count": 0, "in_degree": 0, "out_degree": 0}
        return nodes[label]

    def _plain_datetime(stamp):
        # pandas timestamps expose to_pydatetime(); python datetimes pass through.
        if hasattr(stamp, 'to_pydatetime'):
            return stamp.to_pydatetime()
        return stamp

    # Pass 1: every event counts as one occurrence of its activity.
    for event in events:
        _node_entry(event[act_key])["count"] += 1

    # Pass 2: every consecutive pair contributes one directly-follows link.
    for earlier, later in zip(events, events[1:]):
        source_name = earlier[act_key]
        target_name = later[act_key]

        started = _plain_datetime(earlier[time_key])
        finished = _plain_datetime(later[time_key])
        hours = (finished - started).total_seconds() / 3600.0

        pair = (source_name, target_name)
        if pair not in links:
            links[pair] = {"count": 0, "total_duration": 0.0}
        tally = links[pair]
        tally["count"] += 1
        tally["total_duration"] += hours

        _node_entry(source_name)["out_degree"] += 1
        _node_entry(target_name)["in_degree"] += 1

if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port run code.
    # Opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in {'1', 'true', 'yes', 'on'}
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)