# Trae Assistant
# Enhance: Add file upload, localization and fixes
# 476df4f
import os
import random
import json
import pandas as pd
import numpy as np
import networkx as nx
from flask import Flask, render_template, jsonify, request
app = Flask(__name__)
# Random secret per process: sessions/flash won't survive a restart — acceptable for a demo app.
app.secret_key = os.urandom(24)
# Cap request bodies (file uploads) at 16 MB; Flask rejects larger requests with HTTP 413.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
# --- Logic Core ---
def generate_mock_data(n_paths=500, channels=None):
    """Generate synthetic multi-touch conversion path data.

    Args:
        n_paths: Number of individual user journeys to simulate.
        channels: Optional list of channel names to draw touches from.
            Defaults to the built-in set of (Chinese-labelled) channels.

    Returns:
        DataFrame aggregated by 'path' with columns 'total_conversions',
        'total_conversion_value' and 'total_null' (the "Path Data" format
        expected by the model functions and the CSV upload endpoint).
    """
    if channels is None:
        # Default labels: Paid Search, Social, Email, Display, Direct, Referral.
        channels = ['付费搜索', '社交媒体', '邮件营销', '展示广告', '直接访问', '推介引流']
    data = []
    for _ in range(n_paths):
        # Path length of 1-5 touches, biased towards short journeys.
        length = random.choices([1, 2, 3, 4, 5], weights=[0.3, 0.3, 0.2, 0.1, 0.1])[0]
        path = random.choices(channels, k=length)
        # Simulate differing channel efficiencies via additive probability boosts.
        conversion_prob = 0.05
        if '邮件营销' in path:
            conversion_prob += 0.1
        if '付费搜索' in path:
            conversion_prob += 0.05
        # Last-touch Direct gets a bonus (membership test was redundant:
        # path[-1] == X already implies X in path).
        if path[-1] == '直接访问':
            conversion_prob += 0.15
        converted = 1 if random.random() < min(conversion_prob, 0.8) else 0
        conversion_value = random.randint(50, 200) if converted else 0
        data.append({
            'path': ' > '.join(path),
            'total_conversions': converted,
            'total_conversion_value': conversion_value,
            'total_null': 1 - converted,
        })
    # Aggregate duplicate paths to reduce rows and match the upload format.
    df = pd.DataFrame(data)
    return df.groupby('path', as_index=False).agg({
        'total_conversions': 'sum',
        'total_conversion_value': 'sum',
        'total_null': 'sum',
    })
def run_markov_model(df):
    """Markov-chain attribution via removal effects.

    Args:
        df: DataFrame with columns 'path' (e.g. 'A > B > C'),
            'total_conversions' and 'total_conversion_value'.

    Returns:
        Tuple (attribution, trans_matrix) where attribution maps each
        channel to {'conversions': ..., 'value': ...} and trans_matrix is
        {source: {target: probability}}.
    """
    # 1. Count observed transitions, bracketing each path with virtual
    #    (start) and (conversion)/(null) terminal states. Each row counts
    #    once per transition (not weighted by conversion volume) — this
    #    mirrors the original aggregation-level behavior.
    transitions = {}
    for _, row in df.iterrows():
        steps = ['(start)'] + row['path'].split(' > ')
        steps.append('(conversion)' if row['total_conversions'] > 0 else '(null)')
        for source, target in zip(steps, steps[1:]):
            bucket = transitions.setdefault(source, {})
            bucket[target] = bucket.get(target, 0) + 1

    # 2. Normalize counts into transition probabilities per source.
    trans_matrix = {}
    for source, targets in transitions.items():
        total_visits = sum(targets.values())
        trans_matrix[source] = {t: count / total_visits for t, count in targets.items()}

    # 3. Removal effect: drop each channel in turn and measure the relative
    #    loss in start->conversion probability.
    G = nx.DiGraph()
    for source, targets in trans_matrix.items():
        for target, prob in targets.items():
            G.add_edge(source, target, weight=prob)
    special = {'(start)', '(conversion)', '(null)'}
    channels = [n for n in G.nodes() if n not in special]

    try:
        base_prob = _calculate_conversion_prob(G)
    except Exception:  # narrowed from a bare except: don't mask SystemExit/KeyboardInterrupt
        base_prob = 0

    attribution = {}
    removal_effects = {}
    total_removal_effect = 0
    if base_prob > 0:
        for channel in channels:
            G_temp = G.copy()
            G_temp.remove_node(channel)
            removal_effect = 1 - (_calculate_conversion_prob(G_temp) / base_prob)
            removal_effects[channel] = removal_effect
            total_removal_effect += removal_effect
        # Normalize removal effects into shares of conversions/value.
        total_conversions = df['total_conversions'].sum()
        total_value = df['total_conversion_value'].sum()
        for channel, effect in removal_effects.items():
            share = effect / total_removal_effect if total_removal_effect > 0 else 0
            attribution[channel] = {
                'conversions': share * total_conversions,
                'value': share * total_value,
            }
    else:
        # Degenerate graph (no start->conversion route): attribute nothing.
        attribution = {channel: {'conversions': 0, 'value': 0} for channel in channels}
    return attribution, trans_matrix
def _calculate_conversion_prob(G):
"""Calculates probability of reaching (conversion) from (start)."""
if not G.has_node('(start)') or not G.has_node('(conversion)'):
return 0
# Simple path multiplication (can be slow for complex graphs, but okay for marketing paths)
# Better approach: Matrix multiplication or solving linear equations
# For simplicity and speed in this demo, we use a simplified approach or NetworkX algorithms
# Actually, for small graphs, we can use simple flow or just assume acyclic (which is not always true)
# Let's use a robust method: Matrix Power or Absorbing Markov Chains
nodes = list(G.nodes())
idx = {n: i for i, n in enumerate(nodes)}
n = len(nodes)
M = np.zeros((n, n))
for u, v, data in G.edges(data=True):
M[idx[u]][idx[v]] = data['weight']
# Start vector
start_vec = np.zeros(n)
start_vec[idx['(start)']] = 1
# Iterate (approximate infinite steps)
curr = start_vec
for _ in range(20): # Marketing paths are rarely > 20 deep
curr = curr @ M
return curr[idx['(conversion)']]
def run_heuristic_models(df):
    """First Touch, Last Touch and Linear attribution by conversion value.

    Args:
        df: DataFrame with 'path', 'total_conversions' and
            'total_conversion_value' columns.

    Returns:
        {'Last Touch': {channel: value}, 'First Touch': {...}, 'Linear': {...}}
    """
    models = {'Last Touch': {}, 'First Touch': {}, 'Linear': {}}
    # (removed an unused pre-computed total of conversions)
    for _, row in df.iterrows():
        if row['total_conversions'] == 0:
            continue  # non-converting paths carry no credit in these models
        path = row['path'].split(' > ')
        val = row['total_conversion_value']
        # Last Touch: full credit to the final channel before conversion.
        lt = path[-1]
        models['Last Touch'][lt] = models['Last Touch'].get(lt, 0) + val
        # First Touch: full credit to the channel that opened the journey.
        ft = path[0]
        models['First Touch'][ft] = models['First Touch'].get(ft, 0) + val
        # Linear: credit split equally across every touch point.
        share = val / len(path)
        for channel in path:
            models['Linear'][channel] = models['Linear'].get(channel, 0) + share
    return models
# --- Routes ---
@app.route('/')
def index():
    """Serve the dashboard page (templates/index.html)."""
    return render_template('index.html')
@app.route('/api/simulate', methods=['POST'])
def simulate():
    """Generate mock path data, run all attribution models, return chart data.

    Returns JSON with 'comparison' (per-model channel values for ECharts),
    'sankey' (step-indexed nodes/links) and 'raw_sample' (first 50 rows).
    """
    try:
        df = generate_mock_data(n_paths=1000)
        # First 50 aggregated rows for the raw-data preview table.
        raw_data = df.head(50).to_dict(orient='records')

        markov_res, trans_matrix = run_markov_model(df)
        heuristic_res = run_heuristic_models(df)

        # Union of channels seen by any model, sorted for stable chart order.
        channels = sorted(set(markov_res) | set(heuristic_res['Last Touch']))
        comparison_data = {
            'channels': channels,
            'Markov': [round(markov_res.get(c, {}).get('value', 0), 2) for c in channels],
            'Last Touch': [round(heuristic_res['Last Touch'].get(c, 0), 2) for c in channels],
            'First Touch': [round(heuristic_res['First Touch'].get(c, 0), 2) for c in channels],
            'Linear': [round(heuristic_res['Linear'].get(c, 0), 2) for c in channels],
        }

        # Sankey over the top 50 converting paths. Node names carry their
        # step index so a channel repeated within a path cannot create a
        # cycle (ECharts sankey requires a DAG).
        # BUGFIX: weight links by total_conversions. After aggregation each
        # path string is unique, so the old value_counts() made every link
        # weight 1 regardless of volume. Now mirrors /api/upload.
        converting = df[df['total_conversions'] > 0]
        top_paths = converting.sort_values('total_conversions', ascending=False).head(50)
        sankey_nodes = set()
        links_dict = {}
        for _, row in top_paths.iterrows():
            steps = row['path'].split(' > ')
            count = int(row['total_conversions'])  # native int: numpy ints aren't JSON-serializable
            for i in range(len(steps) - 1):
                source = f"{steps[i]} (Step {i+1})"
                target = f"{steps[i+1]} (Step {i+2})"
                sankey_nodes.add(source)
                sankey_nodes.add(target)
                key = f"{source}->{target}"
                if key not in links_dict:
                    links_dict[key] = {'source': source, 'target': target, 'value': 0}
                links_dict[key]['value'] += count

        return jsonify({
            'status': 'success',
            'comparison': comparison_data,
            'sankey': {
                'nodes': [{'name': n} for n in sankey_nodes],
                'links': list(links_dict.values()),
            },
            'raw_sample': raw_data,
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Run attribution models on an uploaded CSV of path data.

    Expects a multipart 'file' field. The CSV must either use the template
    columns (path, total_conversions, total_conversion_value, total_null)
    or at minimum provide 'path' and 'conversions' (optionally 'value' and
    'nulls'), which are adapted to the template schema.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Note: the old `if file:` guard was always true once the filename
    # check above passed, and the doubly-nested try/except was redundant.
    try:
        df = pd.read_csv(file)

        required_cols = ['path', 'total_conversions', 'total_conversion_value', 'total_null']
        if not all(col in df.columns for col in required_cols):
            # Adapt the simplified column names where possible.
            if 'path' in df.columns and 'conversions' in df.columns:
                df.rename(columns={'conversions': 'total_conversions',
                                   'value': 'total_conversion_value',
                                   'nulls': 'total_null'}, inplace=True)
                if 'total_conversion_value' not in df.columns:
                    # No monetary data supplied: assume a nominal 100 per conversion.
                    df['total_conversion_value'] = df['total_conversions'] * 100
                if 'total_null' not in df.columns:
                    df['total_null'] = 0
            else:
                return jsonify({'error': f'CSV format error. Required columns: {required_cols}'}), 400

        # BUGFIX: a blanket fillna(0) turned missing paths into the int 0,
        # which crashed the later str.split. Drop pathless rows instead and
        # zero-fill only the numeric columns.
        df = df.dropna(subset=['path'])
        df[required_cols[1:]] = df[required_cols[1:]].fillna(0)

        raw_data = df.head(50).to_dict(orient='records')
        markov_res, trans_matrix = run_markov_model(df)
        heuristic_res = run_heuristic_models(df)

        # Union of channels seen by any model, sorted for stable chart order.
        channels = sorted(set(markov_res) | set(heuristic_res['Last Touch']))
        comparison_data = {
            'channels': channels,
            'Markov': [round(markov_res.get(c, {}).get('value', 0), 2) for c in channels],
            'Last Touch': [round(heuristic_res['Last Touch'].get(c, 0), 2) for c in channels],
            'First Touch': [round(heuristic_res['First Touch'].get(c, 0), 2) for c in channels],
            'Linear': [round(heuristic_res['Linear'].get(c, 0), 2) for c in channels],
        }

        # Sankey over the top 50 converting paths; step-indexed node names
        # keep the graph acyclic for ECharts.
        converting = df[df['total_conversions'] > 0]
        top_paths = converting.sort_values('total_conversions', ascending=False).head(50)
        sankey_nodes = set()
        links_dict = {}
        for _, row in top_paths.iterrows():
            steps = row['path'].split(' > ')
            count = int(row['total_conversions'])  # native int: numpy ints aren't JSON-serializable
            for i in range(len(steps) - 1):
                source = f"{steps[i]} (Step {i+1})"
                target = f"{steps[i+1]} (Step {i+2})"
                sankey_nodes.add(source)
                sankey_nodes.add(target)
                key = f"{source}->{target}"
                if key not in links_dict:
                    links_dict[key] = {'source': source, 'target': target, 'value': 0}
                links_dict[key]['value'] += count

        return jsonify({
            'status': 'success',
            'comparison': comparison_data,
            'sankey': {
                'nodes': [{'name': n} for n in sankey_nodes],
                'links': list(links_dict.values()),
            },
            'raw_sample': raw_data,
        })
    except Exception as e:
        return jsonify({'error': f'Process Error: {str(e)}'}), 500
@app.route('/api/export_template')
def export_template():
    """Download a minimal CSV template showing the expected upload format."""
    from flask import Response
    header = "path,total_conversions,total_conversion_value,total_null"
    sample_rows = [
        "Paid Search > Email,1,100,0",
        "Direct > Direct,0,0,1",
    ]
    body = "\n".join([header] + sample_rows)
    return Response(
        body,
        mimetype="text/csv",
        headers={"Content-disposition": "attachment; filename=template.csv"},
    )
if __name__ == '__main__':
    # Dev entry point: listen on all interfaces, port 7860
    # (presumably chosen for a hosted demo environment — confirm).
    app.run(host='0.0.0.0', port=7860)