Spaces:
Sleeping
Sleeping
| import os | |
| import random | |
| import json | |
| import pandas as pd | |
| import numpy as np | |
| import networkx as nx | |
| from flask import Flask, render_template, jsonify, request | |
# Flask application setup.
app = Flask(__name__)
# Random secret key generated per process; sessions will not survive a
# restart — acceptable for a demo app.
app.secret_key = os.urandom(24)
# Cap request bodies at 16 MB to bound memory use when parsing uploaded CSVs.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
| # --- Logic Core --- | |
def generate_mock_data(n_paths=500):
    """Build a synthetic, path-aggregated conversion dataset.

    Returns a DataFrame with one row per unique path string and summed
    'total_conversions', 'total_conversion_value' and 'total_null' columns.
    """
    channels = ['付费搜索', '社交媒体', '邮件营销', '展示广告', '直接访问', '推介引流']
    rows = []
    for _ in range(n_paths):
        # Journey length 1..5, biased toward shorter paths.
        steps = random.choices([1, 2, 3, 4, 5], weights=[0.3, 0.3, 0.2, 0.1, 0.1])[0]
        journey = random.choices(channels, k=steps)

        # Base conversion chance, bumped when specific channels appear
        # (simulates differing channel efficiencies).
        prob = 0.05
        if '邮件营销' in journey:
            prob += 0.1
        if '付费搜索' in journey:
            prob += 0.05
        if '直接访问' in journey and journey[-1] == '直接访问':
            prob += 0.15

        hit = 1 if random.random() < min(prob, 0.8) else 0
        rows.append({
            'path': ' > '.join(journey),
            'total_conversions': hit,
            'total_conversion_value': random.randint(50, 200) if hit else 0,
            'total_null': 1 - hit,
        })

    # Collapse duplicate paths into aggregated totals ("Path Data" format).
    return (
        pd.DataFrame(rows)
        .groupby('path')
        .agg({
            'total_conversions': 'sum',
            'total_conversion_value': 'sum',
            'total_null': 'sum',
        })
        .reset_index()
    )
def run_markov_model(df):
    """Markov-chain (removal-effect) attribution.

    Parameters:
        df: DataFrame with columns 'path' (channel names joined by ' > '),
            'total_conversions', 'total_conversion_value', 'total_null'.

    Returns:
        (attribution, trans_matrix) where attribution maps channel ->
        {'conversions': float, 'value': float} and trans_matrix maps
        source state -> {target state: transition probability}.
    """
    # 1. Count observed state transitions, framing every path with a
    #    synthetic (start) state and a (conversion)/(null) terminal state.
    #    NOTE: a row with any conversions is treated as fully converting.
    transitions = {}
    for _, row in df.iterrows():
        path = row['path'].split(' > ')
        full_path = ['(start)'] + path
        full_path.append('(conversion)' if row['total_conversions'] > 0 else '(null)')
        for i in range(len(full_path) - 1):
            source = full_path[i]
            target = full_path[i + 1]
            bucket = transitions.setdefault(source, {})
            bucket[target] = bucket.get(target, 0) + 1

    # 2. Normalize counts into per-source transition probabilities.
    trans_matrix = {}
    for source, targets in transitions.items():
        total_visits = sum(targets.values())
        trans_matrix[source] = {t: count / total_visits for t, count in targets.items()}

    # 3. Removal effects: how much the start->conversion probability drops
    #    when each channel is deleted from the transition graph.
    G = nx.DiGraph()
    for source, targets in trans_matrix.items():
        for target, prob in targets.items():
            G.add_edge(source, target, weight=prob)
    channels = [n for n in G.nodes() if n not in ['(start)', '(conversion)', '(null)']]

    try:
        base_prob = _calculate_conversion_prob(G)
    except Exception:  # was a bare except; keep the fallback but don't trap SystemExit etc.
        base_prob = 0

    attribution = {}
    total_removal_effect = 0
    removal_effects = {}
    if base_prob > 0:
        for channel in channels:
            # Recompute conversion probability with this channel removed.
            G_temp = G.copy()
            G_temp.remove_node(channel)
            new_prob = _calculate_conversion_prob(G_temp)
            removal_effect = 1 - (new_prob / base_prob)
            removal_effects[channel] = removal_effect
            total_removal_effect += removal_effect
        # Normalize removal effects into shares of the real totals.
        total_conversions = df['total_conversions'].sum()
        total_value = df['total_conversion_value'].sum()
        for channel, effect in removal_effects.items():
            share = effect / total_removal_effect if total_removal_effect > 0 else 0
            attribution[channel] = {
                'conversions': share * total_conversions,
                'value': share * total_value,
            }
    else:
        # Degenerate graph (no start->conversion mass): attribute nothing.
        for channel in channels:
            attribution[channel] = {'conversions': 0, 'value': 0}
    return attribution, trans_matrix
| def _calculate_conversion_prob(G): | |
| """Calculates probability of reaching (conversion) from (start).""" | |
| if not G.has_node('(start)') or not G.has_node('(conversion)'): | |
| return 0 | |
| # Simple path multiplication (can be slow for complex graphs, but okay for marketing paths) | |
| # Better approach: Matrix multiplication or solving linear equations | |
| # For simplicity and speed in this demo, we use a simplified approach or NetworkX algorithms | |
| # Actually, for small graphs, we can use simple flow or just assume acyclic (which is not always true) | |
| # Let's use a robust method: Matrix Power or Absorbing Markov Chains | |
| nodes = list(G.nodes()) | |
| idx = {n: i for i, n in enumerate(nodes)} | |
| n = len(nodes) | |
| M = np.zeros((n, n)) | |
| for u, v, data in G.edges(data=True): | |
| M[idx[u]][idx[v]] = data['weight'] | |
| # Start vector | |
| start_vec = np.zeros(n) | |
| start_vec[idx['(start)']] = 1 | |
| # Iterate (approximate infinite steps) | |
| curr = start_vec | |
| for _ in range(20): # Marketing paths are rarely > 20 deep | |
| curr = curr @ M | |
| return curr[idx['(conversion)']] | |
def run_heuristic_models(df):
    """First Touch / Last Touch / Linear attribution of conversion value.

    Only rows with at least one conversion contribute.

    Returns:
        {'Last Touch': {channel: value}, 'First Touch': {...}, 'Linear': {...}}
    """
    models = {
        'Last Touch': {},
        'First Touch': {},
        'Linear': {},
    }
    # (removed unused total_conv aggregate from the original implementation)
    for _, row in df.iterrows():
        if row['total_conversions'] == 0:
            continue
        path = row['path'].split(' > ')
        val = row['total_conversion_value']
        # Last Touch: all value to the final channel.
        lt = path[-1]
        models['Last Touch'][lt] = models['Last Touch'].get(lt, 0) + val
        # First Touch: all value to the opening channel.
        ft = path[0]
        models['First Touch'][ft] = models['First Touch'].get(ft, 0) + val
        # Linear: value split evenly over every step (repeat occurrences
        # of a channel each receive a share).
        share = val / len(path)
        for channel in path:
            models['Linear'][channel] = models['Linear'].get(channel, 0) + share
    return models
| # --- Routes --- | |
def index():
    """Serve the single-page UI.

    NOTE(review): no @app.route decorator is visible here — it was
    presumably lost in extraction; confirm how this view is registered.
    """
    template_name = 'index.html'
    return render_template(template_name)
def simulate():
    """Generate mock path data, run every attribution model and return
    chart-ready JSON (comparison bars, Sankey diagram, raw sample).

    NOTE(review): no @app.route decorator is visible here — presumably lost
    in extraction; confirm the URL rule this handler is registered under.
    """
    try:
        df = generate_mock_data(n_paths=1000)
        # Sample of rows for the frontend table.
        raw_data = df.head(50).to_dict(orient='records')

        # Run models (transition matrix from the Markov model is unused here).
        markov_res, _ = run_markov_model(df)
        heuristic_res = run_heuristic_models(df)

        # Format results for ECharts.
        channels = sorted(set(markov_res.keys()) | set(heuristic_res['Last Touch'].keys()))
        comparison_data = {
            'channels': channels,
            'Markov': [round(markov_res.get(c, {}).get('value', 0), 2) for c in channels],
            'Last Touch': [round(heuristic_res['Last Touch'].get(c, 0), 2) for c in channels],
            'First Touch': [round(heuristic_res['First Touch'].get(c, 0), 2) for c in channels],
            'Linear': [round(heuristic_res['Linear'].get(c, 0), 2) for c in channels],
        }

        # Sankey: suffix each channel with its step index so repeated channels
        # become distinct nodes (keeps the diagram a DAG).
        # BUGFIX: df is aggregated to one row per unique path, so the previous
        # path.value_counts() always produced a weight of 1 per link. Weight
        # links by the actual conversion totals instead, matching upload_file.
        sankey_nodes = set()
        links_dict = {}
        top_paths = (df[df['total_conversions'] > 0]
                     .sort_values('total_conversions', ascending=False)
                     .head(50))  # top 50 paths to avoid clutter
        for _, row in top_paths.iterrows():
            steps = row['path'].split(' > ')
            count = int(row['total_conversions'])  # plain int for JSON serialization
            for i in range(len(steps) - 1):
                source = f"{steps[i]} (Step {i+1})"
                target = f"{steps[i+1]} (Step {i+2})"
                sankey_nodes.add(source)
                sankey_nodes.add(target)
                key = f"{source}->{target}"
                if key not in links_dict:
                    links_dict[key] = {'source': source, 'target': target, 'value': 0}
                links_dict[key]['value'] += count
        final_links = list(links_dict.values())
        final_nodes = [{'name': n} for n in sankey_nodes]

        return jsonify({
            'status': 'success',
            'comparison': comparison_data,
            'sankey': {'nodes': final_nodes, 'links': final_links},
            'raw_sample': raw_data,
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
def upload_file():
    """Handle a user-uploaded CSV of path data and run all attribution models.

    NOTE(review): no @app.route decorator is visible here — presumably lost
    in extraction; confirm how this handler is registered.
    """
    try:
        # Standard Flask multipart-upload checks.
        if 'file' not in request.files:
            return jsonify({'error': 'No file part'}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No selected file'}), 400
        if file:
            try:
                # Read CSV straight from the file stream.
                df = pd.read_csv(file)
                # Validation: canonical column set (matches export_template's header).
                required_cols = ['path', 'total_conversions', 'total_conversion_value', 'total_null']
                if not all(col in df.columns for col in required_cols):
                    # Try to adapt if columns use shorter / legacy names.
                    if 'path' in df.columns and 'conversions' in df.columns:
                        df.rename(columns={'conversions': 'total_conversions', 'value': 'total_conversion_value', 'nulls': 'total_null'}, inplace=True)
                        # Synthesize missing columns so downstream models can run.
                        if 'total_conversion_value' not in df.columns: df['total_conversion_value'] = df['total_conversions'] * 100  # Dummy value
                        if 'total_null' not in df.columns: df['total_null'] = 0
                    else:
                        return jsonify({'error': f'CSV format error. Required columns: {required_cols}'}), 400
                # Fill NaNs so numeric comparisons and sums are safe.
                df.fillna(0, inplace=True)
                # Process: sample for display, then run the models.
                raw_data = df.head(50).to_dict(orient='records')
                markov_res, trans_matrix = run_markov_model(df)
                heuristic_res = run_heuristic_models(df)
                # Format results for the comparison chart (union of channels
                # seen by either model family).
                channels = set(markov_res.keys()) | set(heuristic_res['Last Touch'].keys())
                channels = sorted(list(channels))
                comparison_data = {
                    'channels': channels,
                    'Markov': [round(markov_res.get(c, {}).get('value', 0), 2) for c in channels],
                    'Last Touch': [round(heuristic_res['Last Touch'].get(c, 0), 2) for c in channels],
                    'First Touch': [round(heuristic_res['First Touch'].get(c, 0), 2) for c in channels],
                    'Linear': [round(heuristic_res['Linear'].get(c, 0), 2) for c in channels],
                }
                # Sankey logic (duplicated from simulate — a shared helper would
                # be cleaner but is kept inline for speed). Channels are
                # suffixed with their step index so repeats form a DAG.
                sankey_nodes = set()
                sankey_links = []
                # Top 50 converting paths by conversion volume.
                top_paths = df[df['total_conversions'] > 0].sort_values('total_conversions', ascending=False).head(50)
                for _, row in top_paths.iterrows():
                    path_str = row['path']
                    count = row['total_conversions']
                    steps = path_str.split(' > ')
                    for i in range(len(steps) - 1):
                        source = f"{steps[i]} (Step {i+1})"
                        target = f"{steps[i+1]} (Step {i+2})"
                        sankey_nodes.add(source)
                        sankey_nodes.add(target)
                        sankey_links.append({'source': source, 'target': target, 'value': count})
                # Merge parallel links between the same pair of nodes.
                links_dict = {}
                for link in sankey_links:
                    key = f"{link['source']}->{link['target']}"
                    if key not in links_dict: links_dict[key] = {'source': link['source'], 'target': link['target'], 'value': 0}
                    links_dict[key]['value'] += link['value']
                final_links = list(links_dict.values())
                final_nodes = [{'name': n} for n in sankey_nodes]
                return jsonify({
                    'status': 'success',
                    'comparison': comparison_data,
                    'sankey': {'nodes': final_nodes, 'links': final_links},
                    'raw_sample': raw_data
                })
            except Exception as e:
                # Parsing/model failures surface as a processing error.
                return jsonify({'error': f'Process Error: {str(e)}'}), 500
    except Exception as e:
        # Unexpected failure outside the processing block.
        return jsonify({'error': str(e)}), 500
def export_template():
    """Serve a downloadable CSV template showing the expected upload format."""
    from flask import Response
    header = "path,total_conversions,total_conversion_value,total_null"
    examples = [
        "Paid Search > Email,1,100,0",
        "Direct > Direct,0,0,1",
    ]
    body = "\n".join([header] + examples)
    return Response(
        body,
        mimetype="text/csv",
        headers={"Content-disposition": "attachment; filename=template.csv"},
    )
if __name__ == '__main__':
    # Bind on all interfaces; port 7860 suggests a Hugging Face Spaces
    # deployment (see the "Spaces" header above) — confirm before changing.
    app.run(host='0.0.0.0', port=7860)