| import os |
| import random |
| import json |
| from datetime import datetime, timedelta |
| import pandas as pd |
| import numpy as np |
| from flask import Flask, render_template, jsonify, request |
| from werkzeug.utils import secure_filename |
| from collections import defaultdict |
|
|
# Flask application object. MAX_CONTENT_LENGTH caps the accepted request body
# size (here 50 MiB).
app = Flask(__name__)
app.config.update(MAX_CONTENT_LENGTH=50 * 1024 * 1024)
|
|
| |
# Marketing channels that can be inserted as an extra touchpoint in mock paths.
CHANNELS = [
    'Organic Search',
    'Social Media',
    'Paid Search',
    'Email',
    'Direct',
    'Referral',
    'Display Ads',
]

# Mean and standard deviation of the normal distribution from which mock
# conversion values are drawn.
CONVERSION_VALUE_MEAN = 100
CONVERSION_VALUE_STD = 30
|
|
def generate_mock_data(num_conversions=1000):
    """Generate a DataFrame of synthetic conversion paths.

    Each row starts from one of a fixed set of weighted base paths. With
    probability 0.3 one extra channel drawn from ``CHANNELS`` is inserted at
    a random position. The conversion value is drawn from a normal
    distribution (``CONVERSION_VALUE_MEAN`` / ``CONVERSION_VALUE_STD``) and
    floored at 10.

    Args:
        num_conversions: Number of rows to generate. Zero or a negative
            number yields an empty frame that still carries every column.

    Returns:
        pandas.DataFrame with the columns ``path`` (channels joined by
        ``' > '``), ``conversion_value`` (float rounded to two decimals),
        ``conversion`` (always 1) and ``null_conversion`` (always 0).
    """
    columns = ['path', 'conversion_value', 'conversion', 'null_conversion']

    # (base path, relative weight). Paths contain real channels only; the
    # conversion itself is not a touchpoint and must not appear as one.
    patterns = [
        (['Organic Search', 'Direct'], 0.3),
        (['Social Media', 'Organic Search', 'Direct'], 0.2),
        (['Paid Search', 'Direct'], 0.15),
        (['Display Ads', 'Paid Search'], 0.1),
        (['Email', 'Direct'], 0.1),
        (['Social Media', 'Paid Search', 'Email', 'Direct'], 0.1),
        (['Referral', 'Organic Search'], 0.05),
    ]
    base_paths = [base for base, _ in patterns]
    weights = [weight for _, weight in patterns]

    rows = []
    for _ in range(num_conversions):
        # Copy so the random insertion below never mutates a shared pattern.
        path = list(random.choices(base_paths, weights=weights)[0])
        if random.random() < 0.3:
            path.insert(random.randint(0, len(path)), random.choice(CHANNELS))

        value = max(10, np.random.normal(CONVERSION_VALUE_MEAN, CONVERSION_VALUE_STD))

        rows.append({
            'path': ' > '.join(path),
            'conversion_value': round(float(value), 2),
            'conversion': 1,
            'null_conversion': 0,
        })

    # Passing the columns explicitly keeps the schema stable for empty output.
    return pd.DataFrame(rows, columns=columns)
|
|
def calculate_markov_attribution(df):
    """Return per-channel attribution for the "Markov" model slot.

    NOTE: a real Markov-chain removal-effect computation is not implemented.
    This function delegates to ``calculate_heuristic_markov`` and returns its
    result unchanged, without building a transition matrix.

    Args:
        df: DataFrame forwarded as-is to ``calculate_heuristic_markov``.

    Returns:
        Whatever ``calculate_heuristic_markov(df)`` returns.
    """
    return calculate_heuristic_markov(df)
|
|
def calculate_heuristic_markov(df):
    """Split each conversion's value across its path by touch position.

    A single-touch path receives the full value and a two-touch path is split
    50/50. Longer paths give 40% to the first touch, 40% to the last touch,
    and share the remaining 20% equally among the touches in between.

    Credit is accumulated per touch rather than per channel name, so a
    channel that occurs several times in one path receives the sum of its
    shares and the full conversion value is always distributed.

    Args:
        df: DataFrame with a ``path`` column (channels joined by ``' > '``)
            and a numeric ``conversion_value`` column.

    Returns:
        dict mapping channel name to attributed value as a Python float.
    """
    attribution = defaultdict(float)

    for raw_path, value in zip(df['path'], df['conversion_value']):
        steps = str(raw_path).split(' > ')
        n = len(steps)

        if n == 1:
            shares = [1.0]
        elif n == 2:
            shares = [0.5, 0.5]
        else:
            shares = [0.4] + [0.2 / (n - 2)] * (n - 2) + [0.4]

        for channel, share in zip(steps, shares):
            attribution[channel] += value * share

    return {k: float(v) for k, v in attribution.items()}
|
|
def calculate_all_models(df):
    """Compute the rule-based attribution models for every path in ``df``.

    Models produced (keys of the returned dict, in this order):

    * ``'Last Click'``     - full value to the last touch.
    * ``'First Click'``    - full value to the first touch.
    * ``'Linear'``         - value split equally across all touches.
    * ``'Time Decay'``     - touches are treated as 2 days apart and a
      touch's weight halves for every 7 days before the last touch; weights
      are normalised per path.
    * ``'Position Based'`` - 40% first, 40% last, 20% shared by the middle
      touches (50/50 for two touches, 100% for one).

    Args:
        df: DataFrame with a ``path`` column (channels joined by ``' > '``)
            and a ``conversion_value`` column. Values that cannot be parsed
            as numbers count as 0. ``df`` itself is not modified.

    Returns:
        dict mapping model name to a dict of channel name -> attributed value
        (Python floats).
    """
    # Coerce into a local Series before doing any arithmetic: this keeps the
    # caller's frame untouched and avoids summing a column that still holds
    # strings.
    values = pd.to_numeric(df['conversion_value'], errors='coerce').fillna(0)

    # Time-decay parameters.
    days_between_touches = 2
    half_life_days = 7

    last_click = defaultdict(float)
    first_click = defaultdict(float)
    linear = defaultdict(float)
    time_decay = defaultdict(float)
    position = defaultdict(float)

    # One pass over the rows feeds all five models.
    for raw_path, value in zip(df['path'], values):
        steps = str(raw_path).split(' > ')
        n = len(steps)

        last_click[steps[-1]] += value
        first_click[steps[0]] += value

        equal_share = value / n
        for step in steps:
            linear[step] += equal_share

        decay = [
            2 ** (-((n - 1 - i) * days_between_touches) / half_life_days)
            for i in range(n)
        ]
        decay_total = sum(decay)
        for step, weight in zip(steps, decay):
            time_decay[step] += value * (weight / decay_total)

        if n == 1:
            position[steps[0]] += value
        elif n == 2:
            position[steps[0]] += value * 0.5
            position[steps[1]] += value * 0.5
        else:
            position[steps[0]] += value * 0.4
            position[steps[-1]] += value * 0.4
            middle_share = (value * 0.2) / (n - 2)
            for step in steps[1:-1]:
                position[step] += middle_share

    named_models = [
        ('Last Click', last_click),
        ('First Click', first_click),
        ('Linear', linear),
        ('Time Decay', time_decay),
        ('Position Based', position),
    ]
    return {
        name: {channel: float(total) for channel, total in credit.items()}
        for name, credit in named_models
    }
|
|
@app.route('/')
def index():
    """Render and return the ``index.html`` template for the root URL."""
    page = render_template('index.html')
    return page
|
|
@app.route('/api/generate', methods=['POST'])
def generate():
    """Generate mock conversion data and return the full analysis as JSON.

    Request body (optional JSON object): ``{"count": <int>}``. ``count``
    defaults to 1000 when the body is missing or is not JSON.

    Returns:
        The response from ``process_dataframe`` on success; a 400 JSON error
        for an invalid ``count`` or a non-object body; a 500 JSON error for
        unexpected failures.
    """
    # Upper bound on rows per request: the endpoint is reachable by any
    # client and every row is processed in Python loops downstream.
    max_count = 100_000

    # silent=True returns None instead of raising when the body is absent,
    # malformed, or sent with a non-JSON content type.
    payload = request.get_json(silent=True)
    if payload is None:
        payload = {}
    if not isinstance(payload, dict):
        return jsonify({'status': 'error', 'message': '请求体必须是 JSON 对象'}), 400

    try:
        count = int(payload.get('count', 1000))
    except (TypeError, ValueError, OverflowError):
        return jsonify({'status': 'error', 'message': '参数 count 必须是整数'}), 400

    if not 1 <= count <= max_count:
        return jsonify({
            'status': 'error',
            'message': f'参数 count 必须在 1 到 {max_count} 之间',
        }), 400

    try:
        return process_dataframe(generate_mock_data(count))
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
|
|
@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Analyse an uploaded CSV of conversion paths and return the result as JSON.

    The multipart field ``file`` must hold a CSV. It is read as UTF-8 first
    and as GBK if that fails with a decode error. The columns ``path`` and
    ``conversion_value`` may be supplied under known aliases; a missing
    ``conversion_value`` column defaults to 1 for every row.

    Returns:
        The response from ``process_dataframe`` on success; a 400 JSON error
        when no file is supplied, the CSV cannot be parsed, or no path column
        exists; a 500 JSON error for unexpected failures.
    """
    if 'file' not in request.files:
        return jsonify({'status': 'error', 'message': '没有上传文件'}), 400
    file = request.files['file']
    # Covers both an empty and a missing filename, so every request that
    # reaches the parsing step has a usable upload.
    if not file.filename:
        return jsonify({'status': 'error', 'message': '未选择文件'}), 400

    # Accepted alternative headers, in priority order, per canonical column.
    column_aliases = {
        'path': ('路径', '转化路径', 'Touchpoints'),
        'conversion_value': ('Value', '转化价值', 'Revenue'),
    }

    try:
        try:
            df = pd.read_csv(file)
        except UnicodeDecodeError:
            # Rewind before retrying with the fallback encoding.
            file.seek(0)
            df = pd.read_csv(file, encoding='gbk')

        # Rename at most one alias per canonical column, and only when the
        # canonical column is absent, so duplicate column labels can never
        # be created.
        for canonical, aliases in column_aliases.items():
            if canonical in df.columns:
                continue
            alias = next((name for name in aliases if name in df.columns), None)
            if alias is not None:
                df = df.rename(columns={alias: canonical})

        if 'path' not in df.columns:
            return jsonify({'status': 'error', 'message': 'CSV 文件缺少 "path" (或 "路径") 列'}), 400

        if 'conversion_value' not in df.columns:
            df['conversion_value'] = 1

        return process_dataframe(df)

    except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # The upload itself is unreadable: a client error, not a server fault.
        return jsonify({'status': 'error', 'message': f'文件处理失败: {str(e)}'}), 400
    except Exception as e:
        return jsonify({'status': 'error', 'message': f'文件处理失败: {str(e)}'}), 500
|
|
def process_dataframe(df):
    """Run every analysis on ``df`` and package the result as a JSON response.

    Args:
        df: DataFrame with ``path`` and ``conversion_value`` columns. Rows
            whose path is missing are ignored; values that cannot be parsed
            as numbers count as 0. The caller's frame is not modified.

    Returns:
        A JSON response with ``status``, ``models``, ``sankey``, ``preview``
        (first 20 rows) and ``summary``; on any failure, a JSON error
        response with status code 500.
    """
    try:
        # Drop rows without a path: casting NaN to str would otherwise create
        # a bogus channel literally named "nan". The copy keeps the caller's
        # frame untouched.
        df = df[df['path'].notna()].copy()
        df['path'] = df['path'].astype(str)
        df['conversion_value'] = pd.to_numeric(df['conversion_value'], errors='coerce').fillna(0).astype(float)

        models = calculate_all_models(df)
        sankey_data = process_sankey(df)

        # NaN is not valid JSON; emit null for missing cells in the preview.
        preview = [
            {column: (None if pd.isna(cell) else cell) for column, cell in record.items()}
            for record in df.head(20).to_dict(orient='records')
        ]

        return jsonify({
            'status': 'success',
            'models': models,
            'sankey': sankey_data,
            'preview': preview,
            'summary': {
                'total_conversions': int(len(df)),
                'total_value': round(float(df['conversion_value'].sum()), 2)
            }
        })
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
|
|
def process_sankey(df):
    """Build node and link lists for a Sankey diagram of the conversion paths.

    Only the first five touches of each path are used. Every touch is
    labelled ``"<channel> (Step <k>)"`` and the last retained touch links to
    a terminal ``"Conversion"`` node.

    Args:
        df: DataFrame with a ``path`` column (channels joined by ``' > '``).

    Returns:
        dict with ``nodes`` (list of ``{"name": ...}`` sorted by name) and
        ``links`` (list of ``{"source", "target", "value"}`` where ``value``
        is the number of paths using that link).
    """
    max_depth = 5
    flow_counts = {}
    seen_nodes = set()

    for raw_path in df['path']:
        touchpoints = str(raw_path).split(' > ')[:max_depth]

        # Label each touch with its 1-based position, then close the chain
        # with the terminal node.
        labels = [
            f"{channel} (Step {position})"
            for position, channel in enumerate(touchpoints, start=1)
        ]
        labels.append("Conversion")

        for edge in zip(labels, labels[1:]):
            flow_counts[edge] = flow_counts.get(edge, 0) + 1
        seen_nodes.update(labels)

    nodes_list = [{"name": label} for label in sorted(seen_nodes)]
    links_list = [
        {"source": source, "target": target, "value": count}
        for (source, target), count in flow_counts.items()
    ]

    return {"nodes": nodes_list, "links": links_list}
|
|
|
|
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger
    # (which lets a client execute arbitrary code) must not be on by default.
    # Opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)
|
|