Spaces:
Sleeping
Sleeping
| import os | |
| import random | |
| import json | |
| import pandas as pd | |
| import numpy as np | |
| import networkx as nx | |
| from flask import Flask, render_template, jsonify, request | |
# Flask application setup.
app = Flask(__name__)
# Random secret key generated per process; sessions will not survive a
# restart — acceptable for a demo app.
app.secret_key = os.urandom(24)
# Cap request bodies at 16 MB to bound memory use when parsing uploaded CSVs.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024
| # --- Logic Core --- | |
def generate_mock_data(n_paths=500):
    """Build a synthetic, path-aggregated conversion dataset.

    Returns a DataFrame with one row per unique path string and summed
    'total_conversions', 'total_conversion_value' and 'total_null' columns.
    """
    channels = ['付费搜索', '社交媒体', '邮件营销', '展示广告', '直接访问', '推介引流']
    rows = []
    for _ in range(n_paths):
        # Journey length 1..5, biased toward shorter paths.
        steps = random.choices([1, 2, 3, 4, 5], weights=[0.3, 0.3, 0.2, 0.1, 0.1])[0]
        journey = random.choices(channels, k=steps)

        # Base conversion chance, bumped when specific channels appear
        # (simulates differing channel efficiencies).
        prob = 0.05
        if '邮件营销' in journey:
            prob += 0.1
        if '付费搜索' in journey:
            prob += 0.05
        if '直接访问' in journey and journey[-1] == '直接访问':
            prob += 0.15

        hit = 1 if random.random() < min(prob, 0.8) else 0
        rows.append({
            'path': ' > '.join(journey),
            'total_conversions': hit,
            'total_conversion_value': random.randint(50, 200) if hit else 0,
            'total_null': 1 - hit,
        })

    # Collapse duplicate paths into aggregated totals ("Path Data" format).
    return (
        pd.DataFrame(rows)
        .groupby('path')
        .agg({
            'total_conversions': 'sum',
            'total_conversion_value': 'sum',
            'total_null': 'sum',
        })
        .reset_index()
    )
def run_markov_model(df):
    """Markov-chain (removal-effect) attribution.

    Parameters:
        df: DataFrame with columns 'path' (channel names joined by ' > '),
            'total_conversions', 'total_conversion_value', 'total_null'.

    Returns:
        (attribution, trans_matrix) where attribution maps channel ->
        {'conversions': float, 'value': float} and trans_matrix maps
        source state -> {target state: transition probability}.
    """
    # 1. Count observed state transitions, framing every path with a
    #    synthetic (start) state and a (conversion)/(null) terminal state.
    #    NOTE: a row with any conversions is treated as fully converting.
    transitions = {}
    for _, row in df.iterrows():
        path = row['path'].split(' > ')
        full_path = ['(start)'] + path
        full_path.append('(conversion)' if row['total_conversions'] > 0 else '(null)')
        for i in range(len(full_path) - 1):
            source = full_path[i]
            target = full_path[i + 1]
            bucket = transitions.setdefault(source, {})
            bucket[target] = bucket.get(target, 0) + 1

    # 2. Normalize counts into per-source transition probabilities.
    trans_matrix = {}
    for source, targets in transitions.items():
        total_visits = sum(targets.values())
        trans_matrix[source] = {t: count / total_visits for t, count in targets.items()}

    # 3. Removal effects: how much the start->conversion probability drops
    #    when each channel is deleted from the transition graph.
    G = nx.DiGraph()
    for source, targets in trans_matrix.items():
        for target, prob in targets.items():
            G.add_edge(source, target, weight=prob)
    channels = [n for n in G.nodes() if n not in ['(start)', '(conversion)', '(null)']]

    try:
        base_prob = _calculate_conversion_prob(G)
    except Exception:  # was a bare except; keep the fallback but don't trap SystemExit etc.
        base_prob = 0

    attribution = {}
    total_removal_effect = 0
    removal_effects = {}
    if base_prob > 0:
        for channel in channels:
            # Recompute conversion probability with this channel removed.
            G_temp = G.copy()
            G_temp.remove_node(channel)
            new_prob = _calculate_conversion_prob(G_temp)
            removal_effect = 1 - (new_prob / base_prob)
            removal_effects[channel] = removal_effect
            total_removal_effect += removal_effect
        # Normalize removal effects into shares of the real totals.
        total_conversions = df['total_conversions'].sum()
        total_value = df['total_conversion_value'].sum()
        for channel, effect in removal_effects.items():
            share = effect / total_removal_effect if total_removal_effect > 0 else 0
            attribution[channel] = {
                'conversions': share * total_conversions,
                'value': share * total_value,
            }
    else:
        # Degenerate graph (no start->conversion mass): attribute nothing.
        for channel in channels:
            attribution[channel] = {'conversions': 0, 'value': 0}
    return attribution, trans_matrix
| def _calculate_conversion_prob(G): | |
| """Calculates probability of reaching (conversion) from (start).""" | |
| if not G.has_node('(start)') or not G.has_node('(conversion)'): | |
| return 0 | |
| # Simple path multiplication (can be slow for complex graphs, but okay for marketing paths) | |
| # Better approach: Matrix multiplication or solving linear equations | |
| # For simplicity and speed in this demo, we use a simplified approach or NetworkX algorithms | |
| # Actually, for small graphs, we can use simple flow or just assume acyclic (which is not always true) | |
| # Let's use a robust method: Matrix Power or Absorbing Markov Chains | |
| nodes = list(G.nodes()) | |
| idx = {n: i for i, n in enumerate(nodes)} | |
| n = len(nodes) | |
| M = np.zeros((n, n)) | |
| for u, v, data in G.edges(data=True): | |
| M[idx[u]][idx[v]] = data['weight'] | |
| # Start vector | |
| start_vec = np.zeros(n) | |
| start_vec[idx['(start)']] = 1 | |
| # Iterate (approximate infinite steps) | |
| curr = start_vec | |
| for _ in range(20): # Marketing paths are rarely > 20 deep | |
| curr = curr @ M | |
| return curr[idx['(conversion)']] | |
def run_heuristic_models(df):
    """First Touch / Last Touch / Linear attribution of conversion value.

    Only rows with at least one conversion contribute.

    Returns:
        {'Last Touch': {channel: value}, 'First Touch': {...}, 'Linear': {...}}
    """
    models = {
        'Last Touch': {},
        'First Touch': {},
        'Linear': {},
    }
    # (removed unused total_conv aggregate from the original implementation)
    for _, row in df.iterrows():
        if row['total_conversions'] == 0:
            continue
        path = row['path'].split(' > ')
        val = row['total_conversion_value']
        # Last Touch: all value to the final channel.
        lt = path[-1]
        models['Last Touch'][lt] = models['Last Touch'].get(lt, 0) + val
        # First Touch: all value to the opening channel.
        ft = path[0]
        models['First Touch'][ft] = models['First Touch'].get(ft, 0) + val
        # Linear: value split evenly over every step (repeat occurrences
        # of a channel each receive a share).
        share = val / len(path)
        for channel in path:
            models['Linear'][channel] = models['Linear'].get(channel, 0) + share
    return models
| # --- Routes --- | |
def index():
    """Serve the single-page UI.

    NOTE(review): no @app.route decorator is visible here — it was
    presumably lost in extraction; confirm how this view is registered.
    """
    template_name = 'index.html'
    return render_template(template_name)
def simulate():
    """Generate mock path data, run every attribution model and return
    chart-ready JSON (comparison bars, Sankey diagram, raw sample).

    NOTE(review): no @app.route decorator is visible here — presumably lost
    in extraction; confirm the URL rule this handler is registered under.
    """
    try:
        df = generate_mock_data(n_paths=1000)
        # Sample of rows for the frontend table.
        raw_data = df.head(50).to_dict(orient='records')

        # Run models (transition matrix from the Markov model is unused here).
        markov_res, _ = run_markov_model(df)
        heuristic_res = run_heuristic_models(df)

        # Format results for ECharts.
        channels = sorted(set(markov_res.keys()) | set(heuristic_res['Last Touch'].keys()))
        comparison_data = {
            'channels': channels,
            'Markov': [round(markov_res.get(c, {}).get('value', 0), 2) for c in channels],
            'Last Touch': [round(heuristic_res['Last Touch'].get(c, 0), 2) for c in channels],
            'First Touch': [round(heuristic_res['First Touch'].get(c, 0), 2) for c in channels],
            'Linear': [round(heuristic_res['Linear'].get(c, 0), 2) for c in channels],
        }

        # Sankey: suffix each channel with its step index so repeated channels
        # become distinct nodes (keeps the diagram a DAG).
        # BUGFIX: df is aggregated to one row per unique path, so the previous
        # path.value_counts() always produced a weight of 1 per link. Weight
        # links by the actual conversion totals instead, matching upload_file.
        sankey_nodes = set()
        links_dict = {}
        top_paths = (df[df['total_conversions'] > 0]
                     .sort_values('total_conversions', ascending=False)
                     .head(50))  # top 50 paths to avoid clutter
        for _, row in top_paths.iterrows():
            steps = row['path'].split(' > ')
            count = int(row['total_conversions'])  # plain int for JSON serialization
            for i in range(len(steps) - 1):
                source = f"{steps[i]} (Step {i+1})"
                target = f"{steps[i+1]} (Step {i+2})"
                sankey_nodes.add(source)
                sankey_nodes.add(target)
                key = f"{source}->{target}"
                if key not in links_dict:
                    links_dict[key] = {'source': source, 'target': target, 'value': 0}
                links_dict[key]['value'] += count
        final_links = list(links_dict.values())
        final_nodes = [{'name': n} for n in sankey_nodes]

        return jsonify({
            'status': 'success',
            'comparison': comparison_data,
            'sankey': {'nodes': final_nodes, 'links': final_links},
            'raw_sample': raw_data,
        })
    except Exception as e:
        import traceback
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
def upload_file():
    """Handle a user-uploaded CSV of path data and run all attribution models.

    NOTE(review): no @app.route decorator is visible here — presumably lost
    in extraction; confirm how this handler is registered.
    """
    try:
        # Standard Flask multipart-upload checks.
        if 'file' not in request.files:
            return jsonify({'error': 'No file part'}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No selected file'}), 400
        if file:
            try:
                # Read CSV straight from the file stream.
                df = pd.read_csv(file)
                # Validation: canonical column set (matches export_template's header).
                required_cols = ['path', 'total_conversions', 'total_conversion_value', 'total_null']
                if not all(col in df.columns for col in required_cols):
                    # Try to adapt if columns use shorter / legacy names.
                    if 'path' in df.columns and 'conversions' in df.columns:
                        df.rename(columns={'conversions': 'total_conversions', 'value': 'total_conversion_value', 'nulls': 'total_null'}, inplace=True)
                        # Synthesize missing columns so downstream models can run.
                        if 'total_conversion_value' not in df.columns: df['total_conversion_value'] = df['total_conversions'] * 100  # Dummy value
                        if 'total_null' not in df.columns: df['total_null'] = 0
                    else:
                        return jsonify({'error': f'CSV format error. Required columns: {required_cols}'}), 400
                # Fill NaNs so numeric comparisons and sums are safe.
                df.fillna(0, inplace=True)
                # Process: sample for display, then run the models.
                raw_data = df.head(50).to_dict(orient='records')
                markov_res, trans_matrix = run_markov_model(df)
                heuristic_res = run_heuristic_models(df)
                # Format results for the comparison chart (union of channels
                # seen by either model family).
                channels = set(markov_res.keys()) | set(heuristic_res['Last Touch'].keys())
                channels = sorted(list(channels))
                comparison_data = {
                    'channels': channels,
                    'Markov': [round(markov_res.get(c, {}).get('value', 0), 2) for c in channels],
                    'Last Touch': [round(heuristic_res['Last Touch'].get(c, 0), 2) for c in channels],
                    'First Touch': [round(heuristic_res['First Touch'].get(c, 0), 2) for c in channels],
                    'Linear': [round(heuristic_res['Linear'].get(c, 0), 2) for c in channels],
                }
                # Sankey logic (duplicated from simulate — a shared helper would
                # be cleaner but is kept inline for speed). Channels are
                # suffixed with their step index so repeats form a DAG.
                sankey_nodes = set()
                sankey_links = []
                # Top 50 converting paths by conversion volume.
                top_paths = df[df['total_conversions'] > 0].sort_values('total_conversions', ascending=False).head(50)
                for _, row in top_paths.iterrows():
                    path_str = row['path']
                    count = row['total_conversions']
                    steps = path_str.split(' > ')
                    for i in range(len(steps) - 1):
                        source = f"{steps[i]} (Step {i+1})"
                        target = f"{steps[i+1]} (Step {i+2})"
                        sankey_nodes.add(source)
                        sankey_nodes.add(target)
                        sankey_links.append({'source': source, 'target': target, 'value': count})
                # Merge parallel links between the same pair of nodes.
                links_dict = {}
                for link in sankey_links:
                    key = f"{link['source']}->{link['target']}"
                    if key not in links_dict: links_dict[key] = {'source': link['source'], 'target': link['target'], 'value': 0}
                    links_dict[key]['value'] += link['value']
                final_links = list(links_dict.values())
                final_nodes = [{'name': n} for n in sankey_nodes]
                return jsonify({
                    'status': 'success',
                    'comparison': comparison_data,
                    'sankey': {'nodes': final_nodes, 'links': final_links},
                    'raw_sample': raw_data
                })
            except Exception as e:
                # Parsing/model failures surface as a processing error.
                return jsonify({'error': f'Process Error: {str(e)}'}), 500
    except Exception as e:
        # Unexpected failure outside the processing block.
        return jsonify({'error': str(e)}), 500
def export_template():
    """Serve a downloadable CSV template showing the expected upload format."""
    from flask import Response
    header = "path,total_conversions,total_conversion_value,total_null"
    examples = [
        "Paid Search > Email,1,100,0",
        "Direct > Direct,0,0,1",
    ]
    body = "\n".join([header] + examples)
    return Response(
        body,
        mimetype="text/csv",
        headers={"Content-disposition": "attachment; filename=template.csv"},
    )
if __name__ == '__main__':
    # Bind on all interfaces; port 7860 suggests a Hugging Face Spaces
    # deployment (see the "Spaces" header above) — confirm before changing.
    app.run(host='0.0.0.0', port=7860)