File size: 5,003 Bytes
faebc8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Synthetic Fund Data Generator
Generates realistic mock fund data for the Alpha-Index project
"""

import pandas as pd
import numpy as np
from faker import Faker
import uuid

# Seed NumPy's global generator and Faker from one shared value so both
# random sources start from the same known state on every run.
_RANDOM_SEED = 42
np.random.seed(_RANDOM_SEED)
fake = Faker()
Faker.seed(_RANDOM_SEED)

def generate_fund_data(n_funds=5000):
    """Generate a synthetic fund dataset with a binary top-quartile label.

    Each fund gets a random vintage year, strategy, size and two macro
    indicators looked up by vintage year (plus Gaussian jitter). A hidden
    score is built from those features and random noise; funds whose score
    is at or above the 75th percentile are labelled ``is_top_quartile = 1``.
    The hidden score itself is dropped from the result.

    Random numbers come from NumPy's global generator and the module-level
    ``fake`` instance, so results depend on how those were seeded and on any
    draws made before this call. Fund IDs are derived from the same NumPy
    state (without consuming from it), so they are reproducible too.

    Args:
        n_funds: Number of funds to generate. Values <= 0 yield an empty
            DataFrame that still carries the output columns.

    Returns:
        pandas.DataFrame with columns ``fund_id``, ``fund_manager``,
        ``vintage_year``, ``strategy``, ``fund_size_mil``,
        ``macro_interest_rate_at_launch``, ``public_market_pe_at_launch``
        and ``is_top_quartile``.
    """
    import random  # local import: only needed for the reproducible fund-ID stream

    output_columns = [
        'fund_id',
        'fund_manager',
        'vintage_year',
        'strategy',
        'fund_size_mil',
        'macro_interest_rate_at_launch',
        'public_market_pe_at_launch',
        'is_top_quartile',
    ]

    # Nothing to generate: return the schema only. Building the frame from an
    # empty record list would have no 'hidden_score' column to rank on.
    if n_funds <= 0:
        return pd.DataFrame(columns=output_columns)

    # uuid.uuid4() reads OS entropy and therefore ignores every seed. Derive a
    # dedicated ID generator from the current NumPy state instead: peek one
    # draw, then restore the state so the feature draws below are exactly the
    # ones they would have been without this step.
    np_state = np.random.get_state()
    id_rng = random.Random(int(np.random.randint(0, 2**31 - 1)))
    np.random.set_state(np_state)

    # Define realistic ranges and categories
    strategies = ['Buyout', 'Venture Capital', 'Growth Equity', 'Private Credit', 'Real Estate']
    strategy_weights = [0.35, 0.25, 0.15, 0.15, 0.10]  # Buyout and VC are most common

    vintage_years = list(range(2010, 2024))

    # Historical interest rates (simplified)
    interest_rates = {
        2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48,
        2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40,
        2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06
    }

    # Public market volatility (P/E ratios)
    pe_ratios = {
        2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8,
        2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1,
        2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1
    }

    # Log-normal (mean, sigma) of fund size per strategy; Buyout funds are the
    # largest on average, Venture Capital the smallest.
    size_params = {
        'Buyout': (6.5, 1.2),
        'Venture Capital': (5.0, 1.0),
        'Growth Equity': (6.0, 1.1),
        'Private Credit': (6.2, 1.0),
        'Real Estate': (5.5, 1.2),
    }

    # (base bonus, noise std-dev) added to the hidden score per strategy:
    # Venture Capital is the noisiest, Private Credit the most stable.
    strategy_effect = {
        'Buyout': (5, 3),
        'Venture Capital': (3, 5),
        'Growth Equity': (4, 4),
        'Private Credit': (6, 2),
        'Real Estate': (4, 3),
    }

    data = []

    # NOTE: the order of the random draws inside the loop is significant for
    # reproducibility; keep it stable when editing.
    for _ in range(n_funds):
        fund_id = str(uuid.UUID(int=id_rng.getrandbits(128), version=4))
        fund_manager = f"{fake.company()} Partners"
        vintage_year = np.random.choice(vintage_years)
        strategy = np.random.choice(strategies, p=strategy_weights)

        # Fund size varies by strategy
        size_mean, size_sigma = size_params[strategy]
        fund_size = np.random.lognormal(mean=size_mean, sigma=size_sigma)
        fund_size = max(50, min(10000, fund_size))  # Cap between 50M and 10B

        # Macro indicators for the vintage year, jittered per fund. The jitter
        # is not clipped, so low-rate years can produce slightly negative rates.
        interest_rate = interest_rates[vintage_year] + np.random.normal(0, 0.1)
        pe_ratio = pe_ratios[vintage_year] + np.random.normal(0, 1.5)

        # Hidden performance score: the ground truth behind the label.
        score = 0

        # Interest rate impact (lower is better)
        score += (5 - interest_rate) * 3

        # PE ratio impact: peaks at 19 and falls off linearly on either side
        optimal_pe = 19
        score += 10 - abs(pe_ratio - optimal_pe) * 0.5

        # Fund size impact: linear in log(size), so larger funds always score
        # higher, with diminishing returns in absolute size (no sweet spot).
        log_size = np.log(fund_size)
        score += (log_size - 5) * 2

        # Strategy impact
        base_bonus, noise_sd = strategy_effect[strategy]
        score += base_bonus + np.random.normal(0, noise_sd)

        # Add some random noise
        score += np.random.normal(0, 4)

        data.append({
            'fund_id': fund_id,
            'fund_manager': fund_manager,
            'vintage_year': vintage_year,
            'strategy': strategy,
            'fund_size_mil': round(fund_size, 0),
            'macro_interest_rate_at_launch': round(interest_rate, 2),
            'public_market_pe_at_launch': round(pe_ratio, 1),
            # Stored unnormalized: only its rank matters for the quartile cut.
            'hidden_score': score
        })

    df = pd.DataFrame(data)

    # Top quartile = score at or above the 75th percentile of this batch
    threshold = df['hidden_score'].quantile(0.75)
    df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int)

    # Drop the hidden score (it's only for generation)
    return df.drop('hidden_score', axis=1)

if __name__ == "__main__":
    # Script entry point: generate the default dataset, print a short summary
    # and write it to a CSV file under ./data.
    import os  # local import: only the script entry point touches the filesystem

    print("Generating synthetic fund data...")
    df = generate_fund_data(n_funds=5000)

    print(f"\nGenerated {len(df)} funds")
    print(f"Top quartile funds: {df['is_top_quartile'].sum()} ({df['is_top_quartile'].mean()*100:.1f}%)")
    print("\nStrategy distribution:")
    print(df['strategy'].value_counts())

    # Save to CSV. to_csv does not create missing parent directories, so make
    # sure the output folder exists before writing.
    output_path = 'data/mock_fund_data.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nData saved to {output_path}")