Spaces:
Sleeping
Sleeping
File size: 5,003 Bytes
faebc8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
"""
Synthetic Fund Data Generator
Generates realistic mock fund data for the Alpha-Index project
"""
import pandas as pd
import numpy as np
from faker import Faker
import uuid
# Seed both random sources so repeated runs produce the same dataset:
# NumPy's global RNG drives the numeric draws in generate_fund_data, and the
# class-level Faker seed drives the company names produced through `fake`.
# NOTE: fund_id values come from uuid.uuid4(), which neither seed controls,
# so the ids still differ from run to run.
np.random.seed(42)
fake = Faker()
Faker.seed(42)
def generate_fund_data(n_funds=5000):
    """Generate a synthetic private-markets fund dataset.

    Every fund gets a random vintage year, strategy and size, plus noisy
    macro indicators looked up from its vintage year. A hidden performance
    score is derived from those features and used only to label the top 25%
    of funds; the score itself is not part of the returned frame.

    Numeric draws use NumPy's global RNG and manager names use the
    module-level ``fake`` Faker instance, so seeding both makes everything
    reproducible except ``fund_id`` (``uuid.uuid4`` ignores both seeds).

    Args:
        n_funds: Number of funds to generate. Zero (or a negative value)
            yields an empty frame that still carries the full column layout.

    Returns:
        pandas.DataFrame with one row per fund and the columns ``fund_id``,
        ``fund_manager``, ``vintage_year``, ``strategy``, ``fund_size_mil``,
        ``macro_interest_rate_at_launch``, ``public_market_pe_at_launch``
        and ``is_top_quartile`` (1 when the hidden score is at or above the
        75th percentile of the generated batch, otherwise 0).
    """
    output_columns = [
        'fund_id',
        'fund_manager',
        'vintage_year',
        'strategy',
        'fund_size_mil',
        'macro_interest_rate_at_launch',
        'public_market_pe_at_launch',
        'is_top_quartile',
    ]

    strategies = ['Buyout', 'Venture Capital', 'Growth Equity', 'Private Credit', 'Real Estate']
    strategy_weights = [0.35, 0.25, 0.15, 0.15, 0.10]  # Buyout and VC are most common

    # Per-strategy parameters:
    #   (lognormal mean of size, lognormal sigma of size, score bonus, score noise sigma)
    strategy_profiles = {
        'Buyout': (6.5, 1.2, 5, 3),           # larger funds
        'Venture Capital': (5.0, 1.0, 3, 5),  # smaller funds, high score variance
        'Growth Equity': (6.0, 1.1, 4, 4),
        'Private Credit': (6.2, 1.0, 6, 2),   # most stable scores
        'Real Estate': (5.5, 1.2, 4, 3),
    }

    vintage_years = list(range(2010, 2024))

    # Interest rate per vintage year (simplified)
    interest_rates = {
        2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48,
        2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40,
        2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06,
    }

    # Public market P/E ratio per vintage year
    pe_ratios = {
        2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8,
        2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1,
        2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1,
    }

    optimal_pe = 19  # the P/E score term peaks here and falls off linearly

    data = []
    # NOTE: the order of the random draws below is deliberate; changing it
    # would change the dataset produced for a given seed.
    for _ in range(n_funds):
        fund_id = str(uuid.uuid4())
        fund_manager = f"{fake.company()} Partners"
        vintage_year = np.random.choice(vintage_years)
        strategy = np.random.choice(strategies, p=strategy_weights)
        size_mean, size_sigma, score_bonus, score_sigma = strategy_profiles[strategy]

        # Fund size (in millions) varies by strategy; clamp to 50M .. 10B
        fund_size = np.random.lognormal(mean=size_mean, sigma=size_sigma)
        fund_size = max(50, min(10000, fund_size))

        # Macro indicators: the vintage-year value plus per-fund noise
        interest_rate = interest_rates[vintage_year] + np.random.normal(0, 0.1)
        pe_ratio = pe_ratios[vintage_year] + np.random.normal(0, 1.5)

        # Hidden performance score, accumulated term by term
        score = 0
        # Lower interest rates score higher
        score += (5 - interest_rate) * 3
        # P/E ratios close to optimal_pe score higher
        score += 10 - abs(pe_ratio - optimal_pe) * 0.5
        # Linear in log(size): larger funds always score higher, up to the cap
        score += (np.log(fund_size) - 5) * 2
        # Strategy bonus with strategy-specific noise
        score += score_bonus + np.random.normal(0, score_sigma)
        # Extra idiosyncratic noise
        score += np.random.normal(0, 4)

        data.append({
            'fund_id': fund_id,
            'fund_manager': fund_manager,
            'vintage_year': vintage_year,
            'strategy': strategy,
            'fund_size_mil': round(fund_size, 0),
            'macro_interest_rate_at_launch': round(interest_rate, 2),
            'public_market_pe_at_launch': round(pe_ratio, 1),
            # Raw, un-normalized score; only its rank within the batch matters
            'hidden_score': score,
        })

    if not data:
        # No rows means no 'hidden_score' column to rank on; return an empty
        # frame with the documented schema instead of failing with KeyError.
        empty = pd.DataFrame(columns=output_columns)
        return empty.astype({'is_top_quartile': int})

    df = pd.DataFrame(data)

    # Funds at or above the 75th percentile of the hidden score are top quartile
    threshold = df['hidden_score'].quantile(0.75)
    df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int)

    # Drop the hidden score (it is only needed to derive the label)
    return df.drop('hidden_score', axis=1)
if __name__ == "__main__":
    # Script entry point: generate the dataset, print a short summary and
    # write it to a CSV file relative to the current working directory.
    # Local import: only the script path needs filesystem helpers.
    from pathlib import Path

    print("Generating synthetic fund data...")
    df = generate_fund_data(n_funds=5000)
    print(f"\nGenerated {len(df)} funds")
    print(f"Top quartile funds: {df['is_top_quartile'].sum()} ({df['is_top_quartile'].mean()*100:.1f}%)")
    print("\nStrategy distribution:")
    print(df['strategy'].value_counts())

    # Save to CSV. to_csv does not create missing parent directories, so make
    # sure the output folder exists before writing.
    output_path = 'data/mock_fund_data.csv'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nData saved to {output_path}")
|