# Merger-and-Acquisition / generate_data.py
# AAdevloper
# Initial commit: Alpha-Index 100 Gradio app
# faebc8b
"""
Synthetic Fund Data Generator
Generates realistic mock fund data for the Alpha-Index project
"""
import pandas as pd
import numpy as np
from faker import Faker
import uuid
# Seed NumPy's global RNG and Faker's shared RNG so the values drawn from
# them in generate_fund_data (fund attributes via np.random.*, manager names
# via fake.company()) repeat from run to run.
# NOTE: fund_id values come from uuid.uuid4(), which these seeds do not
# control, so the IDs still differ between runs.
np.random.seed(42)
fake = Faker()
Faker.seed(42)
# Historical interest rates (simplified), keyed by vintage year.
_INTEREST_RATE_BY_VINTAGE = {
    2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48,
    2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40,
    2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06,
}

# Public-market P/E ratios, keyed by vintage year.
_PE_RATIO_BY_VINTAGE = {
    2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8,
    2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1,
    2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1,
}

# Per-strategy generation parameters (insertion order is the sampling order):
#   weight      - probability of drawing the strategy (Buyout and VC dominate)
#   size_mean   - `mean` of the log-normal fund-size draw
#   size_sigma  - `sigma` of the log-normal fund-size draw
#   score_base  - fixed contribution of the strategy to the hidden score
#   score_sigma - std-dev of the strategy-specific score noise
_STRATEGY_PROFILES = {
    'Buyout': {
        'weight': 0.35, 'size_mean': 6.5, 'size_sigma': 1.2,
        'score_base': 5, 'score_sigma': 3,
    },
    'Venture Capital': {
        'weight': 0.25, 'size_mean': 5.0, 'size_sigma': 1.0,
        'score_base': 3, 'score_sigma': 5,  # high variance
    },
    'Growth Equity': {
        'weight': 0.15, 'size_mean': 6.0, 'size_sigma': 1.1,
        'score_base': 4, 'score_sigma': 4,
    },
    'Private Credit': {
        'weight': 0.15, 'size_mean': 6.2, 'size_sigma': 1.0,
        'score_base': 6, 'score_sigma': 2,  # more stable
    },
    'Real Estate': {
        'weight': 0.10, 'size_mean': 5.5, 'size_sigma': 1.2,
        'score_base': 4, 'score_sigma': 3,
    },
}

# Columns of the frame returned by generate_fund_data, in order.
_OUTPUT_COLUMNS = [
    'fund_id',
    'fund_manager',
    'vintage_year',
    'strategy',
    'fund_size_mil',
    'macro_interest_rate_at_launch',
    'public_market_pe_at_launch',
    'is_top_quartile',
]

_MIN_FUND_SIZE_MIL = 50      # 50M floor
_MAX_FUND_SIZE_MIL = 10000   # 10B cap
_OPTIMAL_PE = 19             # P/E at which the valuation term peaks


def generate_fund_data(n_funds=5000):
    """Generate synthetic fund data with realistic patterns.

    Each fund gets a random vintage year (2010-2023), a strategy drawn with
    the weights in ``_STRATEGY_PROFILES``, a log-normally distributed size
    clamped to 50-10,000 (millions), and noisy macro indicators looked up by
    vintage year. A hidden score is computed from those attributes plus
    noise; funds whose score is at or above the 75th percentile of the
    generated batch are labelled ``is_top_quartile = 1``. The score itself
    is not returned.

    Randomness comes from NumPy's global RNG and the module-level ``fake``
    Faker instance, so output is repeatable only to the extent those are
    seeded by the caller/module. ``fund_id`` uses ``uuid.uuid4()``, which is
    not affected by either seed.

    Args:
        n_funds: Number of funds to generate. Must be non-negative.

    Returns:
        pandas.DataFrame with one row per fund and the columns listed in
        ``_OUTPUT_COLUMNS``. For ``n_funds == 0`` an empty frame with those
        columns is returned.

    Raises:
        ValueError: If ``n_funds`` is negative.
    """
    if n_funds < 0:
        raise ValueError(f"n_funds must be non-negative, got {n_funds}")

    vintage_years = list(range(2010, 2024))
    strategies = list(_STRATEGY_PROFILES)
    strategy_weights = [_STRATEGY_PROFILES[name]['weight'] for name in strategies]

    records = []
    # NOTE: the order of the random draws below is deliberate; changing it
    # changes the data produced for a given seed.
    for _ in range(n_funds):
        fund_id = str(uuid.uuid4())
        fund_manager = f"{fake.company()} Partners"
        vintage_year = np.random.choice(vintage_years)
        strategy = np.random.choice(strategies, p=strategy_weights)
        profile = _STRATEGY_PROFILES[strategy]

        # Fund size varies by strategy; clamp to the 50M-10B range.
        fund_size = np.random.lognormal(
            mean=profile['size_mean'], sigma=profile['size_sigma']
        )
        fund_size = max(_MIN_FUND_SIZE_MIL, min(_MAX_FUND_SIZE_MIL, fund_size))

        # Macro indicators: the vintage-year value plus per-fund jitter.
        interest_rate = _INTEREST_RATE_BY_VINTAGE[vintage_year] + np.random.normal(0, 0.1)
        pe_ratio = _PE_RATIO_BY_VINTAGE[vintage_year] + np.random.normal(0, 1.5)

        # Hidden performance score.
        # Interest rate impact (lower is better).
        score = (5 - interest_rate) * 3
        # P/E impact: peaks at _OPTIMAL_PE and falls off linearly either side.
        score += 10 - abs(pe_ratio - _OPTIMAL_PE) * 0.5
        # Fund size impact: linear in log(size), so within the clamp a larger
        # fund always scores higher (there is no interior sweet spot).
        score += (np.log(fund_size) - 5) * 2
        # Strategy impact: fixed offset plus strategy-specific noise.
        score += profile['score_base'] + np.random.normal(0, profile['score_sigma'])
        # General random noise.
        score += np.random.normal(0, 4)

        records.append({
            'fund_id': fund_id,
            'fund_manager': fund_manager,
            'vintage_year': vintage_year,
            'strategy': strategy,
            'fund_size_mil': round(fund_size, 0),
            'macro_interest_rate_at_launch': round(interest_rate, 2),
            'public_market_pe_at_launch': round(pe_ratio, 1),
            # Raw score; only its rank within the batch matters, so it is
            # left unscaled.
            'hidden_score': score,
        })

    if not records:
        # A frame built from an empty list has no columns, so looking up
        # 'hidden_score' would raise KeyError; return the empty schema instead.
        return pd.DataFrame(columns=_OUTPUT_COLUMNS)

    df = pd.DataFrame(records)

    # Funds at or above the 75th percentile of the batch are top quartile.
    threshold = df['hidden_score'].quantile(0.75)
    df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int)

    # Drop the hidden score (it's only for generation).
    return df.drop(columns='hidden_score')
if __name__ == "__main__":
    # Script entry point: generate the default-sized dataset, print a short
    # summary, and write it to CSV.
    from pathlib import Path

    print("Generating synthetic fund data...")
    df = generate_fund_data(n_funds=5000)

    print(f"\nGenerated {len(df)} funds")
    print(f"Top quartile funds: {df['is_top_quartile'].sum()} ({df['is_top_quartile'].mean()*100:.1f}%)")
    print("\nStrategy distribution:")
    print(df['strategy'].value_counts())

    # Save to CSV. to_csv does not create missing directories, so make sure
    # the target folder exists first.
    output_path = 'data/mock_fund_data.csv'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nData saved to {output_path}")