""" Synthetic Fund Data Generator Generates realistic mock fund data for the Alpha-Index project """ import pandas as pd import numpy as np from faker import Faker import uuid # Set random seed for reproducibility np.random.seed(42) fake = Faker() Faker.seed(42) def generate_fund_data(n_funds=5000): """Generate synthetic fund data with realistic patterns""" data = [] # Define realistic ranges and categories strategies = ['Buyout', 'Venture Capital', 'Growth Equity', 'Private Credit', 'Real Estate'] strategy_weights = [0.35, 0.25, 0.15, 0.15, 0.10] # Buyout and VC are most common vintage_years = list(range(2010, 2024)) # Historical interest rates (simplified) interest_rates = { 2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48, 2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40, 2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06 } # Public market volatility (P/E ratios) pe_ratios = { 2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8, 2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1, 2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1 } for i in range(n_funds): fund_id = str(uuid.uuid4()) fund_manager = f"{fake.company()} Partners" vintage_year = np.random.choice(vintage_years) strategy = np.random.choice(strategies, p=strategy_weights) # Fund size varies by strategy if strategy == 'Buyout': fund_size = np.random.lognormal(mean=6.5, sigma=1.2) # Larger funds elif strategy == 'Venture Capital': fund_size = np.random.lognormal(mean=5.0, sigma=1.0) # Smaller funds elif strategy == 'Growth Equity': fund_size = np.random.lognormal(mean=6.0, sigma=1.1) elif strategy == 'Private Credit': fund_size = np.random.lognormal(mean=6.2, sigma=1.0) else: # Real Estate fund_size = np.random.lognormal(mean=5.5, sigma=1.2) fund_size = max(50, min(10000, fund_size)) # Cap between 50M and 10B interest_rate = interest_rates[vintage_year] + np.random.normal(0, 0.1) pe_ratio = pe_ratios[vintage_year] + np.random.normal(0, 1.5) # Create hidden performance score based on realistic patterns # Lower interest rates -> better returns (liquidity) # Moderate PE ratios -> better (not too hot, not too cold) # Larger funds -> slightly better (but diminishing returns) # Strategy matters: VC and Growth have higher variance score = 0 # Interest rate impact (lower is better) score += (5 - interest_rate) * 3 # PE ratio impact (optimal around 18-20) optimal_pe = 19 score += 10 - abs(pe_ratio - optimal_pe) * 0.5 # Fund size impact (log scale, sweet spot around 500-1000M) log_size = np.log(fund_size) score += (log_size - 5) * 2 # Strategy impact if strategy == 'Buyout': score += 5 + np.random.normal(0, 3) elif strategy == 'Venture Capital': score += 3 + np.random.normal(0, 5) # High variance elif strategy == 'Growth Equity': score += 4 + np.random.normal(0, 4) elif strategy == 'Private Credit': score += 6 + np.random.normal(0, 2) # More stable else: # Real Estate score += 4 + np.random.normal(0, 3) # Add some random noise score += np.random.normal(0, 4) # Normalize to 0-100 and determine quartile # Top 25% are top quartile score_normalized = score data.append({ 'fund_id': fund_id, 'fund_manager': fund_manager, 'vintage_year': vintage_year, 'strategy': strategy, 'fund_size_mil': round(fund_size, 0), 'macro_interest_rate_at_launch': round(interest_rate, 2), 'public_market_pe_at_launch': round(pe_ratio, 1), 'hidden_score': score_normalized }) df = pd.DataFrame(data) # Calculate top quartile threshold threshold = df['hidden_score'].quantile(0.75) df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int) # Drop the hidden score (it's only for generation) df_final = df.drop('hidden_score', axis=1) return df_final if __name__ == "__main__": print("Generating synthetic fund data...") df = generate_fund_data(n_funds=5000) print(f"\nGenerated {len(df)} funds") print(f"Top quartile funds: {df['is_top_quartile'].sum()} ({df['is_top_quartile'].mean()*100:.1f}%)") print(f"\nStrategy distribution:") print(df['strategy'].value_counts()) # Save to CSV output_path = 'data/mock_fund_data.csv' df.to_csv(output_path, index=False) print(f"\nData saved to {output_path}")