Spaces:
Sleeping
Sleeping
| """ | |
| Synthetic Fund Data Generator | |
| Generates realistic mock fund data for the Alpha-Index project | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from faker import Faker | |
| import uuid | |
# One shared seed value drives both random sources used by this module:
# NumPy's global generator (numeric draws) and Faker (company names).
_SEED = 42

np.random.seed(_SEED)
fake = Faker()
Faker.seed(_SEED)
def generate_fund_data(n_funds: int = 5000) -> pd.DataFrame:
    """Generate synthetic fund records labelled with a top-quartile flag.

    Each fund gets a random vintage year, strategy and size, plus noisy
    macro indicators looked up by vintage year. A hidden score built from
    those features (plus noise) ranks the funds; funds at or above the 75th
    percentile of that score are flagged, and the score itself is dropped
    from the returned frame.

    Numeric draws come from NumPy's global generator and manager names from
    the module-level ``fake`` Faker instance, so the output is repeatable
    only if both were seeded before the call. Fund IDs are derived from the
    NumPy state as well, so they repeat under the same seed.

    Args:
        n_funds: Number of fund rows to generate. Zero or a negative value
            yields an empty frame that still carries every output column.

    Returns:
        DataFrame with one row per fund and the columns ``fund_id``,
        ``fund_manager``, ``vintage_year``, ``strategy``, ``fund_size_mil``,
        ``macro_interest_rate_at_launch``, ``public_market_pe_at_launch``
        and ``is_top_quartile`` (1 for top-quartile funds, otherwise 0).
    """
    feature_columns = [
        'fund_id',
        'fund_manager',
        'vintage_year',
        'strategy',
        'fund_size_mil',
        'macro_interest_rate_at_launch',
        'public_market_pe_at_launch',
    ]
    if n_funds <= 0:
        # Without rows there is no 'hidden_score' column to rank on, so
        # return the output schema directly instead of failing on the lookup.
        return pd.DataFrame(columns=feature_columns + ['is_top_quartile'])

    # strategy -> (sampling weight, lognormal mean of size, lognormal sigma
    #              of size, score bonus, std-dev of strategy-specific noise)
    # Buyout and VC are the most common; VC and Growth carry the widest
    # score noise, Private Credit the narrowest.
    strategy_profiles = {
        'Buyout': (0.35, 6.5, 1.2, 5, 3),
        'Venture Capital': (0.25, 5.0, 1.0, 3, 5),
        'Growth Equity': (0.15, 6.0, 1.1, 4, 4),
        'Private Credit': (0.15, 6.2, 1.0, 6, 2),
        'Real Estate': (0.10, 5.5, 1.2, 4, 3),
    }
    strategies = list(strategy_profiles)
    strategy_weights = [profile[0] for profile in strategy_profiles.values()]

    vintage_years = list(range(2010, 2024))

    # Simplified interest-rate level per vintage year (jittered per fund).
    interest_rates = {
        2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48,
        2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40,
        2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06,
    }

    # Public-market P/E level per vintage year (jittered per fund).
    pe_ratios = {
        2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8,
        2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1,
        2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1,
    }
    optimal_pe = 19  # score penalty grows with distance from this P/E

    # uuid.uuid4() reads OS entropy and ignores any seed, which made fund_id
    # differ on every run. Drawing the ID bytes from a *clone* of the global
    # NumPy state makes them seed-dependent without consuming values from the
    # main stream, so the numeric columns are unaffected by ID generation.
    id_rng = np.random.RandomState()
    id_rng.set_state(np.random.get_state())

    data = []
    for _ in range(n_funds):
        fund_id = str(uuid.UUID(bytes=id_rng.bytes(16), version=4))
        fund_manager = f"{fake.company()} Partners"

        vintage_year = np.random.choice(vintage_years)
        strategy = np.random.choice(strategies, p=strategy_weights)
        _, size_mean, size_sigma, score_bonus, score_sigma = strategy_profiles[strategy]

        # Fund size (in millions) depends on strategy; clamp to 50M..10B.
        fund_size = np.random.lognormal(mean=size_mean, sigma=size_sigma)
        fund_size = max(50, min(10000, fund_size))

        interest_rate = interest_rates[vintage_year] + np.random.normal(0, 0.1)
        pe_ratio = pe_ratios[vintage_year] + np.random.normal(0, 1.5)

        # Hidden performance score. Only its rank matters (it feeds a
        # quantile cut below), so it is left on its raw scale.
        score = 0
        # Lower interest rate -> higher score.
        score += (5 - interest_rate) * 3
        # P/E close to the optimum -> higher score.
        score += 10 - abs(pe_ratio - optimal_pe) * 0.5
        # Larger funds score higher, linearly in log size (so with
        # diminishing returns in absolute size).
        score += (np.log(fund_size) - 5) * 2
        # Strategy bonus plus strategy-specific noise.
        score += score_bonus + np.random.normal(0, score_sigma)
        # General noise shared by all strategies.
        score += np.random.normal(0, 4)

        data.append({
            'fund_id': fund_id,
            'fund_manager': fund_manager,
            'vintage_year': vintage_year,
            'strategy': strategy,
            'fund_size_mil': round(fund_size, 0),
            'macro_interest_rate_at_launch': round(interest_rate, 2),
            'public_market_pe_at_launch': round(pe_ratio, 1),
            'hidden_score': score,
        })

    df = pd.DataFrame(data)

    # Funds at or above the 75th percentile of the hidden score are labelled
    # top quartile.
    threshold = df['hidden_score'].quantile(0.75)
    df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int)

    # The score exists only to derive the label; it is not part of the output.
    return df.drop(columns='hidden_score')
def main() -> None:
    """Generate the default 5000-fund dataset, print a summary, save a CSV.

    The CSV is written to ``data/mock_fund_data.csv`` relative to the current
    working directory; the ``data`` directory is created if it is missing.
    """
    # Local import keeps this entry point self-contained.
    from pathlib import Path

    print("Generating synthetic fund data...")
    df = generate_fund_data(n_funds=5000)

    top_quartile = df['is_top_quartile']
    print(f"\nGenerated {len(df)} funds")
    print(f"Top quartile funds: {top_quartile.sum()} ({top_quartile.mean()*100:.1f}%)")
    print("\nStrategy distribution:")
    print(df['strategy'].value_counts())

    # Save to CSV. to_csv does not create missing parent directories, so
    # make sure the target directory exists first.
    output_path = 'data/mock_fund_data.csv'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nData saved to {output_path}")


if __name__ == "__main__":
    main()