Spaces:

AAdevloper
/

Merger-and-Acquisition

Sleeping

Merger-and-Acquisition / generate_data.py

AAdevloper

Initial commit: Alpha-Index 100 Gradio app

faebc8b 3 months ago

5 kB

	"""
	Synthetic Fund Data Generator
	Generates realistic mock fund data for the Alpha-Index project
	"""

	import pandas as pd
	import numpy as np
	from faker import Faker
	import uuid

	# Seed the random sources so that repeated runs draw the same values.
	# np.random.seed fixes NumPy's global generator, which generate_fund_data
	# uses for every np.random.* draw (strategy, vintage year, sizes, noise).
	np.random.seed(42)
	# Module-level Faker instance; generate_fund_data calls fake.company() on it.
	fake = Faker()
	# Seeding is done on the Faker class, not on the `fake` instance;
	# presumably this also covers `fake` -- confirm against the Faker docs.
	Faker.seed(42)
	# NOTE(review): uuid.uuid4() (used for fund_id in generate_fund_data) is not
	# driven by either seed above, so fund_id values still differ between runs.

	def generate_fund_data(n_funds=5000):
	    """Generate synthetic fund data with a binary top-quartile label.

	    Each fund gets a random strategy, vintage year, size and launch-time macro
	    readings. A hidden score is built from those features plus noise; funds
	    whose score is at or above the 75th percentile of the generated batch are
	    labelled ``is_top_quartile = 1``. The hidden score itself is dropped
	    before returning.

	    Args:
	        n_funds: Number of funds to generate. Zero or a negative value yields
	            an empty DataFrame that still carries the full output schema.

	    Returns:
	        pandas.DataFrame with one row per fund and the columns ``fund_id``,
	        ``fund_manager``, ``vintage_year``, ``strategy``, ``fund_size_mil``,
	        ``macro_interest_rate_at_launch``, ``public_market_pe_at_launch`` and
	        ``is_top_quartile``.

	    Note:
	        Draws come from NumPy's global generator and the module-level ``fake``
	        object, so results depend on how those were seeded by the caller.
	        ``fund_id`` comes from ``uuid.uuid4()`` and is not affected by seeds.
	    """
	    output_columns = [
	        'fund_id',
	        'fund_manager',
	        'vintage_year',
	        'strategy',
	        'fund_size_mil',
	        'macro_interest_rate_at_launch',
	        'public_market_pe_at_launch',
	        'is_top_quartile',
	    ]

	    # Define realistic ranges and categories
	    strategies = ['Buyout', 'Venture Capital', 'Growth Equity', 'Private Credit', 'Real Estate']
	    strategy_weights = [0.35, 0.25, 0.15, 0.15, 0.10]  # Buyout and VC are most common

	    # Per-strategy parameters:
	    #   (lognormal mean, lognormal sigma, score bonus, score noise std-dev)
	    # The first pair drives fund size (in $M); the second pair drives the
	    # strategy contribution to the hidden score.
	    strategy_params = {
	        'Buyout': (6.5, 1.2, 5, 3),            # Larger funds
	        'Venture Capital': (5.0, 1.0, 3, 5),   # Smaller funds, high variance
	        'Growth Equity': (6.0, 1.1, 4, 4),
	        'Private Credit': (6.2, 1.0, 6, 2),    # More stable
	        'Real Estate': (5.5, 1.2, 4, 3),
	    }

	    vintage_years = list(range(2010, 2024))

	    # Interest rate by vintage year (simplified)
	    interest_rates = {
	        2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48,
	        2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40,
	        2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06
	    }

	    # Public market P/E ratio by vintage year
	    pe_ratios = {
	        2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8,
	        2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1,
	        2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1
	    }

	    optimal_pe = 19  # P/E level at which the P/E score term peaks

	    records = []
	    # NOTE: the order of the random draws below is kept fixed so that a seeded
	    # run keeps producing the same sequence of values.
	    for _ in range(n_funds):
	        fund_id = str(uuid.uuid4())
	        fund_manager = f"{fake.company()} Partners"
	        vintage_year = np.random.choice(vintage_years)
	        strategy = np.random.choice(strategies, p=strategy_weights)
	        size_mean, size_sigma, score_bonus, score_sd = strategy_params[strategy]

	        # Fund size varies by strategy; cap between 50M and 10B
	        fund_size = np.random.lognormal(mean=size_mean, sigma=size_sigma)
	        fund_size = max(50, min(10000, fund_size))

	        # Launch-time macro readings: yearly base value plus small jitter
	        interest_rate = interest_rates[vintage_year] + np.random.normal(0, 0.1)
	        pe_ratio = pe_ratios[vintage_year] + np.random.normal(0, 1.5)

	        # Hidden performance score, accumulated term by term
	        score = 0

	        # Interest rate impact (lower is better)
	        score += (5 - interest_rate) * 3

	        # PE ratio impact (best at optimal_pe, penalised linearly away from it)
	        score += 10 - abs(pe_ratio - optimal_pe) * 0.5

	        # Fund size impact: increases with log(size); there is no peak other
	        # than the 10B cap applied above
	        score += (np.log(fund_size) - 5) * 2

	        # Strategy impact: fixed bonus plus strategy-specific noise
	        score += score_bonus + np.random.normal(0, score_sd)

	        # Add some random noise
	        score += np.random.normal(0, 4)

	        # The raw score is stored as-is: only its rank within the batch is
	        # used for the label, so no rescaling is applied.
	        records.append({
	            'fund_id': fund_id,
	            'fund_manager': fund_manager,
	            'vintage_year': vintage_year,
	            'strategy': strategy,
	            'fund_size_mil': round(fund_size, 0),
	            'macro_interest_rate_at_launch': round(interest_rate, 2),
	            'public_market_pe_at_launch': round(pe_ratio, 1),
	            'hidden_score': score
	        })

	    # With no rows, pd.DataFrame(records) would have no columns and the
	    # 'hidden_score' lookup below would raise KeyError; return the schema only.
	    if not records:
	        return pd.DataFrame(columns=output_columns)

	    df = pd.DataFrame(records)

	    # Top quartile = score at or above the batch's 75th percentile
	    threshold = df['hidden_score'].quantile(0.75)
	    df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int)

	    # Drop the hidden score (it's only for generation)
	    return df.drop('hidden_score', axis=1)

	if __name__ == "__main__":
	    # Script entry point: generate the dataset, print a short summary and
	    # write it to a CSV file.
	    from pathlib import Path  # local import: only needed when run as a script

	    print("Generating synthetic fund data...")
	    df = generate_fund_data(n_funds=5000)

	    print(f"\nGenerated {len(df)} funds")
	    print(f"Top quartile funds: {df['is_top_quartile'].sum()} ({df['is_top_quartile'].mean()*100:.1f}%)")
	    print("\nStrategy distribution:")
	    print(df['strategy'].value_counts())

	    # Save to CSV. The target directory is created first because to_csv
	    # fails when the parent directory of the path does not exist.
	    output_path = 'data/mock_fund_data.csv'
	    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	    df.to_csv(output_path, index=False)
	    print(f"\nData saved to {output_path}")