"""
Synthetic Fund Data Generator
Generates realistic mock fund data for the Alpha-Index project
"""

import pandas as pd
import numpy as np
from faker import Faker
import uuid

# Seed the global random sources so repeated runs draw the same sequences:
# np.random supplies the numeric and categorical draws in generate_fund_data,
# and the shared Faker generator supplies the manager company names.
# NOTE: fund IDs are produced by uuid.uuid4() in generate_fund_data, which is
# not governed by either seed, so the fund_id column still differs per run.
np.random.seed(42)
fake = Faker()
Faker.seed(42)

# Per-strategy generation parameters, in sampling order:
#   (sampling weight, lognormal mean of fund size, lognormal sigma of fund size,
#    hidden-score bonus, std-dev of the strategy-specific score noise)
# Buyout and Venture Capital are the most frequently drawn strategies.
_STRATEGY_PROFILES = {
    'Buyout': (0.35, 6.5, 1.2, 5, 3),
    'Venture Capital': (0.25, 5.0, 1.0, 3, 5),   # smaller funds, noisiest score
    'Growth Equity': (0.15, 6.0, 1.1, 4, 4),
    'Private Credit': (0.15, 6.2, 1.0, 6, 2),    # most stable score
    'Real Estate': (0.10, 5.5, 1.2, 4, 3),
}

# Simplified interest-rate level per vintage year (baseline before per-fund noise).
_INTEREST_RATE_BY_VINTAGE = {
    2010: 0.18, 2011: 0.60, 2012: 0.65, 2013: 0.91, 2014: 0.48,
    2015: 0.14, 2016: 0.40, 2017: 1.74, 2018: 1.85, 2019: 2.40,
    2020: 2.90, 2021: 3.50, 2022: 5.25, 2023: 4.06,
}

# Public-market P/E ratio per vintage year (baseline before per-fund noise).
_PE_RATIO_BY_VINTAGE = {
    2010: 15.2, 2011: 19.1, 2012: 16.8, 2013: 19.5, 2014: 19.8,
    2015: 15.4, 2016: 17.3, 2017: 23.4, 2018: 19.3, 2019: 22.1,
    2020: 18.7, 2021: 28.3, 2022: 19.6, 2023: 16.1,
}

# P/E level at which the P/E contribution to the hidden score peaks.
_OPTIMAL_PE = 19

# Columns of the returned frame, in order.
_OUTPUT_COLUMNS = (
    'fund_id',
    'fund_manager',
    'vintage_year',
    'strategy',
    'fund_size_mil',
    'macro_interest_rate_at_launch',
    'public_market_pe_at_launch',
    'is_top_quartile',
)


def generate_fund_data(n_funds=5000):
    """Generate a DataFrame of synthetic funds with a top-quartile label.

    Each fund gets a random vintage year (2010-2023), a weighted-random
    strategy, a lognormal fund size clamped to [50, 10000], and noisy macro
    features looked up by vintage year. A hidden score is computed from those
    features plus noise; funds whose score is at or above the 75th percentile
    of the generated batch are labelled ``is_top_quartile = 1``. The hidden
    score itself is not returned.

    Draws come from the global ``np.random`` state and the module-level
    ``fake`` Faker instance. ``fund_id`` comes from ``uuid.uuid4()``, which is
    not controlled by those seeds, so IDs differ between runs.

    Args:
        n_funds: Number of funds to generate. A value of 0 (or less) yields
            an empty frame instead of failing.

    Returns:
        pandas.DataFrame with the columns listed in ``_OUTPUT_COLUMNS``.
    """
    strategies = list(_STRATEGY_PROFILES)
    strategy_weights = [profile[0] for profile in _STRATEGY_PROFILES.values()]
    vintage_years = list(range(2010, 2024))

    records = []
    for _ in range(n_funds):
        # NOTE: the order of the random draws below is deliberate; changing it
        # changes the dataset produced for a given seed.
        fund_id = str(uuid.uuid4())
        fund_manager = f"{fake.company()} Partners"
        vintage_year = np.random.choice(vintage_years)
        strategy = np.random.choice(strategies, p=strategy_weights)
        _, size_mean, size_sigma, score_bonus, score_sigma = (
            _STRATEGY_PROFILES[str(strategy)]
        )

        # Fund size distribution depends on the strategy; clamp to [50, 10000].
        fund_size = np.random.lognormal(mean=size_mean, sigma=size_sigma)
        fund_size = max(50, min(10000, fund_size))

        interest_rate = _INTEREST_RATE_BY_VINTAGE[vintage_year] + np.random.normal(0, 0.1)
        pe_ratio = _PE_RATIO_BY_VINTAGE[vintage_year] + np.random.normal(0, 1.5)

        # Hidden performance score (used only to derive the label).
        score = 0
        # Lower interest rate -> higher score.
        score += (5 - interest_rate) * 3
        # Score falls off linearly as P/E moves away from the optimum.
        score += 10 - abs(pe_ratio - _OPTIMAL_PE) * 0.5
        # Score grows with the log of fund size (zero contribution at log-size 5).
        score += (np.log(fund_size) - 5) * 2
        # Strategy-specific bonus with strategy-specific variance.
        score += score_bonus + np.random.normal(0, score_sigma)
        # Additional idiosyncratic noise.
        score += np.random.normal(0, 4)

        records.append({
            'fund_id': fund_id,
            'fund_manager': fund_manager,
            'vintage_year': vintage_year,
            'strategy': strategy,
            'fund_size_mil': round(fund_size, 0),
            'macro_interest_rate_at_launch': round(interest_rate, 2),
            'public_market_pe_at_launch': round(pe_ratio, 1),
            'hidden_score': score,
        })

    if not records:
        # Nothing generated: a column-less frame would make the
        # 'hidden_score' lookup below raise KeyError, so return an empty
        # frame that still carries the expected schema.
        return pd.DataFrame(columns=list(_OUTPUT_COLUMNS))

    df = pd.DataFrame(records)

    # Label funds at or above the 75th percentile of the raw hidden score.
    threshold = df['hidden_score'].quantile(0.75)
    df['is_top_quartile'] = (df['hidden_score'] >= threshold).astype(int)

    # The hidden score exists only to derive the label; do not expose it.
    return df.drop('hidden_score', axis=1)

if __name__ == "__main__":
    # Script entry point: generate the dataset, print a short summary,
    # and write it to data/mock_fund_data.csv.
    from pathlib import Path

    print("Generating synthetic fund data...")
    df = generate_fund_data(n_funds=5000)

    top_quartile = df['is_top_quartile']
    print(f"\nGenerated {len(df)} funds")
    print(f"Top quartile funds: {top_quartile.sum()} ({top_quartile.mean()*100:.1f}%)")
    print("\nStrategy distribution:")
    print(df['strategy'].value_counts())

    # Save to CSV. to_csv does not create missing directories, so make sure
    # the target folder exists first.
    output_path = 'data/mock_fund_data.csv'
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"\nData saved to {output_path}")