""" Pharmaceutical Analytics Seed Data This module contains functions to generate initial seed data for the pharmaceutical analytics database. It creates a minimal set of data entries that serve as the foundation for the larger synthetic dataset. """ import sqlite3 import pandas as pd from datetime import datetime, timedelta def create_seed_regions(conn: sqlite3.Connection) -> None: """Create seed data for regions""" regions = pd.DataFrame({ 'region_id': ['NE', 'SE', 'MW', 'SW', 'W'], 'region_name': ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West'], 'country': ['USA'] * 5, 'division': ['East', 'East', 'Central', 'Central', 'West'], 'population': [55000000, 62000000, 70000000, 42000000, 65000000] }) regions.to_sql('regions', conn, if_exists='replace', index=False) print(f"Created {len(regions)} seed regions") def create_seed_products(conn: sqlite3.Connection) -> None: """Create seed data for products""" products = pd.DataFrame({ 'product_id': ['DRX', 'PRX', 'TRX', 'ZRX', 'NRX'], 'product_name': ['DrugX', 'PainRex', 'TranquiX', 'ZymoRex', 'NeuroRex'], 'therapeutic_area': ['Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'], 'molecule': ['moleculeX', 'moleculeP', 'moleculeT', 'moleculeZ', 'moleculeN'], 'launch_date': ['2020-01-01', '2018-06-15', '2021-03-10', '2019-11-05', '2022-01-20'], 'status': ['Active', 'Active', 'Active', 'Active', 'Active'], 'list_price': [299.99, 199.99, 499.99, 399.99, 599.99] }) products.to_sql('products', conn, if_exists='replace', index=False) print(f"Created {len(products)} seed products") def create_seed_competitor_products(conn: sqlite3.Connection) -> None: """Create seed data for competitor products""" competitor_products = pd.DataFrame({ 'competitor_product_id': ['CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6'], 'product_name': ['CompDrug1', 'CompDrug2', 'CompDrug3', 'CompDrug4', 'CompDrug5', 'CompDrug6'], 'manufacturer': ['CompPharma', 'MedCorp', 'BioSolutions', 'GeneriCo', 'PharmGiant', 'MoleCorp'], 'therapeutic_area': ['Cardiology', 'Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'], 'molecule': ['moleculeC1', 'moleculeC2', 'moleculeC3', 'moleculeC4', 'moleculeC5', 'moleculeC6'], 'launch_date': ['2019-05-10', '2023-01-15', '2017-11-20', '2020-08-05', '2021-03-15', '2022-07-10'], 'list_price': [279.99, 259.99, 189.99, 459.99, 379.99, 549.99], 'competing_with_product_id': ['DRX', 'DRX', 'PRX', 'TRX', 'ZRX', 'NRX'] }) competitor_products.to_sql('competitor_products', conn, if_exists='replace', index=False) print(f"Created {len(competitor_products)} seed competitor products") def create_seed_territories(conn: sqlite3.Connection) -> None: """Create seed data for territories""" territory_mapping = { 'NE': ['NE-NYC', 'NE-BOS', 'NE-PHL', 'NE-DCA'], 'SE': ['SE-ATL', 'SE-MIA', 'SE-CLT', 'SE-NSH'], 'MW': ['MW-CHI', 'MW-DET', 'MW-MIN', 'MW-STL'], 'SW': ['SW-DAL', 'SW-HOU', 'SW-PHX', 'SW-DEN'], 'W': ['W-LAX', 'W-SFO', 'W-SEA', 'W-PDX'] } territory_names = { 'NE-NYC': 'New York Metro', 'NE-BOS': 'New England', 'NE-PHL': 'Philadelphia', 'NE-DCA': 'DC-Baltimore', 'SE-ATL': 'Atlanta', 'SE-MIA': 'Florida', 'SE-CLT': 'Carolinas', 'SE-NSH': 'Tennessee Valley', 'MW-CHI': 'Chicago', 'MW-DET': 'Great Lakes', 'MW-MIN': 'Upper Midwest', 'MW-STL': 'Missouri Valley', 'SW-DAL': 'North Texas', 'SW-HOU': 'Gulf Coast', 'SW-PHX': 'Southwest Desert', 'SW-DEN': 'Mountain', 'W-LAX': 'Southern California', 'W-SFO': 'Northern California', 'W-SEA': 'Pacific Northwest', 'W-PDX': 'Northwest' } territories = [] sales_reps = [f'REP{i:03d}' for i in range(1, 41)] rep_idx = 0 for region_id, territory_ids in territory_mapping.items(): for territory_id in territory_ids: territories.append({ 'territory_id': territory_id, 'territory_name': territory_names[territory_id], 'region_id': region_id, 'sales_rep_id': sales_reps[rep_idx] }) rep_idx += 1 territories_df = pd.DataFrame(territories) territories_df.to_sql('territories', conn, if_exists='replace', index=False) print(f"Created {len(territories_df)} seed territories") def create_seed_market_events(conn: sqlite3.Connection) -> None: """Create seed data for market events""" today = datetime.now() events = [ { 'event_id': 1, 'event_date': (today - timedelta(days=365)).strftime('%Y-%m-%d'), 'event_type': 'FDA Approval', 'description': 'New indication approved for DrugX', 'affected_products': 'DRX', 'affected_regions': 'ALL', 'impact_score': 0.15 }, { 'event_id': 2, 'event_date': (today - timedelta(days=180)).strftime('%Y-%m-%d'), 'event_type': 'Guideline Change', 'description': 'Treatment guidelines updated favoring DrugX approach', 'affected_products': 'DRX', 'affected_regions': 'ALL', 'impact_score': 0.10 }, { 'event_id': 3, 'event_date': (today - timedelta(days=90)).strftime('%Y-%m-%d'), 'event_type': 'Safety Alert', 'description': 'Minor safety concern raised for competing products', 'affected_products': 'CP1,CP3', 'affected_regions': 'ALL', 'impact_score': 0.05 }, { 'event_id': 4, 'event_date': (today - timedelta(days=45)).strftime('%Y-%m-%d'), 'event_type': 'Competitor Launch', 'description': 'CompDrug2 launched by MedCorp targeting the same indication as DrugX', 'affected_products': 'DRX', 'affected_regions': 'NE', 'impact_score': -0.35 }, { 'event_id': 5, 'event_date': (today - timedelta(days=60)).strftime('%Y-%m-%d'), 'event_type': 'Supply Chain Disruption', 'description': 'Supply issues affecting DrugX availability in Northeast', 'affected_products': 'DRX', 'affected_regions': 'NE', 'impact_score': -0.25 } ] events_df = pd.DataFrame(events) events_df.to_sql('market_events', conn, if_exists='replace', index=False) print(f"Created {len(events_df)} seed market events") def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None: """Create all seed data in the database""" conn = sqlite3.connect(db_path) create_seed_regions(conn) create_seed_products(conn) create_seed_competitor_products(conn) create_seed_territories(conn) create_seed_market_events(conn) conn.commit() conn.close() print(f"All seed data created in {db_path}") if __name__ == "__main__": # Create seed data when run directly create_seed_data()