Spaces:
Runtime error
Runtime error
| """ | |
| Pharmaceutical Analytics Seed Data | |
| This module contains functions to generate initial seed data for the pharmaceutical | |
| analytics database. It creates a minimal set of data entries that serve as the | |
| foundation for the larger synthetic dataset. | |
| """ | |
| import sqlite3 | |
| import pandas as pd | |
| from datetime import datetime, timedelta | |
| def create_seed_regions(conn: sqlite3.Connection) -> None: | |
| """Create seed data for regions""" | |
| regions = pd.DataFrame({ | |
| 'region_id': ['NE', 'SE', 'MW', 'SW', 'W'], | |
| 'region_name': ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West'], | |
| 'country': ['USA'] * 5, | |
| 'division': ['East', 'East', 'Central', 'Central', 'West'], | |
| 'population': [55000000, 62000000, 70000000, 42000000, 65000000] | |
| }) | |
| regions.to_sql('regions', conn, if_exists='replace', index=False) | |
| print(f"Created {len(regions)} seed regions") | |
| def create_seed_products(conn: sqlite3.Connection) -> None: | |
| """Create seed data for products""" | |
| products = pd.DataFrame({ | |
| 'product_id': ['DRX', 'PRX', 'TRX', 'ZRX', 'NRX'], | |
| 'product_name': ['DrugX', 'PainRex', 'TranquiX', 'ZymoRex', 'NeuroRex'], | |
| 'therapeutic_area': ['Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'], | |
| 'molecule': ['moleculeX', 'moleculeP', 'moleculeT', 'moleculeZ', 'moleculeN'], | |
| 'launch_date': ['2020-01-01', '2018-06-15', '2021-03-10', '2019-11-05', '2022-01-20'], | |
| 'status': ['Active', 'Active', 'Active', 'Active', 'Active'], | |
| 'list_price': [299.99, 199.99, 499.99, 399.99, 599.99] | |
| }) | |
| products.to_sql('products', conn, if_exists='replace', index=False) | |
| print(f"Created {len(products)} seed products") | |
| def create_seed_competitor_products(conn: sqlite3.Connection) -> None: | |
| """Create seed data for competitor products""" | |
| competitor_products = pd.DataFrame({ | |
| 'competitor_product_id': ['CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6'], | |
| 'product_name': ['CompDrug1', 'CompDrug2', 'CompDrug3', 'CompDrug4', 'CompDrug5', 'CompDrug6'], | |
| 'manufacturer': ['CompPharma', 'MedCorp', 'BioSolutions', 'GeneriCo', 'PharmGiant', 'MoleCorp'], | |
| 'therapeutic_area': ['Cardiology', 'Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'], | |
| 'molecule': ['moleculeC1', 'moleculeC2', 'moleculeC3', 'moleculeC4', 'moleculeC5', 'moleculeC6'], | |
| 'launch_date': ['2019-05-10', '2023-01-15', '2017-11-20', '2020-08-05', '2021-03-15', '2022-07-10'], | |
| 'list_price': [279.99, 259.99, 189.99, 459.99, 379.99, 549.99], | |
| 'competing_with_product_id': ['DRX', 'DRX', 'PRX', 'TRX', 'ZRX', 'NRX'] | |
| }) | |
| competitor_products.to_sql('competitor_products', conn, if_exists='replace', index=False) | |
| print(f"Created {len(competitor_products)} seed competitor products") | |
| def create_seed_territories(conn: sqlite3.Connection) -> None: | |
| """Create seed data for territories""" | |
| territory_mapping = { | |
| 'NE': ['NE-NYC', 'NE-BOS', 'NE-PHL', 'NE-DCA'], | |
| 'SE': ['SE-ATL', 'SE-MIA', 'SE-CLT', 'SE-NSH'], | |
| 'MW': ['MW-CHI', 'MW-DET', 'MW-MIN', 'MW-STL'], | |
| 'SW': ['SW-DAL', 'SW-HOU', 'SW-PHX', 'SW-DEN'], | |
| 'W': ['W-LAX', 'W-SFO', 'W-SEA', 'W-PDX'] | |
| } | |
| territory_names = { | |
| 'NE-NYC': 'New York Metro', 'NE-BOS': 'New England', 'NE-PHL': 'Philadelphia', 'NE-DCA': 'DC-Baltimore', | |
| 'SE-ATL': 'Atlanta', 'SE-MIA': 'Florida', 'SE-CLT': 'Carolinas', 'SE-NSH': 'Tennessee Valley', | |
| 'MW-CHI': 'Chicago', 'MW-DET': 'Great Lakes', 'MW-MIN': 'Upper Midwest', 'MW-STL': 'Missouri Valley', | |
| 'SW-DAL': 'North Texas', 'SW-HOU': 'Gulf Coast', 'SW-PHX': 'Southwest Desert', 'SW-DEN': 'Mountain', | |
| 'W-LAX': 'Southern California', 'W-SFO': 'Northern California', 'W-SEA': 'Pacific Northwest', 'W-PDX': 'Northwest' | |
| } | |
| territories = [] | |
| sales_reps = [f'REP{i:03d}' for i in range(1, 41)] | |
| rep_idx = 0 | |
| for region_id, territory_ids in territory_mapping.items(): | |
| for territory_id in territory_ids: | |
| territories.append({ | |
| 'territory_id': territory_id, | |
| 'territory_name': territory_names[territory_id], | |
| 'region_id': region_id, | |
| 'sales_rep_id': sales_reps[rep_idx] | |
| }) | |
| rep_idx += 1 | |
| territories_df = pd.DataFrame(territories) | |
| territories_df.to_sql('territories', conn, if_exists='replace', index=False) | |
| print(f"Created {len(territories_df)} seed territories") | |
| def create_seed_market_events(conn: sqlite3.Connection) -> None: | |
| """Create seed data for market events""" | |
| today = datetime.now() | |
| events = [ | |
| { | |
| 'event_id': 1, | |
| 'event_date': (today - timedelta(days=365)).strftime('%Y-%m-%d'), | |
| 'event_type': 'FDA Approval', | |
| 'description': 'New indication approved for DrugX', | |
| 'affected_products': 'DRX', | |
| 'affected_regions': 'ALL', | |
| 'impact_score': 0.15 | |
| }, | |
| { | |
| 'event_id': 2, | |
| 'event_date': (today - timedelta(days=180)).strftime('%Y-%m-%d'), | |
| 'event_type': 'Guideline Change', | |
| 'description': 'Treatment guidelines updated favoring DrugX approach', | |
| 'affected_products': 'DRX', | |
| 'affected_regions': 'ALL', | |
| 'impact_score': 0.10 | |
| }, | |
| { | |
| 'event_id': 3, | |
| 'event_date': (today - timedelta(days=90)).strftime('%Y-%m-%d'), | |
| 'event_type': 'Safety Alert', | |
| 'description': 'Minor safety concern raised for competing products', | |
| 'affected_products': 'CP1,CP3', | |
| 'affected_regions': 'ALL', | |
| 'impact_score': 0.05 | |
| }, | |
| { | |
| 'event_id': 4, | |
| 'event_date': (today - timedelta(days=45)).strftime('%Y-%m-%d'), | |
| 'event_type': 'Competitor Launch', | |
| 'description': 'CompDrug2 launched by MedCorp targeting the same indication as DrugX', | |
| 'affected_products': 'DRX', | |
| 'affected_regions': 'NE', | |
| 'impact_score': -0.35 | |
| }, | |
| { | |
| 'event_id': 5, | |
| 'event_date': (today - timedelta(days=60)).strftime('%Y-%m-%d'), | |
| 'event_type': 'Supply Chain Disruption', | |
| 'description': 'Supply issues affecting DrugX availability in Northeast', | |
| 'affected_products': 'DRX', | |
| 'affected_regions': 'NE', | |
| 'impact_score': -0.25 | |
| } | |
| ] | |
| events_df = pd.DataFrame(events) | |
| events_df.to_sql('market_events', conn, if_exists='replace', index=False) | |
| print(f"Created {len(events_df)} seed market events") | |
| def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None: | |
| """Create all seed data in the database""" | |
| conn = sqlite3.connect(db_path) | |
| create_seed_regions(conn) | |
| create_seed_products(conn) | |
| create_seed_competitor_products(conn) | |
| create_seed_territories(conn) | |
| create_seed_market_events(conn) | |
| conn.commit() | |
| conn.close() | |
| print(f"All seed data created in {db_path}") | |
| if __name__ == "__main__": | |
| # Create seed data when run directly | |
| create_seed_data() |