Spaces:
Runtime error
Runtime error
File size: 7,208 Bytes
3e9cac4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
"""
Pharmaceutical Analytics Seed Data
This module contains functions to generate initial seed data for the pharmaceutical
analytics database. It creates a minimal set of data entries that serve as the
foundation for the larger synthetic dataset.
"""
import sqlite3
import pandas as pd
from datetime import datetime, timedelta
def create_seed_regions(conn: sqlite3.Connection) -> None:
"""Create seed data for regions"""
regions = pd.DataFrame({
'region_id': ['NE', 'SE', 'MW', 'SW', 'W'],
'region_name': ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West'],
'country': ['USA'] * 5,
'division': ['East', 'East', 'Central', 'Central', 'West'],
'population': [55000000, 62000000, 70000000, 42000000, 65000000]
})
regions.to_sql('regions', conn, if_exists='replace', index=False)
print(f"Created {len(regions)} seed regions")
def create_seed_products(conn: sqlite3.Connection) -> None:
"""Create seed data for products"""
products = pd.DataFrame({
'product_id': ['DRX', 'PRX', 'TRX', 'ZRX', 'NRX'],
'product_name': ['DrugX', 'PainRex', 'TranquiX', 'ZymoRex', 'NeuroRex'],
'therapeutic_area': ['Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'],
'molecule': ['moleculeX', 'moleculeP', 'moleculeT', 'moleculeZ', 'moleculeN'],
'launch_date': ['2020-01-01', '2018-06-15', '2021-03-10', '2019-11-05', '2022-01-20'],
'status': ['Active', 'Active', 'Active', 'Active', 'Active'],
'list_price': [299.99, 199.99, 499.99, 399.99, 599.99]
})
products.to_sql('products', conn, if_exists='replace', index=False)
print(f"Created {len(products)} seed products")
def create_seed_competitor_products(conn: sqlite3.Connection) -> None:
"""Create seed data for competitor products"""
competitor_products = pd.DataFrame({
'competitor_product_id': ['CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6'],
'product_name': ['CompDrug1', 'CompDrug2', 'CompDrug3', 'CompDrug4', 'CompDrug5', 'CompDrug6'],
'manufacturer': ['CompPharma', 'MedCorp', 'BioSolutions', 'GeneriCo', 'PharmGiant', 'MoleCorp'],
'therapeutic_area': ['Cardiology', 'Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'],
'molecule': ['moleculeC1', 'moleculeC2', 'moleculeC3', 'moleculeC4', 'moleculeC5', 'moleculeC6'],
'launch_date': ['2019-05-10', '2023-01-15', '2017-11-20', '2020-08-05', '2021-03-15', '2022-07-10'],
'list_price': [279.99, 259.99, 189.99, 459.99, 379.99, 549.99],
'competing_with_product_id': ['DRX', 'DRX', 'PRX', 'TRX', 'ZRX', 'NRX']
})
competitor_products.to_sql('competitor_products', conn, if_exists='replace', index=False)
print(f"Created {len(competitor_products)} seed competitor products")
def create_seed_territories(conn: sqlite3.Connection) -> None:
"""Create seed data for territories"""
territory_mapping = {
'NE': ['NE-NYC', 'NE-BOS', 'NE-PHL', 'NE-DCA'],
'SE': ['SE-ATL', 'SE-MIA', 'SE-CLT', 'SE-NSH'],
'MW': ['MW-CHI', 'MW-DET', 'MW-MIN', 'MW-STL'],
'SW': ['SW-DAL', 'SW-HOU', 'SW-PHX', 'SW-DEN'],
'W': ['W-LAX', 'W-SFO', 'W-SEA', 'W-PDX']
}
territory_names = {
'NE-NYC': 'New York Metro', 'NE-BOS': 'New England', 'NE-PHL': 'Philadelphia', 'NE-DCA': 'DC-Baltimore',
'SE-ATL': 'Atlanta', 'SE-MIA': 'Florida', 'SE-CLT': 'Carolinas', 'SE-NSH': 'Tennessee Valley',
'MW-CHI': 'Chicago', 'MW-DET': 'Great Lakes', 'MW-MIN': 'Upper Midwest', 'MW-STL': 'Missouri Valley',
'SW-DAL': 'North Texas', 'SW-HOU': 'Gulf Coast', 'SW-PHX': 'Southwest Desert', 'SW-DEN': 'Mountain',
'W-LAX': 'Southern California', 'W-SFO': 'Northern California', 'W-SEA': 'Pacific Northwest', 'W-PDX': 'Northwest'
}
territories = []
sales_reps = [f'REP{i:03d}' for i in range(1, 41)]
rep_idx = 0
for region_id, territory_ids in territory_mapping.items():
for territory_id in territory_ids:
territories.append({
'territory_id': territory_id,
'territory_name': territory_names[territory_id],
'region_id': region_id,
'sales_rep_id': sales_reps[rep_idx]
})
rep_idx += 1
territories_df = pd.DataFrame(territories)
territories_df.to_sql('territories', conn, if_exists='replace', index=False)
print(f"Created {len(territories_df)} seed territories")
def create_seed_market_events(conn: sqlite3.Connection) -> None:
"""Create seed data for market events"""
today = datetime.now()
events = [
{
'event_id': 1,
'event_date': (today - timedelta(days=365)).strftime('%Y-%m-%d'),
'event_type': 'FDA Approval',
'description': 'New indication approved for DrugX',
'affected_products': 'DRX',
'affected_regions': 'ALL',
'impact_score': 0.15
},
{
'event_id': 2,
'event_date': (today - timedelta(days=180)).strftime('%Y-%m-%d'),
'event_type': 'Guideline Change',
'description': 'Treatment guidelines updated favoring DrugX approach',
'affected_products': 'DRX',
'affected_regions': 'ALL',
'impact_score': 0.10
},
{
'event_id': 3,
'event_date': (today - timedelta(days=90)).strftime('%Y-%m-%d'),
'event_type': 'Safety Alert',
'description': 'Minor safety concern raised for competing products',
'affected_products': 'CP1,CP3',
'affected_regions': 'ALL',
'impact_score': 0.05
},
{
'event_id': 4,
'event_date': (today - timedelta(days=45)).strftime('%Y-%m-%d'),
'event_type': 'Competitor Launch',
'description': 'CompDrug2 launched by MedCorp targeting the same indication as DrugX',
'affected_products': 'DRX',
'affected_regions': 'NE',
'impact_score': -0.35
},
{
'event_id': 5,
'event_date': (today - timedelta(days=60)).strftime('%Y-%m-%d'),
'event_type': 'Supply Chain Disruption',
'description': 'Supply issues affecting DrugX availability in Northeast',
'affected_products': 'DRX',
'affected_regions': 'NE',
'impact_score': -0.25
}
]
events_df = pd.DataFrame(events)
events_df.to_sql('market_events', conn, if_exists='replace', index=False)
print(f"Created {len(events_df)} seed market events")
def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None:
"""Create all seed data in the database"""
conn = sqlite3.connect(db_path)
create_seed_regions(conn)
create_seed_products(conn)
create_seed_competitor_products(conn)
create_seed_territories(conn)
create_seed_market_events(conn)
conn.commit()
conn.close()
print(f"All seed data created in {db_path}")
if __name__ == "__main__":
# Create seed data when run directly
create_seed_data() |