Spaces:
Runtime error
Runtime error
Create data/seed_data.py
Browse files- data/seed_data.py +165 -0
data/seed_data.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pharmaceutical Analytics Seed Data
|
| 3 |
+
|
| 4 |
+
This module contains functions to generate initial seed data for the pharmaceutical
|
| 5 |
+
analytics database. It creates a minimal set of data entries that serve as the
|
| 6 |
+
foundation for the larger synthetic dataset.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sqlite3
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from datetime import datetime, timedelta
|
| 12 |
+
|
| 13 |
+
def create_seed_regions(conn: sqlite3.Connection) -> None:
    """Write the five seed sales regions to the ``regions`` table.

    Any existing ``regions`` table is replaced, and the DataFrame index is
    not stored as a column.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    column_names = ['region_id', 'region_name', 'country', 'division', 'population']
    # One tuple per region, in the same order as ``column_names``.
    region_rows = [
        ('NE', 'Northeast', 'USA', 'East', 55000000),
        ('SE', 'Southeast', 'USA', 'East', 62000000),
        ('MW', 'Midwest', 'USA', 'Central', 70000000),
        ('SW', 'Southwest', 'USA', 'Central', 42000000),
        ('W', 'West', 'USA', 'West', 65000000),
    ]
    regions = pd.DataFrame(region_rows, columns=column_names)

    regions.to_sql('regions', conn, if_exists='replace', index=False)
    print(f"Created {len(regions)} seed regions")
|
| 25 |
+
|
| 26 |
+
def create_seed_products(conn: sqlite3.Connection) -> None:
    """Write the five seed products to the ``products`` table.

    Any existing ``products`` table is replaced, and the DataFrame index is
    not stored as a column. Launch dates are stored as ``YYYY-MM-DD`` strings.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    column_names = [
        'product_id',
        'product_name',
        'therapeutic_area',
        'molecule',
        'launch_date',
        'status',
        'list_price',
    ]
    # One tuple per product, in the same order as ``column_names``.
    product_rows = [
        ('DRX', 'DrugX', 'Cardiology', 'moleculeX', '2020-01-01', 'Active', 299.99),
        ('PRX', 'PainRex', 'Pain Management', 'moleculeP', '2018-06-15', 'Active', 199.99),
        ('TRX', 'TranquiX', 'Neurology', 'moleculeT', '2021-03-10', 'Active', 499.99),
        ('ZRX', 'ZymoRex', 'Immunology', 'moleculeZ', '2019-11-05', 'Active', 399.99),
        ('NRX', 'NeuroRex', 'Neurology', 'moleculeN', '2022-01-20', 'Active', 599.99),
    ]
    products = pd.DataFrame(product_rows, columns=column_names)

    products.to_sql('products', conn, if_exists='replace', index=False)
    print(f"Created {len(products)} seed products")
|
| 40 |
+
|
| 41 |
+
def create_seed_competitor_products(conn: sqlite3.Connection) -> None:
    """Write the six seed competitor products to ``competitor_products``.

    Each row names, in ``competing_with_product_id``, the product ID it
    competes with. Any existing ``competitor_products`` table is replaced,
    and the DataFrame index is not stored as a column.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    column_names = [
        'competitor_product_id',
        'product_name',
        'manufacturer',
        'therapeutic_area',
        'molecule',
        'launch_date',
        'list_price',
        'competing_with_product_id',
    ]
    # One tuple per competitor product, in the same order as ``column_names``.
    competitor_rows = [
        ('CP1', 'CompDrug1', 'CompPharma', 'Cardiology', 'moleculeC1', '2019-05-10', 279.99, 'DRX'),
        ('CP2', 'CompDrug2', 'MedCorp', 'Cardiology', 'moleculeC2', '2023-01-15', 259.99, 'DRX'),
        ('CP3', 'CompDrug3', 'BioSolutions', 'Pain Management', 'moleculeC3', '2017-11-20', 189.99, 'PRX'),
        ('CP4', 'CompDrug4', 'GeneriCo', 'Neurology', 'moleculeC4', '2020-08-05', 459.99, 'TRX'),
        ('CP5', 'CompDrug5', 'PharmGiant', 'Immunology', 'moleculeC5', '2021-03-15', 379.99, 'ZRX'),
        ('CP6', 'CompDrug6', 'MoleCorp', 'Neurology', 'moleculeC6', '2022-07-10', 549.99, 'NRX'),
    ]
    competitor_products = pd.DataFrame(competitor_rows, columns=column_names)

    competitor_products.to_sql('competitor_products', conn, if_exists='replace', index=False)
    print(f"Created {len(competitor_products)} seed competitor products")
|
| 56 |
+
|
| 57 |
+
def create_seed_territories(conn: sqlite3.Connection) -> None:
    """Write the seed sales territories to the ``territories`` table.

    Four territories are defined for each of the five regions. Sales rep IDs
    are assigned sequentially in row order (``REP001`` for the first row,
    ``REP002`` for the second, and so on). Any existing ``territories`` table
    is replaced, and the DataFrame index is not stored as a column.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    # (region_id, territory_id, territory_name), listed in storage order.
    territory_layout = [
        ('NE', 'NE-NYC', 'New York Metro'),
        ('NE', 'NE-BOS', 'New England'),
        ('NE', 'NE-PHL', 'Philadelphia'),
        ('NE', 'NE-DCA', 'DC-Baltimore'),
        ('SE', 'SE-ATL', 'Atlanta'),
        ('SE', 'SE-MIA', 'Florida'),
        ('SE', 'SE-CLT', 'Carolinas'),
        ('SE', 'SE-NSH', 'Tennessee Valley'),
        ('MW', 'MW-CHI', 'Chicago'),
        ('MW', 'MW-DET', 'Great Lakes'),
        ('MW', 'MW-MIN', 'Upper Midwest'),
        ('MW', 'MW-STL', 'Missouri Valley'),
        ('SW', 'SW-DAL', 'North Texas'),
        ('SW', 'SW-HOU', 'Gulf Coast'),
        ('SW', 'SW-PHX', 'Southwest Desert'),
        ('SW', 'SW-DEN', 'Mountain'),
        ('W', 'W-LAX', 'Southern California'),
        ('W', 'W-SFO', 'Northern California'),
        ('W', 'W-SEA', 'Pacific Northwest'),
        ('W', 'W-PDX', 'Northwest'),
    ]

    # The 1-based position in the layout doubles as the sales rep number.
    territory_rows = [
        {
            'territory_id': territory_id,
            'territory_name': territory_name,
            'region_id': region_id,
            'sales_rep_id': f'REP{rep_number:03d}',
        }
        for rep_number, (region_id, territory_id, territory_name)
        in enumerate(territory_layout, start=1)
    ]

    territories_df = pd.DataFrame(territory_rows)
    territories_df.to_sql('territories', conn, if_exists='replace', index=False)
    print(f"Created {len(territories_df)} seed territories")
|
| 92 |
+
|
| 93 |
+
def create_seed_market_events(conn: sqlite3.Connection) -> None:
    """Write the five seed market events to the ``market_events`` table.

    Event dates are computed relative to the current local time when the
    function runs and are stored as ``YYYY-MM-DD`` strings. Any existing
    ``market_events`` table is replaced, and the DataFrame index is not
    stored as a column.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    today = datetime.now()

    # (event_id, days_before_today, event_type, description,
    #  affected_products, affected_regions, impact_score)
    event_specs = [
        (1, 365, 'FDA Approval',
         'New indication approved for DrugX',
         'DRX', 'ALL', 0.15),
        (2, 180, 'Guideline Change',
         'Treatment guidelines updated favoring DrugX approach',
         'DRX', 'ALL', 0.10),
        (3, 90, 'Safety Alert',
         'Minor safety concern raised for competing products',
         'CP1,CP3', 'ALL', 0.05),
        (4, 45, 'Competitor Launch',
         'CompDrug2 launched by MedCorp targeting the same indication as DrugX',
         'DRX', 'NE', -0.35),
        (5, 60, 'Supply Chain Disruption',
         'Supply issues affecting DrugX availability in Northeast',
         'DRX', 'NE', -0.25),
    ]

    event_rows = [
        {
            'event_id': event_id,
            'event_date': (today - timedelta(days=days_ago)).strftime('%Y-%m-%d'),
            'event_type': event_type,
            'description': description,
            'affected_products': products,
            'affected_regions': regions,
            'impact_score': impact,
        }
        for event_id, days_ago, event_type, description, products, regions, impact
        in event_specs
    ]

    events_df = pd.DataFrame(event_rows)
    events_df.to_sql('market_events', conn, if_exists='replace', index=False)
    print(f"Created {len(events_df)} seed market events")
|
| 147 |
+
|
| 148 |
+
def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None:
    """Create all seed tables in the SQLite database at ``db_path``.

    Runs the seeding steps in order: regions, products, competitor products,
    territories, market events. The connection is always closed, even when a
    step raises; the final commit is only issued when every step succeeds.

    Args:
        db_path: Path of the SQLite database file to open.

    Raises:
        sqlite3.Error: If the database cannot be opened or written to.
    """
    conn = sqlite3.connect(db_path)
    try:
        create_seed_regions(conn)
        create_seed_products(conn)
        create_seed_competitor_products(conn)
        create_seed_territories(conn)
        create_seed_market_events(conn)

        conn.commit()
    finally:
        # Release the file handle even if a seeding step failed part-way.
        conn.close()

    print(f"All seed data created in {db_path}")
|
| 162 |
+
|
| 163 |
+
if __name__ == "__main__":
    # Script entry point: seed the database at the default path.
    create_seed_data()
|