sales_analytics / data /seed_data.py
cryogenic22's picture
Create data/seed_data.py
3e9cac4 verified
"""
Pharmaceutical Analytics Seed Data
This module contains functions to generate initial seed data for the pharmaceutical
analytics database. It creates a minimal set of data entries that serve as the
foundation for the larger synthetic dataset.
"""
import sqlite3
import pandas as pd
from datetime import datetime, timedelta
def create_seed_regions(conn: sqlite3.Connection) -> None:
"""Create seed data for regions"""
regions = pd.DataFrame({
'region_id': ['NE', 'SE', 'MW', 'SW', 'W'],
'region_name': ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West'],
'country': ['USA'] * 5,
'division': ['East', 'East', 'Central', 'Central', 'West'],
'population': [55000000, 62000000, 70000000, 42000000, 65000000]
})
regions.to_sql('regions', conn, if_exists='replace', index=False)
print(f"Created {len(regions)} seed regions")
def create_seed_products(conn: sqlite3.Connection) -> None:
"""Create seed data for products"""
products = pd.DataFrame({
'product_id': ['DRX', 'PRX', 'TRX', 'ZRX', 'NRX'],
'product_name': ['DrugX', 'PainRex', 'TranquiX', 'ZymoRex', 'NeuroRex'],
'therapeutic_area': ['Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'],
'molecule': ['moleculeX', 'moleculeP', 'moleculeT', 'moleculeZ', 'moleculeN'],
'launch_date': ['2020-01-01', '2018-06-15', '2021-03-10', '2019-11-05', '2022-01-20'],
'status': ['Active', 'Active', 'Active', 'Active', 'Active'],
'list_price': [299.99, 199.99, 499.99, 399.99, 599.99]
})
products.to_sql('products', conn, if_exists='replace', index=False)
print(f"Created {len(products)} seed products")
def create_seed_competitor_products(conn: sqlite3.Connection) -> None:
"""Create seed data for competitor products"""
competitor_products = pd.DataFrame({
'competitor_product_id': ['CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6'],
'product_name': ['CompDrug1', 'CompDrug2', 'CompDrug3', 'CompDrug4', 'CompDrug5', 'CompDrug6'],
'manufacturer': ['CompPharma', 'MedCorp', 'BioSolutions', 'GeneriCo', 'PharmGiant', 'MoleCorp'],
'therapeutic_area': ['Cardiology', 'Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'],
'molecule': ['moleculeC1', 'moleculeC2', 'moleculeC3', 'moleculeC4', 'moleculeC5', 'moleculeC6'],
'launch_date': ['2019-05-10', '2023-01-15', '2017-11-20', '2020-08-05', '2021-03-15', '2022-07-10'],
'list_price': [279.99, 259.99, 189.99, 459.99, 379.99, 549.99],
'competing_with_product_id': ['DRX', 'DRX', 'PRX', 'TRX', 'ZRX', 'NRX']
})
competitor_products.to_sql('competitor_products', conn, if_exists='replace', index=False)
print(f"Created {len(competitor_products)} seed competitor products")
def create_seed_territories(conn: sqlite3.Connection) -> None:
"""Create seed data for territories"""
territory_mapping = {
'NE': ['NE-NYC', 'NE-BOS', 'NE-PHL', 'NE-DCA'],
'SE': ['SE-ATL', 'SE-MIA', 'SE-CLT', 'SE-NSH'],
'MW': ['MW-CHI', 'MW-DET', 'MW-MIN', 'MW-STL'],
'SW': ['SW-DAL', 'SW-HOU', 'SW-PHX', 'SW-DEN'],
'W': ['W-LAX', 'W-SFO', 'W-SEA', 'W-PDX']
}
territory_names = {
'NE-NYC': 'New York Metro', 'NE-BOS': 'New England', 'NE-PHL': 'Philadelphia', 'NE-DCA': 'DC-Baltimore',
'SE-ATL': 'Atlanta', 'SE-MIA': 'Florida', 'SE-CLT': 'Carolinas', 'SE-NSH': 'Tennessee Valley',
'MW-CHI': 'Chicago', 'MW-DET': 'Great Lakes', 'MW-MIN': 'Upper Midwest', 'MW-STL': 'Missouri Valley',
'SW-DAL': 'North Texas', 'SW-HOU': 'Gulf Coast', 'SW-PHX': 'Southwest Desert', 'SW-DEN': 'Mountain',
'W-LAX': 'Southern California', 'W-SFO': 'Northern California', 'W-SEA': 'Pacific Northwest', 'W-PDX': 'Northwest'
}
territories = []
sales_reps = [f'REP{i:03d}' for i in range(1, 41)]
rep_idx = 0
for region_id, territory_ids in territory_mapping.items():
for territory_id in territory_ids:
territories.append({
'territory_id': territory_id,
'territory_name': territory_names[territory_id],
'region_id': region_id,
'sales_rep_id': sales_reps[rep_idx]
})
rep_idx += 1
territories_df = pd.DataFrame(territories)
territories_df.to_sql('territories', conn, if_exists='replace', index=False)
print(f"Created {len(territories_df)} seed territories")
def create_seed_market_events(conn: sqlite3.Connection) -> None:
"""Create seed data for market events"""
today = datetime.now()
events = [
{
'event_id': 1,
'event_date': (today - timedelta(days=365)).strftime('%Y-%m-%d'),
'event_type': 'FDA Approval',
'description': 'New indication approved for DrugX',
'affected_products': 'DRX',
'affected_regions': 'ALL',
'impact_score': 0.15
},
{
'event_id': 2,
'event_date': (today - timedelta(days=180)).strftime('%Y-%m-%d'),
'event_type': 'Guideline Change',
'description': 'Treatment guidelines updated favoring DrugX approach',
'affected_products': 'DRX',
'affected_regions': 'ALL',
'impact_score': 0.10
},
{
'event_id': 3,
'event_date': (today - timedelta(days=90)).strftime('%Y-%m-%d'),
'event_type': 'Safety Alert',
'description': 'Minor safety concern raised for competing products',
'affected_products': 'CP1,CP3',
'affected_regions': 'ALL',
'impact_score': 0.05
},
{
'event_id': 4,
'event_date': (today - timedelta(days=45)).strftime('%Y-%m-%d'),
'event_type': 'Competitor Launch',
'description': 'CompDrug2 launched by MedCorp targeting the same indication as DrugX',
'affected_products': 'DRX',
'affected_regions': 'NE',
'impact_score': -0.35
},
{
'event_id': 5,
'event_date': (today - timedelta(days=60)).strftime('%Y-%m-%d'),
'event_type': 'Supply Chain Disruption',
'description': 'Supply issues affecting DrugX availability in Northeast',
'affected_products': 'DRX',
'affected_regions': 'NE',
'impact_score': -0.25
}
]
events_df = pd.DataFrame(events)
events_df.to_sql('market_events', conn, if_exists='replace', index=False)
print(f"Created {len(events_df)} seed market events")
def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None:
"""Create all seed data in the database"""
conn = sqlite3.connect(db_path)
create_seed_regions(conn)
create_seed_products(conn)
create_seed_competitor_products(conn)
create_seed_territories(conn)
create_seed_market_events(conn)
conn.commit()
conn.close()
print(f"All seed data created in {db_path}")
if __name__ == "__main__":
# Create seed data when run directly
create_seed_data()