File size: 7,208 Bytes
3e9cac4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""
Pharmaceutical Analytics Seed Data

This module contains functions to generate initial seed data for the pharmaceutical
analytics database. It creates a minimal set of data entries that serve as the 
foundation for the larger synthetic dataset.
"""

import sqlite3
import pandas as pd
from datetime import datetime, timedelta

def create_seed_regions(conn: sqlite3.Connection) -> None:
    """Create seed data for regions"""
    regions = pd.DataFrame({
        'region_id': ['NE', 'SE', 'MW', 'SW', 'W'],
        'region_name': ['Northeast', 'Southeast', 'Midwest', 'Southwest', 'West'],
        'country': ['USA'] * 5,
        'division': ['East', 'East', 'Central', 'Central', 'West'],
        'population': [55000000, 62000000, 70000000, 42000000, 65000000]
    })
    
    regions.to_sql('regions', conn, if_exists='replace', index=False)
    print(f"Created {len(regions)} seed regions")

def create_seed_products(conn: sqlite3.Connection) -> None:
    """Create seed data for products"""
    products = pd.DataFrame({
        'product_id': ['DRX', 'PRX', 'TRX', 'ZRX', 'NRX'],
        'product_name': ['DrugX', 'PainRex', 'TranquiX', 'ZymoRex', 'NeuroRex'],
        'therapeutic_area': ['Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'],
        'molecule': ['moleculeX', 'moleculeP', 'moleculeT', 'moleculeZ', 'moleculeN'],
        'launch_date': ['2020-01-01', '2018-06-15', '2021-03-10', '2019-11-05', '2022-01-20'],
        'status': ['Active', 'Active', 'Active', 'Active', 'Active'],
        'list_price': [299.99, 199.99, 499.99, 399.99, 599.99]
    })
    
    products.to_sql('products', conn, if_exists='replace', index=False)
    print(f"Created {len(products)} seed products")

def create_seed_competitor_products(conn: sqlite3.Connection) -> None:
    """Create seed data for competitor products"""
    competitor_products = pd.DataFrame({
        'competitor_product_id': ['CP1', 'CP2', 'CP3', 'CP4', 'CP5', 'CP6'],
        'product_name': ['CompDrug1', 'CompDrug2', 'CompDrug3', 'CompDrug4', 'CompDrug5', 'CompDrug6'],
        'manufacturer': ['CompPharma', 'MedCorp', 'BioSolutions', 'GeneriCo', 'PharmGiant', 'MoleCorp'],
        'therapeutic_area': ['Cardiology', 'Cardiology', 'Pain Management', 'Neurology', 'Immunology', 'Neurology'],
        'molecule': ['moleculeC1', 'moleculeC2', 'moleculeC3', 'moleculeC4', 'moleculeC5', 'moleculeC6'],
        'launch_date': ['2019-05-10', '2023-01-15', '2017-11-20', '2020-08-05', '2021-03-15', '2022-07-10'],
        'list_price': [279.99, 259.99, 189.99, 459.99, 379.99, 549.99],
        'competing_with_product_id': ['DRX', 'DRX', 'PRX', 'TRX', 'ZRX', 'NRX']
    })
    
    competitor_products.to_sql('competitor_products', conn, if_exists='replace', index=False)
    print(f"Created {len(competitor_products)} seed competitor products")

def create_seed_territories(conn: sqlite3.Connection) -> None:
    """Create seed data for territories"""
    territory_mapping = {
        'NE': ['NE-NYC', 'NE-BOS', 'NE-PHL', 'NE-DCA'],
        'SE': ['SE-ATL', 'SE-MIA', 'SE-CLT', 'SE-NSH'],
        'MW': ['MW-CHI', 'MW-DET', 'MW-MIN', 'MW-STL'],
        'SW': ['SW-DAL', 'SW-HOU', 'SW-PHX', 'SW-DEN'],
        'W': ['W-LAX', 'W-SFO', 'W-SEA', 'W-PDX']
    }
    
    territory_names = {
        'NE-NYC': 'New York Metro', 'NE-BOS': 'New England', 'NE-PHL': 'Philadelphia', 'NE-DCA': 'DC-Baltimore',
        'SE-ATL': 'Atlanta', 'SE-MIA': 'Florida', 'SE-CLT': 'Carolinas', 'SE-NSH': 'Tennessee Valley',
        'MW-CHI': 'Chicago', 'MW-DET': 'Great Lakes', 'MW-MIN': 'Upper Midwest', 'MW-STL': 'Missouri Valley',
        'SW-DAL': 'North Texas', 'SW-HOU': 'Gulf Coast', 'SW-PHX': 'Southwest Desert', 'SW-DEN': 'Mountain',
        'W-LAX': 'Southern California', 'W-SFO': 'Northern California', 'W-SEA': 'Pacific Northwest', 'W-PDX': 'Northwest'
    }
    
    territories = []
    sales_reps = [f'REP{i:03d}' for i in range(1, 41)]
    rep_idx = 0
    
    for region_id, territory_ids in territory_mapping.items():
        for territory_id in territory_ids:
            territories.append({
                'territory_id': territory_id,
                'territory_name': territory_names[territory_id],
                'region_id': region_id,
                'sales_rep_id': sales_reps[rep_idx]
            })
            rep_idx += 1
    
    territories_df = pd.DataFrame(territories)
    territories_df.to_sql('territories', conn, if_exists='replace', index=False)
    print(f"Created {len(territories_df)} seed territories")

def create_seed_market_events(conn: sqlite3.Connection) -> None:
    """Create seed data for market events"""
    today = datetime.now()
    events = [
        {
            'event_id': 1,
            'event_date': (today - timedelta(days=365)).strftime('%Y-%m-%d'),
            'event_type': 'FDA Approval',
            'description': 'New indication approved for DrugX',
            'affected_products': 'DRX',
            'affected_regions': 'ALL',
            'impact_score': 0.15
        },
        {
            'event_id': 2,
            'event_date': (today - timedelta(days=180)).strftime('%Y-%m-%d'),
            'event_type': 'Guideline Change',
            'description': 'Treatment guidelines updated favoring DrugX approach',
            'affected_products': 'DRX',
            'affected_regions': 'ALL',
            'impact_score': 0.10
        },
        {
            'event_id': 3,
            'event_date': (today - timedelta(days=90)).strftime('%Y-%m-%d'),
            'event_type': 'Safety Alert',
            'description': 'Minor safety concern raised for competing products',
            'affected_products': 'CP1,CP3',
            'affected_regions': 'ALL',
            'impact_score': 0.05
        },
        {
            'event_id': 4,
            'event_date': (today - timedelta(days=45)).strftime('%Y-%m-%d'),
            'event_type': 'Competitor Launch',
            'description': 'CompDrug2 launched by MedCorp targeting the same indication as DrugX',
            'affected_products': 'DRX',
            'affected_regions': 'NE',
            'impact_score': -0.35
        },
        {
            'event_id': 5,
            'event_date': (today - timedelta(days=60)).strftime('%Y-%m-%d'),
            'event_type': 'Supply Chain Disruption',
            'description': 'Supply issues affecting DrugX availability in Northeast',
            'affected_products': 'DRX',
            'affected_regions': 'NE',
            'impact_score': -0.25
        }
    ]
    
    events_df = pd.DataFrame(events)
    events_df.to_sql('market_events', conn, if_exists='replace', index=False)
    print(f"Created {len(events_df)} seed market events")

def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None:
    """Create all seed data in the database"""
    conn = sqlite3.connect(db_path)
    
    create_seed_regions(conn)
    create_seed_products(conn)
    create_seed_competitor_products(conn)
    create_seed_territories(conn)
    create_seed_market_events(conn)
    
    conn.commit()
    conn.close()
    
    print(f"All seed data created in {db_path}")

if __name__ == "__main__":
    # Create seed data when run directly
    create_seed_data()