cryogenic22 committed on
Commit
3e9cac4
·
verified ·
1 Parent(s): d419ea2

Create data/seed_data.py

Browse files
Files changed (1) hide show
  1. data/seed_data.py +165 -0
data/seed_data.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pharmaceutical Analytics Seed Data
3
+
4
+ This module contains functions to generate initial seed data for the pharmaceutical
5
+ analytics database. It creates a minimal set of data entries that serve as the
6
+ foundation for the larger synthetic dataset.
7
+ """
8
+
9
+ import sqlite3
10
+ import pandas as pd
11
+ from datetime import datetime, timedelta
12
+
13
def create_seed_regions(conn: sqlite3.Connection) -> None:
    """Write the seed ``regions`` table to the database.

    Builds a five-row table (one row per region, all with country 'USA')
    and stores it via ``DataFrame.to_sql``; an existing ``regions`` table
    is replaced.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    column_names = ['region_id', 'region_name', 'country', 'division', 'population']
    # One tuple per region, in the order the rows are written.
    region_rows = [
        ('NE', 'Northeast', 'USA', 'East', 55000000),
        ('SE', 'Southeast', 'USA', 'East', 62000000),
        ('MW', 'Midwest', 'USA', 'Central', 70000000),
        ('SW', 'Southwest', 'USA', 'Central', 42000000),
        ('W', 'West', 'USA', 'West', 65000000),
    ]
    region_table = pd.DataFrame(region_rows, columns=column_names)

    region_table.to_sql('regions', conn, if_exists='replace', index=False)
    print(f"Created {len(region_table)} seed regions")
25
+
26
def create_seed_products(conn: sqlite3.Connection) -> None:
    """Write the seed ``products`` table to the database.

    Stores five product rows (id, name, therapeutic area, molecule,
    launch date as a 'YYYY-MM-DD' string, status and list price) via
    ``DataFrame.to_sql``; an existing ``products`` table is replaced.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    column_names = [
        'product_id', 'product_name', 'therapeutic_area', 'molecule',
        'launch_date', 'status', 'list_price',
    ]
    # One tuple per product, in the order the rows are written.
    product_rows = [
        ('DRX', 'DrugX', 'Cardiology', 'moleculeX', '2020-01-01', 'Active', 299.99),
        ('PRX', 'PainRex', 'Pain Management', 'moleculeP', '2018-06-15', 'Active', 199.99),
        ('TRX', 'TranquiX', 'Neurology', 'moleculeT', '2021-03-10', 'Active', 499.99),
        ('ZRX', 'ZymoRex', 'Immunology', 'moleculeZ', '2019-11-05', 'Active', 399.99),
        ('NRX', 'NeuroRex', 'Neurology', 'moleculeN', '2022-01-20', 'Active', 599.99),
    ]
    product_table = pd.DataFrame(product_rows, columns=column_names)

    product_table.to_sql('products', conn, if_exists='replace', index=False)
    print(f"Created {len(product_table)} seed products")
40
+
41
def create_seed_competitor_products(conn: sqlite3.Connection) -> None:
    """Write the seed ``competitor_products`` table to the database.

    Stores six competitor rows; each row names, in
    ``competing_with_product_id``, one of the product ids used by the
    seed ``products`` table ('DRX', 'PRX', 'TRX', 'ZRX', 'NRX'). An
    existing ``competitor_products`` table is replaced.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    column_names = [
        'competitor_product_id', 'product_name', 'manufacturer',
        'therapeutic_area', 'molecule', 'launch_date', 'list_price',
        'competing_with_product_id',
    ]
    # One tuple per competitor product, in the order the rows are written.
    competitor_rows = [
        ('CP1', 'CompDrug1', 'CompPharma', 'Cardiology', 'moleculeC1', '2019-05-10', 279.99, 'DRX'),
        ('CP2', 'CompDrug2', 'MedCorp', 'Cardiology', 'moleculeC2', '2023-01-15', 259.99, 'DRX'),
        ('CP3', 'CompDrug3', 'BioSolutions', 'Pain Management', 'moleculeC3', '2017-11-20', 189.99, 'PRX'),
        ('CP4', 'CompDrug4', 'GeneriCo', 'Neurology', 'moleculeC4', '2020-08-05', 459.99, 'TRX'),
        ('CP5', 'CompDrug5', 'PharmGiant', 'Immunology', 'moleculeC5', '2021-03-15', 379.99, 'ZRX'),
        ('CP6', 'CompDrug6', 'MoleCorp', 'Neurology', 'moleculeC6', '2022-07-10', 549.99, 'NRX'),
    ]
    competitor_table = pd.DataFrame(competitor_rows, columns=column_names)

    competitor_table.to_sql('competitor_products', conn, if_exists='replace', index=False)
    print(f"Created {len(competitor_table)} seed competitor products")
56
+
57
def create_seed_territories(conn: sqlite3.Connection) -> None:
    """Write the seed ``territories`` table to the database.

    Stores twenty territories, four per region. Each territory gets its
    own sales rep id, numbered sequentially in row order from 'REP001'
    to 'REP020'. An existing ``territories`` table is replaced.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    # (region_id, territory_id, territory_name), listed in output order.
    layout = [
        ('NE', 'NE-NYC', 'New York Metro'),
        ('NE', 'NE-BOS', 'New England'),
        ('NE', 'NE-PHL', 'Philadelphia'),
        ('NE', 'NE-DCA', 'DC-Baltimore'),
        ('SE', 'SE-ATL', 'Atlanta'),
        ('SE', 'SE-MIA', 'Florida'),
        ('SE', 'SE-CLT', 'Carolinas'),
        ('SE', 'SE-NSH', 'Tennessee Valley'),
        ('MW', 'MW-CHI', 'Chicago'),
        ('MW', 'MW-DET', 'Great Lakes'),
        ('MW', 'MW-MIN', 'Upper Midwest'),
        ('MW', 'MW-STL', 'Missouri Valley'),
        ('SW', 'SW-DAL', 'North Texas'),
        ('SW', 'SW-HOU', 'Gulf Coast'),
        ('SW', 'SW-PHX', 'Southwest Desert'),
        ('SW', 'SW-DEN', 'Mountain'),
        ('W', 'W-LAX', 'Southern California'),
        ('W', 'W-SFO', 'Northern California'),
        ('W', 'W-SEA', 'Pacific Northwest'),
        ('W', 'W-PDX', 'Northwest'),
    ]

    # The 1-based position of a territory doubles as its rep number.
    territory_rows = [
        (territory_id, territory_name, region_id, f'REP{position:03d}')
        for position, (region_id, territory_id, territory_name) in enumerate(layout, start=1)
    ]

    territories_df = pd.DataFrame(
        territory_rows,
        columns=['territory_id', 'territory_name', 'region_id', 'sales_rep_id'],
    )
    territories_df.to_sql('territories', conn, if_exists='replace', index=False)
    print(f"Created {len(territories_df)} seed territories")
92
+
93
def create_seed_market_events(conn: sqlite3.Connection) -> None:
    """Write the seed ``market_events`` table to the database.

    Stores five events whose dates are computed relative to the moment
    the function runs (``datetime.now()``) and formatted as 'YYYY-MM-DD'.
    Event ids are assigned 1..5 in listing order, which is not strictly
    chronological: event 4 is 45 days back, event 5 is 60 days back.
    An existing ``market_events`` table is replaced.

    Args:
        conn: Open SQLite connection that receives the table.
    """
    reference = datetime.now()

    # (days before `reference`, event_type, description,
    #  affected_products, affected_regions, impact_score)
    event_specs = [
        (365, 'FDA Approval',
         'New indication approved for DrugX',
         'DRX', 'ALL', 0.15),
        (180, 'Guideline Change',
         'Treatment guidelines updated favoring DrugX approach',
         'DRX', 'ALL', 0.10),
        (90, 'Safety Alert',
         'Minor safety concern raised for competing products',
         'CP1,CP3', 'ALL', 0.05),
        (45, 'Competitor Launch',
         'CompDrug2 launched by MedCorp targeting the same indication as DrugX',
         'DRX', 'NE', -0.35),
        (60, 'Supply Chain Disruption',
         'Supply issues affecting DrugX availability in Northeast',
         'DRX', 'NE', -0.25),
    ]

    event_rows = [
        (
            event_id,
            (reference - timedelta(days=days_back)).strftime('%Y-%m-%d'),
            event_type,
            description,
            products,
            regions,
            impact,
        )
        for event_id, (days_back, event_type, description, products, regions, impact)
        in enumerate(event_specs, start=1)
    ]

    events_df = pd.DataFrame(
        event_rows,
        columns=[
            'event_id', 'event_date', 'event_type', 'description',
            'affected_products', 'affected_regions', 'impact_score',
        ],
    )
    events_df.to_sql('market_events', conn, if_exists='replace', index=False)
    print(f"Created {len(events_df)} seed market events")
147
+
148
def create_seed_data(db_path: str = "data/pharma_db.sqlite") -> None:
    """Create all seed tables in the SQLite database at ``db_path``.

    Opens a connection, runs every ``create_seed_*`` step in order
    (regions, products, competitor products, territories, market events),
    commits, and closes the connection.

    Args:
        db_path: Path of the SQLite database file to write to.

    Raises:
        Whatever ``sqlite3.connect`` or a seeding step raises. The
        connection is closed before the exception propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        create_seed_regions(conn)
        create_seed_products(conn)
        create_seed_competitor_products(conn)
        create_seed_territories(conn)
        create_seed_market_events(conn)

        conn.commit()
    finally:
        # Always release the database handle, even when a step fails,
        # so the file is not left open by a half-finished run.
        conn.close()

    print(f"All seed data created in {db_path}")
162
+
163
if __name__ == "__main__":
    # Run as a script: populate the default database with every seed table.
    create_seed_data()