"""
scripts/generate_test_data.py
Generates realistic test data for Sheami using your modules.db.SheamiDB API.
Behavior:
- Creates N users (default 100)
- Each user: 3-5 patients (enforced)
- Each patient: 2-6 reports
- Each report: 3-6 tests drawn from TEST_POOL
- For each patient we write trends (per test) using add_or_update_trend
- For each patient we write a final report using add_final_report
Usage:
pip install faker pymongo python-dotenv
MONGODB_URI="mongodb+srv://<user>:<pass>@cluster0.xxxxx.mongodb.net" \
MONGODB_DB="sheami" \
python scripts/generate_test_data.py --num-users 100
The script CALLS THESE EXACT methods on your SheamiDB:
- add_user(email, name)
- add_patient(user_id, name, dob, gender)
- add_report(patient_id, file_name, parsed_data)
- add_or_update_trend(patient_id, test_name, trend_data)
- add_final_report(patient_id, summary, recommendations, trend_snapshots)
"""
import argparse
import random
from collections import defaultdict
from datetime import datetime, timedelta
import os
from faker import Faker
from dotenv import load_dotenv
# Ensure env is loaded
load_dotenv()
# import your DB wrapper
from modules.db import SheamiDB
# ---------- Config & test pool ----------
# Module-level Faker instance; generate_test_data() uses it for user names,
# unique e-mail addresses and patient names.
faker = Faker()
# Catalogue of lab tests that generated reports are sampled from.
# Each entry maps a test name to a 4-tuple (low, high, unit, reference_range):
#   low, high        - bounds passed to random.uniform() by make_test_values();
#                      if either bound is a float the value is rounded to two
#                      decimals, otherwise it is rounded to an int.
#   unit             - unit string copied into the generated test dict.
#   reference_range  - free-text range copied verbatim into the test dict;
#                      it is never parsed or used to constrain the value.
# Several generation bounds are wider than the quoted reference range
# (e.g. fasting glucose 60-130 vs. "70-99 fasting"), so out-of-range values
# can be generated.
TEST_POOL = {
"Hemoglobin": (11.0, 17.5, "g/dL", "11.0-17.5"),
"Glucose (Fasting)": (60, 130, "mg/dL", "70-99 fasting"),
"Total Cholesterol": (120, 300, "mg/dL", "<200 desirable"),
"Triglycerides": (40, 300, "mg/dL", "<150 normal"),
"HDL": (30, 90, "mg/dL", ">40 desirable"),
"LDL": (50, 200, "mg/dL", "<100 ideal"),
"Creatinine": (0.5, 1.8, "mg/dL", "0.5-1.2"),
"Urea (BUN)": (7, 30, "mg/dL", "7-20"),
"Sodium": (130, 150, "mmol/L", "135-145"),
"Potassium": (3.2, 5.2, "mmol/L", "3.5-5.0"),
"ALT": (7, 55, "U/L", "<45"),
"AST": (8, 48, "U/L", "<40"),
}
def random_date_between(start_year=2019):
    """Return a random date between 1 January of *start_year* and now.

    The result is a naive ``datetime`` at midnight: a whole number of days
    (chosen uniformly with ``random.randint``) is added to 1 January of
    *start_year*, so it never lies after the current moment.

    Args:
        start_year: Calendar year whose 1 January is the earliest possible
            result.

    Returns:
        A ``datetime`` in the inclusive range [1 Jan *start_year*, now].

    Raises:
        ValueError: If 1 January of *start_year* lies in the future, in
            which case there is no valid date to pick.
    """
    start = datetime(start_year, 1, 1)
    end = datetime.now()
    # Without this guard a future start year yields a negative day span and
    # random.randint() fails with an opaque "empty range" error.
    if start > end:
        raise ValueError(
            f"start_year {start_year} is in the future; "
            f"cannot pick a date between {start:%Y-%m-%d} and now"
        )
    days = (end - start).days
    return start + timedelta(days=random.randint(0, days))
def make_test_values(k):
    """Build *k* random test entries in the ``parsed_data.tests`` layout.

    *k* distinct tests are sampled from ``TEST_POOL``. Each gets a value
    drawn uniformly between its two bounds: rounded to two decimals when
    either bound is a float, otherwise rounded to an int.

    Returns:
        A list of dicts with the keys ``name``, ``value``, ``unit`` and
        ``reference_range``.
    """
    picked = random.sample(list(TEST_POOL.items()), k=k)
    results = []
    for test_name, spec in picked:
        lower, upper, unit, reference = spec
        sampled = random.uniform(lower, upper)
        # A float bound marks a test measured with decimals; the others are
        # reported as whole numbers.
        if any(isinstance(bound, float) for bound in (lower, upper)):
            reading = round(sampled, 2)
        else:
            reading = int(round(sampled))
        results.append(
            dict(name=test_name, value=reading, unit=unit, reference_range=reference)
        )
    return results
def compute_direction(points):
    """Classify the movement between the last two points of a series.

    Only the final two entries of *points* are compared, by their
    ``"value"`` key.

    Returns:
        ``"increasing"`` or ``"decreasing"`` when the last value is above or
        below the one before it, and ``"stable"`` when they are equal or
        fewer than two points are given.
    """
    if len(points) < 2:
        return "stable"
    latest = points[-1]["value"]
    previous = points[-2]["value"]
    if latest > previous:
        return "increasing"
    return "decreasing" if latest < previous else "stable"
# ---------- Generator function ----------
def generate_test_data(db_uri: str, db_name: str, num_users: int = 100,
                       min_patients=3, max_patients=5,
                       min_reports=2, max_reports=6,
                       min_tests=3, max_tests=6,
                       seed: int = None):
    """Fill the database with randomly generated users, patients and reports.

    For every user a random number of patients is created. Each patient gets
    a random number of reports, one trend record per distinct test name seen
    in those reports, and one final report summarising the trends. Progress
    is printed every ten users and a count of created records at the end.

    Args:
        db_uri: Connection URI handed to ``SheamiDB``.
        db_name: Database name handed to ``SheamiDB``.
        num_users: Number of users to create.
        min_patients, max_patients: Inclusive bounds for patients per user.
        min_reports, max_reports: Inclusive bounds for reports per patient.
        min_tests, max_tests: Inclusive bounds for tests per report; must fit
            within the size of ``TEST_POOL``.
        seed: Optional seed applied to both ``random`` and ``Faker``. Dates
            are still derived from the current time, so seeded runs on
            different days do not produce identical dates.

    Raises:
        ValueError: If any minimum exceeds its maximum, or the tests-per-report
            bounds fall outside ``0..len(TEST_POOL)``.
    """
    # Validate before touching the database: otherwise random.randint() /
    # random.sample() would only fail after a user or patient had already
    # been written, leaving partial data behind.
    for label, low, high in (
        ("patients", min_patients, max_patients),
        ("reports", min_reports, max_reports),
        ("tests", min_tests, max_tests),
    ):
        if low > high:
            raise ValueError(
                f"min_{label} ({low}) must not exceed max_{label} ({high})"
            )
    if min_tests < 0 or max_tests > len(TEST_POOL):
        raise ValueError(
            f"tests per report must stay within 0..{len(TEST_POOL)} "
            f"(got min_tests={min_tests}, max_tests={max_tests})"
        )

    if seed is not None:
        random.seed(seed)
        Faker.seed(seed)

    db = SheamiDB(db_uri, db_name=db_name)
    counters = {"users": 0, "patients": 0, "reports": 0, "trends": 0, "final_reports": 0}

    for u_idx in range(num_users):
        # create user
        user_name = faker.name()
        user_email = faker.unique.safe_email()
        user_id = db.add_user(email=user_email, name=user_name)
        counters["users"] += 1

        num_patients = random.randint(min_patients, max_patients)
        for _ in range(num_patients):
            patient_name = faker.name()
            # date of birth for an age between 18 and 85 years
            age = random.randint(18, 85)
            dob_dt = datetime.now() - timedelta(days=365 * age + random.randint(0, 365))
            dob_str = dob_dt.strftime("%Y-%m-%d")
            gender = random.choice(["male", "female", "other"])
            patient_id = db.add_patient(user_id=user_id, name=patient_name, dob=dob_str, gender=gender)
            counters["patients"] += 1

            # test name -> list of {"date", "value"} points across all reports
            trends_map = defaultdict(list)

            num_reports = random.randint(min_reports, max_reports)
            for _ in range(num_reports):
                report_date = random_date_between().strftime("%Y-%m-%d")
                num_tests = random.randint(min_tests, max_tests)
                tests = make_test_values(num_tests)
                parsed_data = {
                    "tests": tests,
                    "report_date": report_date,
                }
                file_name = f"report_{report_date.replace('-', '')}_{random.randint(1000,9999)}.pdf"
                db.add_report(patient_id=patient_id, file_name=file_name, parsed_data=parsed_data)
                counters["reports"] += 1
                for t in tests:
                    trends_map[t["name"]].append({"date": report_date, "value": t["value"]})

            # One pass per test: sort the series once, derive its snapshot,
            # then upsert the trend. ISO "YYYY-MM-DD" strings sort
            # chronologically, so a plain string sort is sufficient.
            trend_snapshots = []
            for test_name, points in trends_map.items():
                pts_sorted = sorted(points, key=lambda x: x["date"])
                # Snapshot is taken before the DB call so it cannot be
                # affected by anything the DB layer does with the list.
                snapshot = {
                    "test_name": test_name,
                    "latest_value": pts_sorted[-1]["value"],
                    "direction": compute_direction(pts_sorted),
                }
                db.add_or_update_trend(patient_id=patient_id, test_name=test_name, trend_data=pts_sorted)
                counters["trends"] += 1
                trend_snapshots.append(snapshot)

            summary = f"Auto-generated summary for {patient_name} ({len(trend_snapshots)} tests)"
            # simple heuristic: if any test is trending up, recommend follow-up
            if any(ts["direction"] == "increasing" for ts in trend_snapshots):
                recommendations = ["Follow up for rising values"]
            else:
                recommendations = ["Continue routine monitoring"]

            db.add_final_report(patient_id=patient_id,
                                summary=summary,
                                recommendations=recommendations,
                                trend_snapshots=trend_snapshots)
            counters["final_reports"] += 1

        # occasional progress print
        if (u_idx + 1) % 10 == 0 or (u_idx + 1) == num_users:
            print(f"Created {u_idx+1}/{num_users} users so far...")

    # summary
    print("Generation complete. Summary:")
    for k, v in counters.items():
        print(f"  {k}: {v}")
# ---------- CLI ----------
if __name__ == "__main__":
    # Command-line entry point: the options are declared as a table, then
    # registered on the parser in the same order as listed.
    cli = argparse.ArgumentParser(description="Generate test data for Sheami (matches your db.py).")
    option_specs = (
        ("--num-users", dict(type=int, default=100, help="Number of users to create")),
        # Connection settings fall back to the environment, then to local defaults.
        ("--db-uri", dict(type=str,
                          default=os.getenv("MONGODB_URI", "mongodb://localhost:27017"),
                          help="MongoDB connection URI")),
        ("--db-name", dict(type=str,
                           default=os.getenv("MONGODB_DB", "sheami"),
                           help="Database name")),
        ("--seed", dict(type=int, default=None, help="Random seed (optional)")),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()
    generate_test_data(
        db_uri=opts.db_uri,
        db_name=opts.db_name,
        num_users=opts.num_users,
        seed=opts.seed,
    )
|