File size: 1,472 Bytes
268b40a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | import os
import json
from datasets import load_dataset
def setup():
    """Populate the vector-store collection with clinical notes if it is empty.

    When the collection already holds documents, only a status line is
    printed. Otherwise the first 200 rows of the ``train`` split of the
    ``AGBonnet/augmented-clinical-notes`` dataset are loaded and each row's
    ``full_note`` is added to the collection, keyed by the row's ``idx`` and
    tagged with the age, sex and visit motivation found in its ``summary``.

    Loading is best-effort: a row that cannot be parsed or stored is reported
    on stdout and skipped, and the remaining rows are still processed.
    """
    # Imported at call time, as in the original.
    # NOTE(review): presumably this defers vector-store initialisation until
    # setup() actually runs -- confirm against vector_store.
    from vector_store import collection

    existing = collection.count()
    if existing != 0:
        print(f"✅ ChromaDB already has {existing} notes")
        return

    print("Loading dataset into ChromaDB...")
    dataset = load_dataset(
        "AGBonnet/augmented-clinical-notes",
        split="train[:200]",
    )
    df = dataset.to_pandas()

    success = 0
    skipped = 0
    for i, row in df.iterrows():
        try:
            # The summary may arrive as a JSON string or as an already
            # decoded mapping; normalise it before reading fields.
            summary = row["summary"]
            if isinstance(summary, str):
                summary = json.loads(summary)

            patient_info = summary.get("patient information", {})
            age = patient_info.get("age", "Unknown")
            sex = patient_info.get("sex", "Unknown")
            visit_motivation = summary.get("visit motivation", "Unknown")

            note_id = str(row["idx"])
            collection.add(
                documents=[row["full_note"]],
                metadatas=[{
                    "idx": note_id,
                    "age": str(age),
                    "sex": str(sex),
                    # Metadata value is capped at 200 characters.
                    "visit_motivation": str(visit_motivation)[:200],
                }],
                ids=[note_id],
            )
        except Exception as exc:
            # Deliberately broad: one bad row must not abort the whole load.
            # Unlike a silent `continue`, the reason is surfaced so data or
            # store problems are visible to whoever runs the script.
            skipped += 1
            print(f"⚠️ Skipped row {i}: {type(exc).__name__}: {exc}")
            continue
        success += 1

    print(f"✅ Loaded {success} notes into ChromaDB")
    if skipped:
        print(f"⚠️ Skipped {skipped} notes that could not be parsed or stored")
# Run the one-off load only when this file is executed as a script,
# not when it is imported by another module.
if __name__ == "__main__":
    setup()