| import os | |
| import json | |
| from datasets import load_dataset | |
def setup():
    """Populate the ChromaDB collection with clinical notes on first run.

    If the collection already contains documents, nothing is loaded and the
    current document count is printed. Otherwise the first 200 rows of the
    ``AGBonnet/augmented-clinical-notes`` train split are fetched and each
    row's ``full_note`` is added as one document, with ``idx``, ``age``,
    ``sex`` and ``visit_motivation`` stored as string metadata.

    Loading is best-effort: a row that cannot be parsed or stored is skipped,
    but every skip is reported instead of being silently dropped.
    """
    # Imported here rather than at module level, so ``vector_store`` is only
    # loaded when setup() actually runs.
    from vector_store import collection

    existing = collection.count()
    if existing != 0:
        print(f"β ChromaDB already has {existing} notes")
        return

    print("Loading dataset into ChromaDB...")
    dataset = load_dataset(
        "AGBonnet/augmented-clinical-notes",
        split="train[:200]",
    )
    df = dataset.to_pandas()

    success = 0
    skipped = 0
    for position, row in df.iterrows():
        try:
            # The summary is either a JSON string or an already-decoded
            # mapping; normalise to the decoded form.
            summary = row["summary"]
            if isinstance(summary, str):
                summary = json.loads(summary)

            patient_info = summary.get("patient information", {})
            if not isinstance(patient_info, dict):
                # Malformed patient block: keep the note and fall back to
                # the same "Unknown" defaults used for missing keys.
                patient_info = {}

            note_id = str(row["idx"])
            collection.add(
                documents=[row["full_note"]],
                metadatas=[{
                    "idx": note_id,
                    "age": str(patient_info.get("age", "Unknown")),
                    "sex": str(patient_info.get("sex", "Unknown")),
                    # Cap the free-text field at 200 characters.
                    "visit_motivation": str(
                        summary.get("visit motivation", "Unknown")
                    )[:200],
                }],
                ids=[note_id],
            )
        except Exception as exc:
            # One bad row must not abort the whole load, but make the
            # failure visible so missing notes can be investigated.
            skipped += 1
            print(f"Skipping row {position}: {exc!r}")
            continue
        success += 1

    print(f"β Loaded {success} notes into ChromaDB")
    if skipped:
        print(f"Skipped {skipped} rows that could not be loaded")
# Run the one-time load only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    setup()