AMR-Guard / scripts /ingest_data.py
ghitaben's picture
Med-I-C -> AMR-Guard
85020ae
import os
import chromadb
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# 1. Setup Chroma Persistence
CHROMA_PATH = "data/chroma_db"
DATA_PATH = "data/AMR-Guard/raw"
def ingest_medical_data():
# Persistent client for the competition (Kaggle/Local)
client = chromadb.PersistentClient(path=CHROMA_PATH)
# Define the embedding model
model_name = "all-MiniLM-L6-v2"
ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)
# 2. Ingest Guidelines (PDFs)
# We create a specific collection for cleaner retrieval
guideline_col = client.get_or_create_collection(name="antibiotic_guidelines", embedding_function=ef)
loader = DirectoryLoader(f"{DATA_PATH}/guidelines", glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
# 1000/100 split as discussed for clinical coherence
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)
# Adding to Chroma
guideline_col.add(
ids=[f"guideline_{i}" for i in range(len(chunks))],
documents=[c.page_content for c in chunks],
metadatas=[c.metadata for c in chunks]
)
print(f"Successfully ingested {len(chunks)} guideline chunks.")
if __name__ == "__main__":
ingest_medical_data()