# Extraction metadata (was: file size 1,888 bytes, commit 65562f6, 47 lines)
import warnings

# Silence noisy UserWarning/FutureWarning chatter from transformers/torch at
# import time; must run before those libraries are imported below.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
import os
import zipfile

import torch
from transformers import AutoModel, AutoTokenizer

import chromadb
# Constants
# Use the GPU when available; everything below falls back to CPU otherwise.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# BioBERT v1.1 (cased): BERT pre-trained on biomedical text, used for embeddings.
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
# Persistent Chroma store and its zipped distribution, both anchored to this
# script's directory so the paths are independent of the current working dir.
DB_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb_store")
ZIP_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chromadb_store.zip")
# Step 1: Unzip the prebuilt vector store if not already present.
# The presence check uses DB_DIR (anchored to this script's directory), so the
# archive must be extracted there too — extracting to "." would unzip into the
# current working directory and miss DB_DIR whenever the script is launched
# from anywhere else, re-unzipping on every run.
if not os.path.exists(os.path.join(DB_DIR, "chroma.sqlite3")):
    print("🔓 Unzipping prebuilt ChromaDB store...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        # NOTE(review): assumes the archive contains a top-level
        # "chromadb_store/" folder — verify against how the zip was built.
        zip_ref.extractall(os.path.dirname(os.path.abspath(__file__)))
    print("Vector store unzipped and ready.")
else:
    print("Vector store already present. Skipping unzip.")
# Step 2: Connect to persistent ChromaDB
# Opens (or creates) the on-disk store at DB_DIR and grabs the two
# collections this pipeline uses; get_or_create is idempotent, so a fresh
# store simply starts with empty collections.
client = chromadb.PersistentClient(path=DB_DIR)
discharge_collection = client.get_or_create_collection("discharge_notes")
trials_collection = client.get_or_create_collection("clinical_trials")
# Step 3: Load BioBERT for embedding
# Downloads/loads the pretrained weights, moves the model to DEVICE, and
# switches to eval mode (disables dropout) since we only run inference.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()
# Step 4: Embedding function
def get_embedding(text: str):
    """Embed *text* with BioBERT and return the [CLS] vector as a list of floats.

    Uses the module-level ``tokenizer`` and ``model``; input is truncated to
    512 tokens and the tensors are moved to ``DEVICE`` before the forward pass.
    """
    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    )
    encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**encoded)
    # First position of the last hidden layer is the [CLS] token embedding.
    cls_vector = output.last_hidden_state[:, 0, :]
    return cls_vector.squeeze().cpu().numpy().tolist()
# Final check: report how many documents each collection currently holds.
# (The header had a needless f-prefix with no placeholders — plain literal now;
# printed output is byte-identical.)
print("📦 ChromaDB Status:")
print(f" - Discharge Notes Loaded: {discharge_collection.count()}")
print(f" - Clinical Trials Loaded: {trials_collection.count()}")