File size: 1,888 Bytes
65562f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import warnings
# Silence UserWarning and FutureWarning process-wide. The filters are
# installed before the remaining imports, so warnings of these categories
# emitted while those modules load are ignored as well.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import zipfile
import torch
from transformers import AutoModel, AutoTokenizer
import chromadb

# Constants
# Directory containing this script; the vector store and its archive live here.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Prefer the GPU when torch reports CUDA as available, otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Checkpoint identifier passed to the tokenizer/model loaders.
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"

# On-disk ChromaDB directory and the zipped copy it is restored from.
DB_DIR = os.path.join(_SCRIPT_DIR, "chromadb_store")
ZIP_PATH = os.path.join(_SCRIPT_DIR, "chromadb_store.zip")

# Step 1: Unzip the vector store if not already present
# The archive is extracted into the parent of DB_DIR (the script's own
# directory) rather than the current working directory, so the files land
# where the existence check below looks for them no matter where the script
# is launched from.
if not os.path.exists(os.path.join(DB_DIR, "chroma.sqlite3")):
    print("🔓 Unzipping prebuilt ChromaDB store...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        # NOTE(review): assumes the archive contains a top-level
        # "chromadb_store/" folder, as the original extractall(".") implied
        # when run from the script directory — confirm.
        zip_ref.extractall(os.path.dirname(DB_DIR))
    print("Vector store unzipped and ready.")
else:
    print("Vector store already present. Skipping unzip.")

# Step 2: Connect to persistent ChromaDB
# Open the on-disk store at DB_DIR, then fetch (or create, if missing) the two
# collections in the same order as before: discharge notes, then trials.
client = chromadb.PersistentClient(path=DB_DIR)
discharge_collection, trials_collection = (
    client.get_or_create_collection(collection_name)
    for collection_name in ("discharge_notes", "clinical_trials")
)

# Step 3: Load BioBERT for embedding
# Tokenizer and encoder are both loaded from the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Move the encoder to DEVICE and switch it to evaluation mode in one chain;
# both calls return the module itself.
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# Step 4: Embedding function
def get_embedding(text: str):
    """Embed ``text`` with the module-level tokenizer and model.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model with gradient tracking disabled, and the last-layer hidden state at
    sequence position 0 is returned.

    Args:
        text: Input to embed.

    Returns:
        The position-0 hidden state, with size-1 dimensions squeezed out,
        converted to plain Python lists via ``tolist()`` — presumably a flat
        list of floats for a single string; confirm against callers.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    # The model's inputs must live on the same device as the model.
    on_device = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze().cpu().numpy().tolist()

# Final check
# Report how many records each collection currently holds. Each count is
# queried immediately before its line is printed, after the header.
print("📦 ChromaDB Status:")
_discharge_total = discharge_collection.count()
print(f"  - Discharge Notes Loaded: {_discharge_total}")
_trials_total = trials_collection.count()
print(f"  - Clinical Trials Loaded: {_trials_total}")