"""
Build Vector Database for Lab Report Decoder
Uses Hugging Face sentence-transformers for embeddings
"""
import os
from pathlib import Path
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import glob
def load_documents_from_directory(directory: str) -> list:
    """Load all non-empty .txt files found (recursively) under *directory*.

    Args:
        directory: Root folder to scan for .txt files.

    Returns:
        A list of dicts with keys 'content' (full file text), 'source'
        (full path) and 'filename' (base name). Empty list when the
        directory is missing or contains no usable files.
    """
    documents = []
    if not os.path.exists(directory):
        print(f"⚠️ Directory not found: {directory}")
        return documents
    # Recursively collect every .txt file below the root.
    txt_files = glob.glob(os.path.join(directory, "**", "*.txt"), recursive=True)
    for filepath in txt_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            # Skip files that are empty or whitespace-only.
            if content.strip():
                documents.append({
                    'content': content,
                    'source': filepath,
                    'filename': os.path.basename(filepath)
                })
        except Exception as e:
            # Best-effort: report unreadable files but keep scanning.
            print(f"Error reading {filepath}: {e}")
    return documents
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
    """Split *text* into overlapping chunks of at most *chunk_size* chars.

    A chunk is cut back to the last sentence ('.') or line ('\\n')
    boundary inside the window, but only when that boundary falls past
    the midpoint -- otherwise chunks would become too small. Adjacent
    chunks share *overlap* characters of context.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters.
        overlap: Characters of context repeated between adjacent chunks.

    Returns:
        List of non-empty, stripped chunk strings ([] for empty input).
    """
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        # Only adjust the cut point when this is not the final chunk.
        if end < len(text):
            break_point = max(chunk.rfind('.'), chunk.rfind('\n'))
            if break_point > chunk_size * 0.5:  # boundary must be past midpoint
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1
        chunk = chunk.strip()
        # Fix: the original appended '' for whitespace-only windows,
        # polluting the index with empty chunks -- skip them instead.
        if chunk:
            chunks.append(chunk)
        start = end - overlap
    return chunks
def build_knowledge_base():
    """Build the ChromaDB vector store from documents under data/.

    Loads .txt files from the expected subfolders, splits them into
    overlapping chunks, embeds the chunks with a sentence-transformers
    model and persists them into a local ChromaDB collection named
    "lab_reports" (any existing collection is rebuilt from scratch).

    Returns:
        The populated ChromaDB collection, or None when no source
        documents are available yet.
    """
    print("📚 Loading medical documents...")
    data_dir = 'data/'
    all_documents = []

    # First run: create the expected folder layout and bail out so the
    # user can drop reference documents in before building the index.
    if not os.path.exists(data_dir):
        print(f"⚠️ Creating data directory: {data_dir}")
        os.makedirs(data_dir, exist_ok=True)
        for sub in ('lab_markers', 'nutrition', 'conditions'):
            os.makedirs(os.path.join(data_dir, sub), exist_ok=True)
        print("⚠️ Please add medical reference documents to the data/ folder")
        return None

    # Load from all known subdirectories.
    for subdir in ['lab_markers', 'nutrition', 'conditions']:
        docs = load_documents_from_directory(os.path.join(data_dir, subdir))
        all_documents.extend(docs)

    if not all_documents:
        print("⚠️ No documents found in data/ directory")
        print("Please add .txt files with medical information")
        return None
    print(f"✅ Loaded {len(all_documents)} documents")

    # Chunk every document, remembering where each chunk came from so
    # retrieval results can be traced back to their source file.
    print("✂️ Splitting documents into chunks...")
    all_chunks = []
    all_metadata = []
    for doc in all_documents:
        chunks = chunk_text(doc['content'], chunk_size=1000, overlap=200)
        for i, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_metadata.append({
                'source': doc['source'],
                'filename': doc['filename'],
                'chunk_id': i
            })
    print(f"✅ Created {len(all_chunks)} text chunks")

    print("🧠 Loading embedding model (this may take a moment)...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Embedding model loaded")

    print("🔢 Creating embeddings (this may take a few minutes)...")
    embeddings = embedding_model.encode(
        all_chunks,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    print(f"✅ Created {len(embeddings)} embeddings")

    print("💾 Building ChromaDB vector store...")
    db_path = "./chroma_db"
    client = chromadb.PersistentClient(path=db_path)

    # Rebuild from scratch: drop any stale collection first.
    try:
        client.delete_collection("lab_reports")
        print("🗑️ Deleted existing collection")
    except Exception:
        # Collection did not exist yet -- nothing to delete.
        pass

    collection = client.create_collection(
        name="lab_reports",
        metadata={"description": "Medical lab report information"}
    )

    # Insert in batches to keep individual add() payloads small.
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        batch_chunks = all_chunks[i:i + batch_size]
        collection.add(
            documents=batch_chunks,
            embeddings=embeddings[i:i + batch_size].tolist(),
            ids=[f"doc_{j}" for j in range(i, i + len(batch_chunks))],
            metadatas=all_metadata[i:i + batch_size]
        )

    print("✅ Vector database built successfully!")
    print(f"📁 Database location: {db_path}")
    print(f"📊 Total vectors: {len(all_chunks)}")
    return collection
def test_retrieval(collection):
    """Run a few sample queries against *collection* and print results.

    Args:
        collection: A ChromaDB collection as returned by
            build_knowledge_base(), or None (prints a warning and returns).
    """
    if collection is None:
        print("\n⚠️ No collection to test")
        return

    print("\n🔎 Testing retrieval system...")
    # Queries must be embedded with the same model used at index time.
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    test_queries = [
        "What does low hemoglobin mean?",
        "What foods are high in iron?",
        "Normal range for glucose"
    ]
    for query in test_queries:
        print(f"\n🔍 Query: {query}")
        query_embedding = embedding_model.encode(query).tolist()
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=2
        )
        # 'documents' is a list-of-lists (one inner list per query), so
        # also check the inner list -- [[]] is truthy but holds no hits.
        if results and results['documents'] and results['documents'][0]:
            print(f"  ✅ Found {len(results['documents'][0])} relevant documents")
            print(f"  📄 Top result preview: {results['documents'][0][0][:150]}...")
        else:
            print("  ❌ No results found")
if __name__ == "__main__":
    print("🚀 Building Lab Report Decoder Vector Database\n")
    # Build the database, then smoke-test retrieval if it was created.
    collection = build_knowledge_base()
    if collection:
        test_retrieval(collection)
        print("\n🎉 Setup complete! You can now run the Flask application.")
    else:
        print("\n⚠️ Please add medical documents to the data/ folder and run again.")