| | """
|
| | Build Vector Database for Lab Report Decoder
|
| | Uses Hugging Face sentence-transformers for embeddings
|
| | """
|
| |
|
| | import os
|
| | from pathlib import Path
|
| | from sentence_transformers import SentenceTransformer
|
| | import chromadb
|
| | from chromadb.config import Settings
|
| | import glob
|
| |
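
# This script expects the sentence-transformers and chromadb packages to be
# installed, e.g.: pip install sentence-transformers chromadb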


def load_documents_from_directory(directory: str) -> list:
    """Load all text files from a directory"""
    documents = []

    if not os.path.exists(directory):
        print(f"⚠️ Directory not found: {directory}")
        return documents

    txt_files = glob.glob(os.path.join(directory, "**", "*.txt"), recursive=True)

    for filepath in txt_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            if content.strip():
                documents.append({
                    'content': content,
                    'source': filepath,
                    'filename': os.path.basename(filepath)
                })
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {filepath}: {e}")

    return documents
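
# load_documents_from_directory searches recursively, so nested subfolders
# (e.g. a hypothetical data/lab_markers/cbc/hemoglobin.txt) are included too.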


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
    """Split text into overlapping chunks, preferring sentence/line breaks"""
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # For all but the final chunk, try to break at a sentence or line
        # boundary instead of mid-word.
        if end < len(text):
            last_period = chunk.rfind('.')
            last_newline = chunk.rfind('\n')
            break_point = max(last_period, last_newline)

            # Only use the boundary if it keeps at least half the chunk.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        if chunk.strip():
            chunks.append(chunk.strip())
        if end >= len(text):
            break
        start = end - overlap

    return chunks
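
# Illustrative example: with the defaults, a 2,500-character document yields
# roughly three chunks, and consecutive chunks overlap by up to 200 characters
# so a sentence that straddles a chunk boundary is still retrievable intact.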


def build_knowledge_base():
    """Build the vector database from medical documents"""

    print("📚 Loading medical documents...")

    data_dir = 'data/'
    all_documents = []

    if not os.path.exists(data_dir):
        print(f"⚠️ Creating data directory: {data_dir}")
        os.makedirs(data_dir, exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'lab_markers'), exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'nutrition'), exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'conditions'), exist_ok=True)
        print("⚠️ Please add medical reference documents to the data/ folder")
        return None

    for subdir in ['lab_markers', 'nutrition', 'conditions']:
        subdir_path = os.path.join(data_dir, subdir)
        docs = load_documents_from_directory(subdir_path)
        all_documents.extend(docs)

    if not all_documents:
        print("⚠️ No documents found in data/ directory")
        print("Please add .txt files with medical information")
        return None

    print(f"✅ Loaded {len(all_documents)} documents")

    print("✂️ Splitting documents into chunks...")
    all_chunks = []
    all_metadata = []

    for doc in all_documents:
        chunks = chunk_text(doc['content'], chunk_size=1000, overlap=200)
        for i, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_metadata.append({
                'source': doc['source'],
                'filename': doc['filename'],
                'chunk_id': i
            })

    print(f"✅ Created {len(all_chunks)} text chunks")

    print("🧠 Loading embedding model (this may take a moment)...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Embedding model loaded")

    print("🔄 Creating embeddings (this may take a few minutes)...")
    embeddings = embedding_model.encode(
        all_chunks,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    print(f"✅ Created {len(embeddings)} embeddings")
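
    # all-MiniLM-L6-v2 encodes each chunk as a 384-dimensional vector, so
    # `embeddings` has shape (len(all_chunks), 384).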

    print("💾 Building ChromaDB vector store...")

    db_path = "./chroma_db"
    client = chromadb.PersistentClient(path=db_path)

    # Rebuild from scratch: drop any existing collection with this name.
    try:
        client.delete_collection("lab_reports")
        print("🗑️ Deleted existing collection")
    except Exception:
        # The collection may not exist yet; that is fine.
        pass

    collection = client.create_collection(
        name="lab_reports",
        metadata={"description": "Medical lab report information"}
    )
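
    # Note: a Chroma collection defaults to L2 distance; cosine similarity can
    # be requested at creation time via metadata={"hnsw:space": "cosine"}
    # (left at the default here).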

    # Insert in batches so each collection.add() call stays small.
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        batch_chunks = all_chunks[i:i + batch_size]
        batch_embeddings = embeddings[i:i + batch_size].tolist()
        batch_ids = [f"doc_{j}" for j in range(i, i + len(batch_chunks))]
        batch_metadata = all_metadata[i:i + batch_size]

        collection.add(
            documents=batch_chunks,
            embeddings=batch_embeddings,
            ids=batch_ids,
            metadatas=batch_metadata
        )

    print("✅ Vector database built successfully!")
    print(f"📁 Database location: {db_path}")
    print(f"📊 Total vectors: {len(all_chunks)}")

    return collection


def test_retrieval(collection):
    """Test the retrieval system"""
    if collection is None:
        print("\n⚠️ No collection to test")
        return

    print("\n🔍 Testing retrieval system...")

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    test_queries = [
        "What does low hemoglobin mean?",
        "What foods are high in iron?",
        "Normal range for glucose"
    ]

    for query in test_queries:
        print(f"\n❓ Query: {query}")

        query_embedding = embedding_model.encode(query).tolist()

        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=2
        )
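
        # query() returns parallel lists per input query: for example,
        # results['documents'][0] holds the matching chunks for the first
        # (and here only) query embedding.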

        if results['documents'] and results['documents'][0]:
            print(f"  ✅ Found {len(results['documents'][0])} relevant documents")
            print(f"  📄 Top result preview: {results['documents'][0][0][:150]}...")
        else:
            print("  ❌ No results found")


if __name__ == "__main__":
    print("🚀 Building Lab Report Decoder Vector Database\n")

    collection = build_knowledge_base()

    if collection is not None:
        test_retrieval(collection)
        print("\n🎉 Setup complete! You can now run the Flask application.")
    else:
        print("\n⚠️ Please add medical documents to the data/ folder and run again.")