| | """
|
| | Build Vector Database for Lab Report Decoder
|
| | Uses Hugging Face sentence-transformers for embeddings
|
| | """
|
| |
|
| | import os
|
| | from pathlib import Path
|
| | from sentence_transformers import SentenceTransformer
|
| | import chromadb
|
| | from chromadb.config import Settings
|
| | import glob
|
| |
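
# This script expects the sentence-transformers and chromadb packages to be
# installed, e.g.: pip install sentence-transformers chromadb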


def load_documents_from_directory(directory: str) -> list:
    """Load all text files from a directory"""
    documents = []

    if not os.path.exists(directory):
        print(f"⚠️ Directory not found: {directory}")
        return documents

    txt_files = glob.glob(os.path.join(directory, "**", "*.txt"), recursive=True)

    for filepath in txt_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
            if content.strip():
                documents.append({
                    'content': content,
                    'source': filepath,
                    'filename': os.path.basename(filepath)
                })
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {filepath}: {e}")

    return documents
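
# load_documents_from_directory searches recursively, so nested subfolders
# (e.g. a hypothetical data/lab_markers/cbc/hemoglobin.txt) are included too.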


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
    """Split text into overlapping chunks, preferring sentence/line breaks"""
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # For all but the final chunk, try to break at a sentence or line
        # boundary instead of mid-word.
        if end < len(text):
            last_period = chunk.rfind('.')
            last_newline = chunk.rfind('\n')
            break_point = max(last_period, last_newline)

            # Only use the boundary if it keeps at least half the chunk.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        if chunk.strip():
            chunks.append(chunk.strip())
        if end >= len(text):
            break
        start = end - overlap

    return chunks
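
# Illustrative example: with the defaults, a 2,500-character document yields
# roughly three chunks, and consecutive chunks overlap by up to 200 characters
# so a sentence that straddles a chunk boundary is still retrievable intact.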


def build_knowledge_base():
    """Build the vector database from medical documents"""

    print("📚 Loading medical documents...")

    data_dir = 'data/'
    all_documents = []

    if not os.path.exists(data_dir):
        print(f"⚠️ Creating data directory: {data_dir}")
        os.makedirs(data_dir, exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'lab_markers'), exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'nutrition'), exist_ok=True)
        os.makedirs(os.path.join(data_dir, 'conditions'), exist_ok=True)
        print("⚠️ Please add medical reference documents to the data/ folder")
        return None

    for subdir in ['lab_markers', 'nutrition', 'conditions']:
        subdir_path = os.path.join(data_dir, subdir)
        docs = load_documents_from_directory(subdir_path)
        all_documents.extend(docs)

    if not all_documents:
        print("⚠️ No documents found in data/ directory")
        print("Please add .txt files with medical information")
        return None

    print(f"✅ Loaded {len(all_documents)} documents")

    print("✂️ Splitting documents into chunks...")
    all_chunks = []
    all_metadata = []

    for doc in all_documents:
        chunks = chunk_text(doc['content'], chunk_size=1000, overlap=200)
        for i, chunk in enumerate(chunks):
            all_chunks.append(chunk)
            all_metadata.append({
                'source': doc['source'],
                'filename': doc['filename'],
                'chunk_id': i
            })

    print(f"✅ Created {len(all_chunks)} text chunks")

    print("🧠 Loading embedding model (this may take a moment)...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Embedding model loaded")

    print("🔄 Creating embeddings (this may take a few minutes)...")
    embeddings = embedding_model.encode(
        all_chunks,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    print(f"✅ Created {len(embeddings)} embeddings")
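
    # all-MiniLM-L6-v2 encodes each chunk as a 384-dimensional vector, so
    # `embeddings` has shape (len(all_chunks), 384).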

    print("💾 Building ChromaDB vector store...")

    db_path = "./chroma_db"
    client = chromadb.PersistentClient(path=db_path)

    # Rebuild from scratch: drop any existing collection with this name.
    try:
        client.delete_collection("lab_reports")
        print("🗑️ Deleted existing collection")
    except Exception:
        # The collection may not exist yet; that is fine.
        pass

    collection = client.create_collection(
        name="lab_reports",
        metadata={"description": "Medical lab report information"}
    )
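
    # Note: a Chroma collection defaults to L2 distance; cosine similarity can
    # be requested at creation time via metadata={"hnsw:space": "cosine"}
    # (left at the default here).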

    # Insert in batches so each collection.add() call stays small.
    batch_size = 100
    for i in range(0, len(all_chunks), batch_size):
        batch_chunks = all_chunks[i:i + batch_size]
        batch_embeddings = embeddings[i:i + batch_size].tolist()
        batch_ids = [f"doc_{j}" for j in range(i, i + len(batch_chunks))]
        batch_metadata = all_metadata[i:i + batch_size]

        collection.add(
            documents=batch_chunks,
            embeddings=batch_embeddings,
            ids=batch_ids,
            metadatas=batch_metadata
        )

    print("✅ Vector database built successfully!")
    print(f"📁 Database location: {db_path}")
    print(f"📊 Total vectors: {len(all_chunks)}")

    return collection


def test_retrieval(collection):
    """Test the retrieval system"""
    if collection is None:
        print("\n⚠️ No collection to test")
        return

    print("\n🔍 Testing retrieval system...")

    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    test_queries = [
        "What does low hemoglobin mean?",
        "What foods are high in iron?",
        "Normal range for glucose"
    ]

    for query in test_queries:
        print(f"\n❓ Query: {query}")

        query_embedding = embedding_model.encode(query).tolist()

        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=2
        )
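
        # query() returns parallel lists per input query: for example,
        # results['documents'][0] holds the matching chunks for the first
        # (and here only) query embedding.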

        if results['documents'] and results['documents'][0]:
            print(f"  ✅ Found {len(results['documents'][0])} relevant documents")
            print(f"  📄 Top result preview: {results['documents'][0][0][:150]}...")
        else:
            print("  ❌ No results found")


if __name__ == "__main__":
    print("🚀 Building Lab Report Decoder Vector Database\n")

    collection = build_knowledge_base()

    if collection is not None:
        test_retrieval(collection)
        print("\n🎉 Setup complete! You can now run the Flask application.")
    else:
        print("\n⚠️ Please add medical documents to the data/ folder and run again.")