feat: implement hospital-specific customization pipeline with two-stage ANNOY retrieval
- Restructure pdf-version to customization/ for hospital-specific deployment
- Add customization_pipeline.py with two-stage retrieval (tag -> document -> chunk)
- Implement ANNOY indices for fast medical concept and chunk similarity search
- Add generate_embeddings.py for building hospital-specific embeddings
- Create test suite validating end-to-end pipeline functionality
- Add customization_requirements.txt with all necessary dependencies
- Update .gitignore to exclude rag_env/ virtual environment
This enables hospitals to deploy their own customized medical RAG systems
with private documents while maintaining the base medical knowledge framework.
- .gitignore +1 -0
- customization/customization_pipeline.py +156 -0
- {src/pdf-version → customization}/generate_embeddings.py +14 -18
- {src/pdf-version → customization/src}/__init__.py +0 -0
- customization/src/data/__init__.py +23 -0
- {src/pdf-version → customization/src}/data/loaders.py +0 -0
- {src/pdf-version → customization/src}/data/pdf_processing.py +0 -0
- {src/pdf-version → customization/src}/demos/__init__.py +0 -0
- {src/pdf-version → customization/src}/demos/demo_runner.py +157 -4
- {src/pdf-version → customization/src}/indexing/__init__.py +0 -0
- customization/src/indexing/annoy_manager.py +392 -0
- {src/pdf-version → customization/src}/indexing/document_indexer.py +0 -0
- {src/pdf-version → customization/src}/indexing/embedding_creator.py +0 -0
- {src/pdf-version → customization/src}/indexing/storage.py +91 -2
- {src/pdf-version → customization/src}/models/__init__.py +0 -0
- {src/pdf-version → customization/src}/models/embedding_models.py +0 -0
- {src/pdf-version → customization/src}/rag/__init__.py +0 -0
- {src/pdf-version → customization/src}/rag/medical_rag_pipeline.py +0 -0
- {src/pdf-version → customization/src}/retrieval/__init__.py +0 -0
- {src/pdf-version → customization/src}/retrieval/chunk_retriever.py +177 -3
- {src/pdf-version → customization/src}/retrieval/document_retriever.py +207 -3
- {src/pdf-version → customization/src}/utils/__init__.py +0 -0
- {src/pdf-version → customization/src}/utils/helpers.py +0 -0
- customization/test/test_pipeline.py +117 -0
- customization_requirements.txt +188 -0
- src/pdf-version/data/__init__.py +0 -15
- src/pdf-version/main.py +0 -83
- src/pdf-version/oncall_ai.py +0 -55
.gitignore
CHANGED
@@ -1,6 +1,7 @@
 # 🔧 Virtual environments
 genAIvenv/
 .final_project_env/
+rag_env/
 .env
 .venv
 env/
customization/customization_pipeline.py
ADDED
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""Customization Pipeline - Hospital-Specific Document Retrieval
+
+This module provides the interface for hospital-specific document processing and retrieval.
+"""
+
+import sys
+from pathlib import Path
+from typing import List, Dict
+
+# Add src directory to Python path
+sys.path.insert(0, str(Path(__file__).parent / 'src'))
+
+# Import necessary modules
+from models.embedding_models import load_biomedbert_model
+from data.loaders import load_annotations
+from indexing.document_indexer import build_document_index
+from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
+from indexing.storage import save_document_system, load_document_system_with_annoy
+from retrieval.document_retriever import create_document_tag_mapping
+from retrieval.chunk_retriever import find_relevant_chunks_with_fallback
+
+
+def build_customization_embeddings():
+    """Build embeddings for the hospital-specific documents in the docs folder."""
+    print("🏥 Building hospital-specific embeddings...")
+
+    # Paths
+    base_path = Path(__file__).parent
+    docs_path = base_path / "docs"
+    processing_path = base_path / "processing"
+
+    # Load model and annotations
+    embedding_model = load_biomedbert_model()
+    annotations = load_annotations(file_path=str(processing_path / "mapping.json"))
+
+    if not annotations:
+        print("❌ Unable to load annotation data")
+        return False
+
+    # Build document index with chunks
+    print("📄 Processing documents...")
+    document_index = build_document_index(
+        annotations,
+        assets_dir=str(docs_path),
+        chunk_size=256,
+        chunk_overlap=25
+    )
+
+    # Create embeddings
+    print("🔢 Creating embeddings...")
+    tag_embeddings = create_tag_embeddings(embedding_model, document_index)
+    doc_tag_mapping = create_document_tag_mapping(document_index, tag_embeddings)
+    chunk_embeddings = create_chunk_embeddings(embedding_model, document_index)
+
+    # Save everything
+    print("💾 Saving to processing folder...")
+    save_document_system(
+        document_index,
+        tag_embeddings,
+        doc_tag_mapping,
+        chunk_embeddings,
+        output_dir=str(processing_path / "embeddings"),
+        build_annoy_indices=True
+    )
+
+    print("✅ Embeddings built successfully!")
+    return True
+
+
+def retrieve_document_chunks(query: str, top_k: int = 5) -> List[Dict]:
+    """Retrieve relevant document chunks using two-stage ANNOY retrieval.
+
+    Stage 1: Find relevant documents using tag embeddings (medical concepts)
+    Stage 2: Find relevant chunks within those documents using chunk embeddings
+
+    Args:
+        query: The search query
+        top_k: Number of chunks to retrieve
+
+    Returns:
+        List of dictionaries containing chunk information
+    """
+    # Load model and existing embeddings
+    embedding_model = load_biomedbert_model()
+
+    # Load from processing folder
+    processing_path = Path(__file__).parent / "processing"
+
+    # Load the saved system with ANNOY indices
+    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = \
+        load_document_system_with_annoy(
+            input_dir=str(processing_path / "embeddings"),
+            annoy_dir=str(processing_path / "indices")
+        )
+
+    if annoy_manager is None:
+        print("❌ Failed to load ANNOY manager")
+        return []
+
+    # Create query embedding
+    query_embedding = embedding_model.encode(query)
+
+    # Stage 1: Find relevant documents using tag ANNOY index
+    print(f"🔍 Stage 1: Finding relevant documents for query: '{query}'")
+    relevant_tags, tag_distances = annoy_manager.search_tags(
+        query_embedding=query_embedding,
+        n_neighbors=20,  # Get more tags to find diverse documents
+        include_distances=True
+    )
+
+    # Get documents that contain these relevant tags
+    relevant_docs = set()
+    for tag in relevant_tags[:10]:  # Use top 10 tags
+        for doc_name, doc_info in doc_tag_mapping.items():
+            if tag in doc_info['tags']:
+                relevant_docs.add(doc_name)
+
+    relevant_docs = list(relevant_docs)
+    print(f"✅ Found {len(relevant_docs)} relevant documents based on medical tags")
+
+    if not relevant_docs:
+        print("❌ No relevant documents found")
+        return []
+
+    # Stage 2: Find relevant chunks within these documents using chunk ANNOY index
+    print(f"🔍 Stage 2: Finding relevant chunks within {len(relevant_docs)} documents")
+    chunks, chunk_distances = annoy_manager.search_chunks_in_documents(
+        query_embedding=query_embedding,
+        document_names=relevant_docs,
+        n_neighbors=top_k,
+        include_distances=True
+    )
+
+    # Convert ANNOY distances to cosine similarities
+    from indexing.annoy_manager import convert_angular_distance_to_cosine_similarity
+
+    # Format results
+    results = []
+    for chunk, distance in zip(chunks, chunk_distances):
+        # Convert angular distance to cosine similarity
+        similarity = convert_angular_distance_to_cosine_similarity(distance)
+
+        results.append({
+            'document': chunk['document'],
+            'chunk_text': chunk['text'],
+            'score': similarity,
+            'metadata': {
+                'chunk_id': chunk['chunk_id'],
+                'start_char': chunk.get('start_char', 0),
+                'end_char': chunk.get('end_char', 0)
+            }
+        })
+
+    print(f"✅ Retrieved {len(results)} relevant chunks")
+    return results
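
The two functions above are the module's public surface. A minimal usage sketch, assuming the hospital PDFs and processing/mapping.json are already in place under customization/ (the query string is illustrative):

# Usage sketch for customization_pipeline.py
from customization_pipeline import build_customization_embeddings, retrieve_document_chunks

if build_customization_embeddings():  # one-time indexing step
    chunks = retrieve_document_chunks("sepsis initial management", top_k=5)
    for c in chunks:
        # Each result carries the source document, the chunk text, and a cosine-similarity score
        print(f"{c['document']} (score {c['score']:.3f}): {c['chunk_text'][:80]}")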
{src/pdf-version → customization}/generate_embeddings.py
RENAMED
@@ -1,18 +1,12 @@
 #!/usr/bin/env python3
 """
-
+Generate embeddings for hospital-specific documents
 """
 
-import sys
-from pathlib import Path
-
-# Add pdf-version directory to Python path
-sys.path.insert(0, str(Path(__file__).parent))
-
-from demos.demo_runner import build_medical_rag_system
+from customization_pipeline import build_customization_embeddings
 
 def main():
-    print("🚀 Starting to build
+    print("🚀 Starting to build hospital-specific embeddings...")
     print("📋 Configuration:")
     print("   - Chunk size: 256 tokens")
     print("   - Chunk overlap: 25 tokens (10%)")
@@ -22,17 +16,19 @@ def main():
     print("")
 
     try:
-
 
-        if
-            print("✅ Successfully built
-            print("📁 Generated files:")
-            print("   - document_index.json")
-            print("   - tag_embeddings.json")
-            print("   - document_tag_mapping.json")
-            print("   - chunk_embeddings.json")
+        success = build_customization_embeddings()
+
+        if success:
+            print("\n✅ Successfully built embeddings!")
+            print("📁 Generated files in processing folder:")
+            print("   - embeddings/document_index.json")
+            print("   - embeddings/tag_embeddings.json")
+            print("   - embeddings/document_tag_mapping.json")
+            print("   - embeddings/chunk_embeddings.json")
+            print("   - indices/annoy_metadata.json")
+            print("   - indices/*.ann files")
         else:
-            print("\n❌ Failed to build embeddings")
+            print("\n❌ Failed to build embeddings")
 
     except KeyboardInterrupt:
         print("\n⚠️ Process interrupted by user")
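
With the rename, generate_embeddings.py becomes a thin wrapper around customization_pipeline. Assuming the script keeps a standard `if __name__ == "__main__": main()` guard below the shown hunks, a hospital deployment would build its embeddings with `python customization/generate_embeddings.py`, producing the files the script lists.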
{src/pdf-version → customization/src}/__init__.py
RENAMED
File without changes
customization/src/data/__init__.py
ADDED
@@ -0,0 +1,23 @@
+"""Data loading and PDF processing."""
+
+from .loaders import load_annotations, filter_pdf_files
+
+# Try to import PDF processing functions, but handle missing dependencies gracefully
+try:
+    from .pdf_processing import (
+        extract_pdf_text,
+        extract_tables_from_pdf,
+        extract_images_ocr_from_pdf,
+        extract_pdf_content_enhanced
+    )
+    PDF_PROCESSING_AVAILABLE = True
+    __all__ = [
+        'load_annotations', 'filter_pdf_files',
+        'extract_pdf_text', 'extract_tables_from_pdf',
+        'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
+    ]
+except ImportError as e:
+    print(f"⚠️ PDF processing not available: {e}")
+    print("📄 Only working with existing embeddings")
+    PDF_PROCESSING_AVAILABLE = False
+    __all__ = ['load_annotations', 'filter_pdf_files']
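
The PDF_PROCESSING_AVAILABLE flag lets downstream code degrade gracefully when OCR/table dependencies are missing. A minimal sketch of a consumer (the file path and the extract_pdf_text call signature are assumptions):

from data import PDF_PROCESSING_AVAILABLE

if PDF_PROCESSING_AVAILABLE:
    from data import extract_pdf_text
    text = extract_pdf_text("docs/protocol.pdf")  # assumed signature: path in, text out
else:
    text = None  # operate on pre-built embeddings only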
{src/pdf-version → customization/src}/data/loaders.py
RENAMED
File without changes

{src/pdf-version → customization/src}/data/pdf_processing.py
RENAMED
File without changes

{src/pdf-version → customization/src}/demos/__init__.py
RENAMED
File without changes
{src/pdf-version → customization/src}/demos/demo_runner.py
RENAMED
@@ -6,9 +6,15 @@ from models.embedding_models import load_biomedbert_model
 from data.loaders import load_annotations
 from indexing.document_indexer import build_document_index
 from indexing.embedding_creator import create_tag_embeddings, create_chunk_embeddings
-from indexing.storage import save_document_system, load_document_system
-from retrieval.document_retriever import
-
+from indexing.storage import save_document_system, load_document_system, load_document_system_with_annoy
+from retrieval.document_retriever import (
+    create_document_tag_mapping, find_relevant_documents,
+    find_relevant_documents_with_fallback
+)
+from retrieval.chunk_retriever import (
+    find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag,
+    find_relevant_chunks_with_fallback
+)
 
 
 def build_medical_rag_system(enable_chunk_embeddings: bool = True):
@@ -135,4 +141,151 @@ def demo_all_strategies(query: str = "chest pain and shortness of breath"):
     for strategy, docs in results.items():
         print(f"{strategy:>10}: {len(docs)} documents selected")
 
-    return results
+    return results
+
+
+def demo_rag_query_with_annoy(query: str = "chest pain and shortness of breath",
+                              strategy: str = "top_p", use_chunks: bool = True, **kwargs):
+    """Demo RAG query functionality with ANNOY acceleration."""
+    print(f"\n🔍 Demo ANNOY Query: '{query}' (Strategy: {strategy}, Use chunks: {use_chunks})")
+    print("=" * 80)
+
+    # Try to load existing system with ANNOY
+    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = load_document_system_with_annoy()
+
+    if document_index is None:
+        print("📦 No saved system found, building new one...")
+        build_result = build_medical_rag_system(enable_chunk_embeddings=use_chunks)
+        if build_result[0] is None:
+            return
+        embedding_model, document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings = build_result
+
+        # Try to load ANNOY manager after building
+        from indexing.storage import load_annoy_manager
+        annoy_manager = load_annoy_manager()
+    else:
+        embedding_model = load_biomedbert_model()
+
+    print(f"🔧 ANNOY Status: {'Available' if annoy_manager else 'Not available (using fallback)'}")
+
+    # Find relevant documents using ANNOY-accelerated method with fallback
+    print(f"\n🔍 Finding relevant documents...")
+    import time
+    start_time = time.time()
+
+    relevant_docs = find_relevant_documents_with_fallback(
+        query, embedding_model, tag_embeddings, doc_tag_mapping,
+        annoy_manager=annoy_manager, strategy=strategy, **kwargs
+    )
+
+    doc_search_time = time.time() - start_time
+    print(f"⏱️ Document search completed in {doc_search_time:.4f}s")
+
+    if use_chunks and chunk_embeddings:
+        # Find relevant chunks using ANNOY-accelerated method with fallback
+        print(f"\n🔍 Finding relevant chunks within selected documents...")
+        start_time = time.time()
+
+        relevant_chunks = find_relevant_chunks_with_fallback(
+            query, embedding_model, relevant_docs, chunk_embeddings,
+            annoy_manager=annoy_manager, strategy=strategy,
+            top_chunks_per_doc=3, **kwargs
+        )
+
+        chunk_search_time = time.time() - start_time
+        print(f"⏱️ Chunk search completed in {chunk_search_time:.4f}s")
+
+        # Get chunks for RAG
+        rag_content = get_chunks_for_rag(relevant_chunks, max_chunks=10)
+        print(f"\n📖 Ready for RAG with {len(rag_content)} chunks")
+
+        total_time = doc_search_time + chunk_search_time
+        print(f"📊 Total search time: {total_time:.4f}s")
+
+    else:
+        # Get full documents for RAG
+        rag_content = get_documents_for_rag(relevant_docs, document_index)
+        print(f"\n📖 Ready for RAG with {len(rag_content)} full documents")
+        print(f"📊 Total search time: {doc_search_time:.4f}s")
+
+    return rag_content
+
+
+def demo_performance_comparison(query: str = "chest pain and shortness of breath"):
+    """Demo performance comparison between original and ANNOY methods."""
+    print(f"\n⚡ Performance Comparison Demo")
+    print("=" * 80)
+    print(f"Query: '{query}'")
+
+    # Load system with ANNOY
+    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = load_document_system_with_annoy()
+
+    if document_index is None:
+        print("❌ No saved system found")
+        return
+
+    embedding_model = load_biomedbert_model()
+    strategy = "top_p"
+    strategy_params = {"top_p": 0.8, "min_similarity": 0.3}
+
+    print(f"\n📄 Testing document retrieval performance...")
+
+    # Test original method
+    import time
+    start_time = time.time()
+    original_docs = find_relevant_documents(
+        query, embedding_model, tag_embeddings, doc_tag_mapping,
+        strategy=strategy, **strategy_params
+    )
+    original_time = time.time() - start_time
+
+    # Test ANNOY method (with fallback)
+    start_time = time.time()
+    annoy_docs = find_relevant_documents_with_fallback(
+        query, embedding_model, tag_embeddings, doc_tag_mapping,
+        annoy_manager=annoy_manager, strategy=strategy, **strategy_params
+    )
+    annoy_time = time.time() - start_time
+
+    # Results
+    print(f"📊 Original method: {len(original_docs)} docs in {original_time:.4f}s")
+    print(f"📊 ANNOY method: {len(annoy_docs)} docs in {annoy_time:.4f}s")
+
+    if annoy_time > 0:
+        speedup = original_time / annoy_time
+        print(f"⚡ Speedup: {speedup:.2f}x")
+
+    # Check result similarity
+    if original_docs and annoy_docs:
+        overlap = set(original_docs) & set(annoy_docs)
+        print(f"📊 Result overlap: {len(overlap)}/{len(original_docs)} documents")
+
+    # Test chunk retrieval if available
+    if chunk_embeddings and len(original_docs) > 0:
+        print(f"\n📄 Testing chunk retrieval performance...")
+        relevant_docs = original_docs[:2]  # Test with first 2 documents
+
+        # Original method
+        start_time = time.time()
+        original_chunks = find_relevant_chunks(
+            query, embedding_model, relevant_docs, chunk_embeddings,
+            strategy=strategy, **strategy_params
+        )
+        original_chunk_time = time.time() - start_time
+
+        # ANNOY method (with fallback)
+        start_time = time.time()
+        annoy_chunks = find_relevant_chunks_with_fallback(
+            query, embedding_model, relevant_docs, chunk_embeddings,
+            annoy_manager=annoy_manager, strategy=strategy, **strategy_params
+        )
+        annoy_chunk_time = time.time() - start_time
+
+        print(f"📊 Original chunks: {len(original_chunks)} chunks in {original_chunk_time:.4f}s")
+        print(f"📊 ANNOY chunks: {len(annoy_chunks)} chunks in {annoy_chunk_time:.4f}s")
+
+        if annoy_chunk_time > 0:
+            chunk_speedup = original_chunk_time / annoy_chunk_time
+            print(f"⚡ Chunk speedup: {chunk_speedup:.2f}x")
+
+    print(f"\n✅ Performance comparison completed!")
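
Both new demos are callable directly once a system has been built and saved; a short usage sketch (the strategy parameters are illustrative):

from demos.demo_runner import demo_rag_query_with_annoy, demo_performance_comparison

# Two-stage query with Top-P selection over similarity scores
demo_rag_query_with_annoy("chest pain and shortness of breath",
                          strategy="top_p", use_chunks=True,
                          top_p=0.8, min_similarity=0.3)

# Side-by-side timing of brute-force vs. ANNOY retrieval
demo_performance_comparison()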
{src/pdf-version → customization/src}/indexing/__init__.py
RENAMED
File without changes
customization/src/indexing/annoy_manager.py
ADDED
@@ -0,0 +1,392 @@
+"""ANNOY index management for PDF-based RAG system."""
+
+import os
+import json
+import numpy as np
+from typing import Dict, List, Optional, Tuple, Union
+from pathlib import Path
+import logging
+
+try:
+    from annoy import AnnoyIndex
+except ImportError:
+    raise ImportError("annoy package is required. Install with: pip install annoy")
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class AnnoyIndexManager:
+    """Manages ANNOY indices for fast vector similarity search."""
+
+    def __init__(self, embedding_dim: int = 1024, metric: str = 'angular'):
+        """
+        Initialize ANNOY index manager.
+
+        Args:
+            embedding_dim: Dimension of embeddings (1024 for BGE Large Medical)
+            metric: Distance metric ('angular' for cosine similarity, 'euclidean', 'manhattan', 'hamming', 'dot')
+        """
+        self.embedding_dim = embedding_dim
+        self.metric = metric
+        self.tag_index = None
+        self.chunk_index = None
+        self.tag_to_id_mapping = {}
+        self.id_to_tag_mapping = {}
+        self.chunk_to_id_mapping = {}
+        self.id_to_chunk_mapping = {}
+
+        logger.info(f"Initialized AnnoyIndexManager: dim={embedding_dim}, metric={metric}")
+
+    def build_tag_index(self, tag_embeddings: Dict[str, np.ndarray], n_trees: int = 50) -> AnnoyIndex:
+        """
+        Build ANNOY index for tag embeddings.
+
+        Args:
+            tag_embeddings: Dictionary mapping tags to their embeddings
+            n_trees: Number of trees (more trees = better precision, slower build)
+
+        Returns:
+            Built ANNOY index
+        """
+        logger.info(f"Building tag ANNOY index with {len(tag_embeddings)} tags...")
+
+        # Create index
+        self.tag_index = AnnoyIndex(self.embedding_dim, self.metric)
+
+        # Create mappings
+        self.tag_to_id_mapping = {}
+        self.id_to_tag_mapping = {}
+
+        # Add embeddings to index
+        for tag_id, (tag, embedding) in enumerate(tag_embeddings.items()):
+            self.tag_index.add_item(tag_id, embedding)
+            self.tag_to_id_mapping[tag] = tag_id
+            self.id_to_tag_mapping[tag_id] = tag
+
+        # Build index
+        logger.info(f"Building index with {n_trees} trees...")
+        self.tag_index.build(n_trees)
+
+        logger.info(f"✅ Tag ANNOY index built successfully: {len(tag_embeddings)} tags")
+        return self.tag_index
+
+    def build_chunk_index(self, chunk_embeddings: Dict[str, List[Dict]], n_trees: int = 50) -> AnnoyIndex:
+        """
+        Build ANNOY index for chunk embeddings.
+
+        Args:
+            chunk_embeddings: Dictionary mapping document names to lists of chunk dictionaries
+            n_trees: Number of trees
+
+        Returns:
+            Built ANNOY index
+        """
+        # Count total chunks
+        total_chunks = sum(len(chunks) for chunks in chunk_embeddings.values())
+        logger.info(f"Building chunk ANNOY index with {total_chunks} chunks...")
+
+        # Create index
+        self.chunk_index = AnnoyIndex(self.embedding_dim, self.metric)
+
+        # Create mappings
+        self.chunk_to_id_mapping = {}
+        self.id_to_chunk_mapping = {}
+
+        chunk_id = 0
+        for doc_name, chunks in chunk_embeddings.items():
+            for chunk in chunks:
+                # Create unique chunk identifier
+                chunk_key = f"{doc_name}#{chunk['chunk_id']}"
+
+                # Add to index
+                self.chunk_index.add_item(chunk_id, chunk['embedding'])
+
+                # Create mappings
+                self.chunk_to_id_mapping[chunk_key] = chunk_id
+                self.id_to_chunk_mapping[chunk_id] = {
+                    'document': doc_name,
+                    'chunk_id': chunk['chunk_id'],
+                    'text': chunk['text'],
+                    'start_char': chunk.get('start_char', 0),
+                    'end_char': chunk.get('end_char', len(chunk['text'])),
+                    'token_count': chunk.get('token_count', len(chunk['text'].split())),
+                    'chunk_key': chunk_key
+                }
+
+                chunk_id += 1
+
+        # Build index
+        logger.info(f"Building chunk index with {n_trees} trees...")
+        self.chunk_index.build(n_trees)
+
+        logger.info(f"✅ Chunk ANNOY index built successfully: {total_chunks} chunks")
+        return self.chunk_index
+
+    def save_indices(self, output_dir: Union[str, Path]):
+        """
+        Save ANNOY indices and mappings to disk.
+
+        Args:
+            output_dir: Directory to save indices
+        """
+        output_dir = Path(output_dir)
+        # Save indices at the same level as embeddings, not inside embeddings
+        indices_dir = output_dir.parent / 'indices'
+        indices_dir.mkdir(exist_ok=True)
+
+        # Save tag index
+        if self.tag_index is not None:
+            tag_index_path = indices_dir / 'tag_embeddings.ann'
+            self.tag_index.save(str(tag_index_path))
+
+            # Save tag mappings
+            tag_mappings_path = indices_dir / 'tag_mappings.json'
+            with open(tag_mappings_path, 'w', encoding='utf-8') as f:
+                json.dump({
+                    'tag_to_id': self.tag_to_id_mapping,
+                    'id_to_tag': self.id_to_tag_mapping
+                }, f, indent=2, ensure_ascii=False)
+
+            logger.info(f"✅ Tag index saved: {tag_index_path}")
+
+        # Save chunk index
+        if self.chunk_index is not None:
+            chunk_index_path = indices_dir / 'chunk_embeddings.ann'
+            self.chunk_index.save(str(chunk_index_path))
+
+            # Save chunk mappings
+            chunk_mappings_path = indices_dir / 'chunk_mappings.json'
+            with open(chunk_mappings_path, 'w', encoding='utf-8') as f:
+                json.dump({
+                    'chunk_to_id': self.chunk_to_id_mapping,
+                    'id_to_chunk': self.id_to_chunk_mapping
+                }, f, indent=2, ensure_ascii=False)
+
+            logger.info(f"✅ Chunk index saved: {chunk_index_path}")
+
+        # Save index metadata
+        metadata_path = indices_dir / 'annoy_metadata.json'
+        with open(metadata_path, 'w', encoding='utf-8') as f:
+            json.dump({
+                'embedding_dim': self.embedding_dim,
+                'metric': self.metric,
+                'tag_index_exists': self.tag_index is not None,
+                'chunk_index_exists': self.chunk_index is not None,
+                'num_tags': len(self.tag_to_id_mapping),
+                'num_chunks': len(self.chunk_to_id_mapping)
+            }, f, indent=2)
+
+        logger.info(f"✅ ANNOY indices saved to: {indices_dir}")
+
+    def load_indices(self, input_dir: Union[str, Path]) -> bool:
+        """
+        Load ANNOY indices and mappings from disk.
+
+        Args:
+            input_dir: Directory containing saved indices
+
+        Returns:
+            True if successfully loaded, False otherwise
+        """
+        input_dir = Path(input_dir)
+        # Load indices from the same level as embeddings, not inside embeddings
+        indices_dir = input_dir.parent / 'indices'
+
+        if not indices_dir.exists():
+            logger.warning(f"Indices directory not found: {indices_dir}")
+            return False
+
+        try:
+            # Load metadata
+            metadata_path = indices_dir / 'annoy_metadata.json'
+            if metadata_path.exists():
+                with open(metadata_path, 'r', encoding='utf-8') as f:
+                    metadata = json.load(f)
+                    self.embedding_dim = metadata['embedding_dim']
+                    self.metric = metadata['metric']
+                    logger.info(f"Loaded metadata: dim={self.embedding_dim}, metric={self.metric}")
+
+            # Load tag index
+            tag_index_path = indices_dir / 'tag_embeddings.ann'
+            tag_mappings_path = indices_dir / 'tag_mappings.json'
+
+            if tag_index_path.exists() and tag_mappings_path.exists():
+                self.tag_index = AnnoyIndex(self.embedding_dim, self.metric)
+                self.tag_index.load(str(tag_index_path))
+
+                with open(tag_mappings_path, 'r', encoding='utf-8') as f:
+                    mappings = json.load(f)
+                    self.tag_to_id_mapping = mappings['tag_to_id']
+                    self.id_to_tag_mapping = {int(k): v for k, v in mappings['id_to_tag'].items()}
+
+                logger.info(f"✅ Tag index loaded: {len(self.tag_to_id_mapping)} tags")
+
+            # Load chunk index
+            chunk_index_path = indices_dir / 'chunk_embeddings.ann'
+            chunk_mappings_path = indices_dir / 'chunk_mappings.json'
+
+            if chunk_index_path.exists() and chunk_mappings_path.exists():
+                self.chunk_index = AnnoyIndex(self.embedding_dim, self.metric)
+                self.chunk_index.load(str(chunk_index_path))
+
+                with open(chunk_mappings_path, 'r', encoding='utf-8') as f:
+                    mappings = json.load(f)
+                    self.chunk_to_id_mapping = mappings['chunk_to_id']
+                    self.id_to_chunk_mapping = {int(k): v for k, v in mappings['id_to_chunk'].items()}
+
+                logger.info(f"✅ Chunk index loaded: {len(self.chunk_to_id_mapping)} chunks")
+
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to load ANNOY indices: {e}")
+            return False
+
+    def search_tags(self, query_embedding: np.ndarray, n_neighbors: int = 10,
+                    include_distances: bool = True) -> Union[List[str], Tuple[List[str], List[float]]]:
+        """
+        Search for similar tags using ANNOY index.
+
+        Args:
+            query_embedding: Query embedding vector
+            n_neighbors: Number of nearest neighbors to return
+            include_distances: Whether to return distances
+
+        Returns:
+            List of tag names, or tuple of (tag_names, distances)
+        """
+        if self.tag_index is None:
+            raise ValueError("Tag index not built or loaded")
+
+        # Search using ANNOY
+        if include_distances:
+            neighbor_ids, distances = self.tag_index.get_nns_by_vector(
+                query_embedding, n_neighbors, include_distances=True
+            )
+        else:
+            neighbor_ids = self.tag_index.get_nns_by_vector(
+                query_embedding, n_neighbors, include_distances=False
+            )
+
+        # Convert IDs to tag names
+        tag_names = [self.id_to_tag_mapping[neighbor_id] for neighbor_id in neighbor_ids]
+
+        if include_distances:
+            return tag_names, distances
+        else:
+            return tag_names
+
+    def search_chunks(self, query_embedding: np.ndarray, n_neighbors: int = 10,
+                      include_distances: bool = True) -> Union[List[Dict], Tuple[List[Dict], List[float]]]:
+        """
+        Search for similar chunks using ANNOY index.
+
+        Args:
+            query_embedding: Query embedding vector
+            n_neighbors: Number of nearest neighbors to return
+            include_distances: Whether to return distances
+
+        Returns:
+            List of chunk dictionaries, or tuple of (chunks, distances)
+        """
+        if self.chunk_index is None:
+            raise ValueError("Chunk index not built or loaded")
+
+        # Search using ANNOY
+        if include_distances:
+            neighbor_ids, distances = self.chunk_index.get_nns_by_vector(
+                query_embedding, n_neighbors, include_distances=True
+            )
+        else:
+            neighbor_ids = self.chunk_index.get_nns_by_vector(
+                query_embedding, n_neighbors, include_distances=False
+            )
+
+        # Convert IDs to chunk info
+        chunks = [self.id_to_chunk_mapping[neighbor_id] for neighbor_id in neighbor_ids]
+
+        if include_distances:
+            return chunks, distances
+        else:
+            return chunks
+
+    def search_chunks_in_documents(self, query_embedding: np.ndarray,
+                                   document_names: List[str], n_neighbors: int = 10,
+                                   include_distances: bool = True) -> Union[List[Dict], Tuple[List[Dict], List[float]]]:
+        """
+        Search for similar chunks within specific documents.
+
+        Args:
+            query_embedding: Query embedding vector
+            document_names: List of document names to search within
+            n_neighbors: Number of nearest neighbors to return
+            include_distances: Whether to return distances
+
+        Returns:
+            List of chunk dictionaries, or tuple of (chunks, distances)
+        """
+        if self.chunk_index is None:
+            raise ValueError("Chunk index not built or loaded")
+
+        # Get more candidates than needed since we'll filter by document
+        search_candidates = min(n_neighbors * 5, len(self.id_to_chunk_mapping))
+
+        # Search using ANNOY
+        if include_distances:
+            candidate_ids, distances = self.chunk_index.get_nns_by_vector(
+                query_embedding, search_candidates, include_distances=True
+            )
+        else:
+            candidate_ids = self.chunk_index.get_nns_by_vector(
+                query_embedding, search_candidates, include_distances=False
+            )
+
+        # Filter by document names and take top n_neighbors
+        filtered_chunks = []
+        filtered_distances = [] if include_distances else None
+
+        for i, candidate_id in enumerate(candidate_ids):
+            chunk_info = self.id_to_chunk_mapping[candidate_id]
+            if chunk_info['document'] in document_names:
+                filtered_chunks.append(chunk_info)
+                if include_distances:
+                    filtered_distances.append(distances[i])
+
+                if len(filtered_chunks) >= n_neighbors:
+                    break
+
+        if include_distances:
+            return filtered_chunks, filtered_distances
+        else:
+            return filtered_chunks
+
+    def get_index_stats(self) -> Dict:
+        """Get statistics about the loaded indices."""
+        stats = {
+            'embedding_dim': self.embedding_dim,
+            'metric': self.metric,
+            'tag_index_loaded': self.tag_index is not None,
+            'chunk_index_loaded': self.chunk_index is not None,
+            'num_tags': len(self.tag_to_id_mapping) if self.tag_index else 0,
+            'num_chunks': len(self.chunk_to_id_mapping) if self.chunk_index else 0
+        }
+        return stats
+
+
+def convert_angular_distance_to_cosine_similarity(angular_distance: float) -> float:
+    """
+    Convert ANNOY angular distance to cosine similarity.
+
+    Args:
+        angular_distance: Angular distance from ANNOY
+
+    Returns:
+        Cosine similarity (0 to 1)
+    """
+    # Angular distance is related to cosine similarity by:
+    # angular_distance = 2 * arccos(cosine_similarity) / π
+    # Therefore: cosine_similarity = cos(angular_distance * π / 2)
+    import math
+    return math.cos(angular_distance * math.pi / 2)
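
One caveat on the conversion above: Annoy's own documentation defines the "angular" metric as the Euclidean distance between normalized vectors, d = sqrt(2(1 - cos θ)), not the normalized-arc form assumed in the comment. If that documented definition applies to the installed Annoy version, the inverse would be the sketch below; checking a few pairs against exact cosine similarity is a cheap way to confirm which formula matches.

import math

def cosine_similarity_from_annoy_distance(d: float) -> float:
    # Per Annoy's docs, angular distance = sqrt(2 * (1 - cos)),
    # so cos = 1 - d**2 / 2. Alternative to the arccos-based conversion above.
    return 1.0 - (d * d) / 2.0

assert abs(cosine_similarity_from_annoy_distance(0.0) - 1.0) < 1e-9  # identical vectors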
{src/pdf-version → customization/src}/indexing/document_indexer.py
RENAMED
File without changes

{src/pdf-version → customization/src}/indexing/embedding_creator.py
RENAMED
File without changes
{src/pdf-version → customization/src}/indexing/storage.py
RENAMED
@@ -2,13 +2,19 @@
 
 import json
 import os
+import logging
 from typing import Dict, Optional, Tuple
 import numpy as np
+from .annoy_manager import AnnoyIndexManager
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 
 def save_document_system(document_index: Dict, tag_embeddings: Dict,
                          doc_tag_mapping: Dict, chunk_embeddings: Dict = None,
-                         output_dir: str = None):
+                         output_dir: str = None, build_annoy_indices: bool = True):
     """Save the complete document indexing system.
 
     Args:
@@ -85,6 +91,31 @@ def save_document_system(document_index: Dict, tag_embeddings: Dict,
     with open(os.path.join(output_dir, 'chunk_embeddings.json'), 'w', encoding='utf-8') as f:
         json.dump(chunk_embeddings_serializable, f, indent=2, ensure_ascii=False)
 
+    # Build and save ANNOY indices if requested
+    if build_annoy_indices:
+        logger.info("🔧 Building ANNOY indices for fast retrieval...")
+        try:
+            # Initialize ANNOY manager (assuming BGE Large Medical embedding dimension)
+            annoy_manager = AnnoyIndexManager(embedding_dim=1024, metric='angular')
+
+            # Build tag index
+            logger.info("Building tag ANNOY index...")
+            annoy_manager.build_tag_index(tag_embeddings, n_trees=50)
+
+            # Build chunk index if chunk embeddings are provided
+            if chunk_embeddings:
+                logger.info("Building chunk ANNOY index...")
+                annoy_manager.build_chunk_index(chunk_embeddings, n_trees=50)
+
+            # Save indices
+            logger.info("Saving ANNOY indices...")
+            annoy_manager.save_indices(output_dir)
+
+            logger.info("✅ ANNOY indices built and saved successfully")
+        except Exception as e:
+            logger.error(f"❌ Failed to build ANNOY indices: {e}")
+            logger.warning("Continuing without ANNOY indices - will use original search methods")
+
     print("✅ Document system saved to files")
 
 
@@ -161,4 +192,62 @@ def load_document_system(input_dir: str = None) -> Tuple[Optional[Dict], Optiona
 
     except Exception as e:
         print(f"❌ Failed to load document system: {e}")
-        return None, None, None, None
+        return None, None, None, None
+
+
+def load_annoy_manager(input_dir: str = None) -> Optional[AnnoyIndexManager]:
+    """
+    Load ANNOY index manager with pre-built indices.
+
+    Args:
+        input_dir: Input directory containing saved indices
+
+    Returns:
+        AnnoyIndexManager instance or None if loading fails
+    """
+    if input_dir is None:
+        # Get project root directory
+        from pathlib import Path
+        root_dir = Path(__file__).parent.parent.parent.parent
+        input_dir = root_dir / 'embeddings' / 'pdfembeddings'
+
+    try:
+        # Initialize ANNOY manager
+        annoy_manager = AnnoyIndexManager(embedding_dim=1024, metric='angular')
+
+        # Try to load indices
+        if annoy_manager.load_indices(input_dir):
+            logger.info("✅ ANNOY indices loaded successfully")
+            return annoy_manager
+        else:
+            logger.warning("⚠️ Failed to load ANNOY indices")
+            return None
+
+    except Exception as e:
+        logger.error(f"❌ Failed to initialize ANNOY manager: {e}")
+        return None
+
+
+def load_document_system_with_annoy(input_dir: str = None, annoy_dir: str = None) -> Tuple[Optional[Dict], Optional[Dict], Optional[Dict], Optional[Dict], Optional[AnnoyIndexManager]]:
+    """
+    Load the complete document indexing system including ANNOY indices.
+
+    Args:
+        input_dir: Input directory containing saved files
+        annoy_dir: Directory containing ANNOY indices (if different from input_dir)
+
+    Returns:
+        Tuple of (document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager).
+        Returns all None values if loading fails.
+    """
+    # Load the standard document system
+    document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings = load_document_system(input_dir)
+
+    if document_index is None:
+        return None, None, None, None, None
+
+    # Load ANNOY manager
+    # Use annoy_dir if provided, otherwise use input_dir
+    annoy_manager = load_annoy_manager(annoy_dir if annoy_dir else input_dir)
+
+    return document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager
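
A short sketch of the intended load path, assuming embeddings were saved with build_annoy_indices=True and that customization/src is on sys.path as customization_pipeline.py arranges (the directory names follow the defaults used elsewhere in this commit):

from indexing.storage import load_document_system_with_annoy

document_index, tag_embeddings, doc_tag_mapping, chunk_embeddings, annoy_manager = \
    load_document_system_with_annoy(input_dir="processing/embeddings",
                                    annoy_dir="processing/indices")

if annoy_manager is not None:
    print(annoy_manager.get_index_stats())  # e.g. number of tags/chunks indexed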
{src/pdf-version → customization/src}/models/__init__.py
RENAMED
File without changes

{src/pdf-version → customization/src}/models/embedding_models.py
RENAMED
File without changes

{src/pdf-version → customization/src}/rag/__init__.py
RENAMED
File without changes

{src/pdf-version → customization/src}/rag/medical_rag_pipeline.py
RENAMED
File without changes

{src/pdf-version → customization/src}/retrieval/__init__.py
RENAMED
File without changes
{src/pdf-version β customization/src}/retrieval/chunk_retriever.py
RENAMED
|
@@ -1,9 +1,15 @@
|
|
| 1 |
"""Chunk-level retrieval functionality."""
|
| 2 |
|
| 3 |
-
from typing import List, Dict, Callable
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
|
@@ -190,4 +196,172 @@ def get_chunks_for_rag(relevant_chunks: List[Dict], max_chunks: int = 10) -> Lis
|
|
| 190 |
rag_chunks.append(formatted_chunk)
|
| 191 |
|
| 192 |
print(f"π Retrieved {len(rag_chunks)} chunks for RAG")
|
| 193 |
-
return rag_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Chunk-level retrieval functionality."""
|
| 2 |
|
| 3 |
+
from typing import List, Dict, Callable, Optional
|
| 4 |
import numpy as np
|
| 5 |
+
import logging
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from indexing.embedding_creator import create_text_embedding
|
| 8 |
+
from indexing.annoy_manager import AnnoyIndexManager, convert_angular_distance_to_cosine_similarity
|
| 9 |
+
|
| 10 |
+
# Configure logging
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
|
| 14 |
|
| 15 |
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
|
|
|
| 196 |
rag_chunks.append(formatted_chunk)
|
| 197 |
|
| 198 |
print(f"π Retrieved {len(rag_chunks)} chunks for RAG")
|
| 199 |
+
return rag_chunks
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ANNOY-accelerated chunk retrieval functions
|
| 203 |
+
|
| 204 |
+
def find_relevant_chunks_annoy_top_k(query: str, model: SentenceTransformer,
|
| 205 |
+
relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
|
| 206 |
+
top_chunks_per_doc: int = 3,
|
| 207 |
+
similarity_metric: str = "angular") -> List[Dict]:
|
| 208 |
+
"""Find most relevant chunks using ANNOY index and Top-K strategy."""
|
| 209 |
+
query_embedding = create_text_embedding(model, query)
|
| 210 |
+
|
| 211 |
+
# Use ANNOY to search chunks in the relevant documents
|
| 212 |
+
all_chunks, distances = annoy_manager.search_chunks_in_documents(
|
| 213 |
+
query_embedding, relevant_docs,
|
| 214 |
+
n_neighbors=len(relevant_docs) * top_chunks_per_doc,
|
| 215 |
+
include_distances=True
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
# Convert distances to similarities and format results
|
| 219 |
+
all_relevant_chunks = []
|
| 220 |
+
for chunk, distance in zip(all_chunks, distances):
|
| 221 |
+
similarity = convert_angular_distance_to_cosine_similarity(distance)
|
| 222 |
+
|
| 223 |
+
chunk_result = {
|
| 224 |
+
'document': chunk['document'],
|
| 225 |
+
'chunk_id': chunk['chunk_id'],
|
| 226 |
+
'text': chunk['text'],
|
| 227 |
+
'start_char': chunk.get('start_char', 0),
|
| 228 |
+
'end_char': chunk.get('end_char', len(chunk['text'])),
|
| 229 |
+
+                'token_count': chunk.get('token_count', len(chunk['text'].split())),
+                'similarity': similarity
+            }
+            all_relevant_chunks.append(chunk_result)
+
+    # Group by document and take top chunks per document
+    doc_chunks = {}
+    for chunk in all_relevant_chunks:
+        doc_name = chunk['document']
+        if doc_name not in doc_chunks:
+            doc_chunks[doc_name] = []
+        doc_chunks[doc_name].append(chunk)
+
+    # Take top chunks from each document
+    final_chunks = []
+    for doc_name in relevant_docs:
+        if doc_name in doc_chunks:
+            doc_chunks[doc_name].sort(key=lambda x: x['similarity'], reverse=True)
+            final_chunks.extend(doc_chunks[doc_name][:top_chunks_per_doc])
+
+    # Sort all chunks by similarity
+    final_chunks.sort(key=lambda x: x['similarity'], reverse=True)
+
+    logger.info(f"🔍 Found {len(final_chunks)} relevant chunks (ANNOY Top-K)")
+    for i, chunk in enumerate(final_chunks[:5]):  # Show top 5
+        logger.info(f"  {i+1}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
+        logger.info(f"     Preview: {chunk['text'][:100]}...")
+
+    return final_chunks
+
+
+def find_relevant_chunks_annoy_top_p(query: str, model: SentenceTransformer,
+                                     relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
+                                     top_p: float = 0.6, min_similarity: float = 0.3,
+                                     similarity_metric: str = "angular") -> List[Dict]:
+    """Find most relevant chunks using ANNOY index and Top-P strategy."""
+    query_embedding = create_text_embedding(model, query)
+
+    # Search more chunks to ensure we have enough candidates for Top-P selection
+    search_candidates = min(len(relevant_docs) * 10, 100)  # Reasonable upper limit
+
+    # Use ANNOY to search chunks in the relevant documents
+    all_chunks, distances = annoy_manager.search_chunks_in_documents(
+        query_embedding, relevant_docs,
+        n_neighbors=search_candidates,
+        include_distances=True
+    )
+
+    # Convert distances to similarities and filter by minimum similarity
+    filtered_chunks = []
+    for chunk, distance in zip(all_chunks, distances):
+        similarity = convert_angular_distance_to_cosine_similarity(distance)
+
+        # Only include chunks above minimum similarity threshold
+        if similarity >= min_similarity:
+            chunk_result = {
+                'document': chunk['document'],
+                'chunk_id': chunk['chunk_id'],
+                'text': chunk['text'],
+                'start_char': chunk.get('start_char', 0),
+                'end_char': chunk.get('end_char', len(chunk['text'])),
+                'token_count': chunk.get('token_count', len(chunk['text'].split())),
+                'similarity': similarity
+            }
+            filtered_chunks.append(chunk_result)
+
+    if not filtered_chunks:
+        logger.warning(f"⚠️ No chunks found above similarity threshold {min_similarity}")
+        return []
+
+    # Sort by similarity
+    filtered_chunks.sort(key=lambda x: x['similarity'], reverse=True)
+
+    # Apply Top-P selection
+    total_score = sum(chunk['similarity'] for chunk in filtered_chunks)
+    cumulative_prob = 0.0
+    selected_chunks = []
+
+    for chunk in filtered_chunks:
+        prob = chunk['similarity'] / total_score
+        cumulative_prob += prob
+        selected_chunks.append(chunk)
+
+        # Stop when we reach the Top-P threshold
+        if cumulative_prob >= top_p:
+            break
+
+    logger.info(f"🔍 Found {len(selected_chunks)} relevant chunks (ANNOY Top-P={top_p})")
+    logger.info(f"📊 Filtered from {len(filtered_chunks)} chunks above threshold")
+    logger.info(f"📊 Cumulative probability: {cumulative_prob:.3f}")
+
+    for i, chunk in enumerate(selected_chunks[:5]):  # Show top 5
+        logger.info(f"  {i+1}. {chunk['document']} (chunk {chunk['chunk_id']}, similarity: {chunk['similarity']:.3f})")
+        logger.info(f"     Preview: {chunk['text'][:100]}...")
+
+    return selected_chunks
+
+
+def find_relevant_chunks_annoy(query: str, model: SentenceTransformer,
+                               relevant_docs: List[str], annoy_manager: AnnoyIndexManager,
+                               strategy: str = "top_p", **kwargs) -> List[Dict]:
+    """Unified interface for ANNOY-accelerated chunk retrieval with different strategies."""
+
+    similarity_metric = kwargs.get("similarity_metric", "angular")
+
+    if strategy == "top_k":
+        top_chunks_per_doc = kwargs.get("top_chunks_per_doc", 3)
+        return find_relevant_chunks_annoy_top_k(query, model, relevant_docs, annoy_manager,
+                                                top_chunks_per_doc, similarity_metric)
+
+    elif strategy == "top_p":
+        top_p = kwargs.get("top_p", 0.6)
+        min_similarity = kwargs.get("min_similarity", 0.3)
+        return find_relevant_chunks_annoy_top_p(query, model, relevant_docs, annoy_manager,
+                                                top_p, min_similarity, similarity_metric)
+
+    else:
+        raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k' or 'top_p'")
+
+
+def find_relevant_chunks_with_fallback(query: str, model: SentenceTransformer,
+                                       relevant_docs: List[str], chunk_embeddings: Dict,
+                                       annoy_manager: Optional[AnnoyIndexManager] = None,
+                                       strategy: str = "top_p", **kwargs) -> List[Dict]:
+    """
+    Find relevant chunks with ANNOY acceleration and fallback to original method.
+
+    This function automatically uses ANNOY if available, otherwise falls back to original search.
+    """
+    if annoy_manager is not None:
+        try:
+            logger.info("🚀 Using ANNOY-accelerated chunk retrieval")
+            return find_relevant_chunks_annoy(query, model, relevant_docs, annoy_manager, strategy, **kwargs)
+        except Exception as e:
+            logger.warning(f"⚠️ ANNOY chunk retrieval failed, falling back to original method: {e}")
+
+    # Fallback to original method
+    logger.info("🔄 Using original chunk retrieval method")
+    return find_relevant_chunks(query, model, relevant_docs, chunk_embeddings, strategy, **kwargs)
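Note on the selection logic above: both ANNOY chunk strategies end with the same nucleus-sampling step, in which similarity scores are normalized into a distribution and candidates are kept, best first, until their cumulative share reaches top_p. A minimal sketch of that step in isolation — top_p_select and its inputs are illustrative names, not part of the module:

    def top_p_select(scored, top_p=0.6):
        """Keep a 'nucleus' of items covering top_p of the total similarity mass.

        scored: list of (item, similarity) pairs, already sorted descending.
        """
        total = sum(s for _, s in scored)
        cumulative, selected = 0.0, []
        for item, score in scored:
            cumulative += score / total  # this item's share of the total mass
            selected.append(item)
            if cumulative >= top_p:      # nucleus covered; stop adding items
                break
        return selected

For example, with scores [0.9, 0.5, 0.2] and top_p=0.6, the first item covers 0.9 / 1.6 ≈ 0.56 of the mass and the second pushes the cumulative share to ≈ 0.88, so exactly two items are kept. Because the list is sorted descending and an item is appended before the check, at least one item is always returned.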
{src/pdf-version → customization/src}/retrieval/document_retriever.py
RENAMED
@@ -1,9 +1,15 @@
 """Document retrieval strategies and functionality."""

-from typing import List, Dict
+from typing import List, Dict, Optional
 import numpy as np
+import logging
 from sentence_transformers import SentenceTransformer
-from
+from indexing.embedding_creator import create_text_embedding
+from indexing.annoy_manager import AnnoyIndexManager, convert_angular_distance_to_cosine_similarity
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)


 def find_relevant_documents_top_k(query: str, model: SentenceTransformer,
@@ -189,4 +195,202 @@ def create_document_tag_mapping(document_index: Dict, tag_embeddings: Dict) -> Dict
         'treatments': doc_info.get('treatments', [])
     }

-    return doc_tag_mapping
+    return doc_tag_mapping
+
+
+# ANNOY-accelerated document retrieval functions
+
+def find_relevant_documents_annoy_top_k(query: str, model: SentenceTransformer,
+                                        annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
+                                        top_k: int = 3, search_neighbors: int = 20) -> List[str]:
+    """Find top-k most relevant documents using ANNOY index for fast tag search."""
+    query_embedding = create_text_embedding(model, query)
+
+    # Use ANNOY to find similar tags quickly
+    similar_tags, distances = annoy_manager.search_tags(
+        query_embedding, n_neighbors=search_neighbors, include_distances=True
+    )
+
+    # Convert angular distances to cosine similarities
+    tag_similarities = {}
+    for tag, distance in zip(similar_tags, distances):
+        similarity = convert_angular_distance_to_cosine_similarity(distance)
+        tag_similarities[tag] = similarity
+
+    # Find documents that contain the most similar tags
+    doc_scores = {}
+    for pdf_name, doc_info in doc_tag_mapping.items():
+        doc_tags = doc_info['tags']
+
+        # Calculate document score using max similarity for precise tag matching
+        if doc_tags:
+            similarities = [tag_similarities.get(tag, 0) for tag in doc_tags]
+            # Use max similarity to find documents with best tag matches
+            doc_score = max(similarities)
+            doc_scores[pdf_name] = doc_score
+
+    # Sort and return top-k documents
+    sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
+    relevant_docs = [doc_name for doc_name, score in sorted_docs[:top_k]]
+
+    logger.info(f"🔍 Found {len(relevant_docs)} relevant documents for query: '{query}' (ANNOY TOP-K)")
+    for i, doc_name in enumerate(relevant_docs):
+        score = doc_scores[doc_name]
+        logger.info(f"  {i+1}. {doc_name} (similarity: {score:.3f})")
+
+    return relevant_docs
+
+
+def find_relevant_documents_annoy_top_p(query: str, model: SentenceTransformer,
+                                        annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
+                                        top_p: float = 0.6, min_similarity: float = 0.5,
+                                        search_neighbors: int = 30) -> List[str]:
+    """Find documents using TOP-P (nucleus sampling) approach with ANNOY acceleration."""
+    query_embedding = create_text_embedding(model, query)
+
+    # Use ANNOY to find similar tags quickly
+    similar_tags, distances = annoy_manager.search_tags(
+        query_embedding, n_neighbors=search_neighbors, include_distances=True
+    )
+
+    # Convert angular distances to cosine similarities
+    tag_similarities = {}
+    for tag, distance in zip(similar_tags, distances):
+        similarity = convert_angular_distance_to_cosine_similarity(distance)
+        tag_similarities[tag] = similarity
+
+    # Find documents that contain the most similar tags
+    doc_scores = {}
+    for pdf_name, doc_info in doc_tag_mapping.items():
+        doc_tags = doc_info['tags']
+
+        # Calculate document score using max similarity for precise tag matching
+        if doc_tags:
+            similarities = [tag_similarities.get(tag, 0) for tag in doc_tags]
+            # Use max similarity to find documents with best tag matches
+            doc_score = max(similarities)
+            doc_scores[pdf_name] = doc_score
+
+    # Filter out documents below minimum similarity threshold
+    filtered_docs = {doc: score for doc, score in doc_scores.items()
+                     if score >= min_similarity}
+
+    if not filtered_docs:
+        logger.warning(f"⚠️ No documents found above similarity threshold {min_similarity}")
+        return []
+
+    # Sort documents by similarity score
+    sorted_docs = sorted(filtered_docs.items(), key=lambda x: x[1], reverse=True)
+
+    # Apply TOP-P selection
+    total_score = sum(score for _, score in sorted_docs)
+    cumulative_prob = 0.0
+    selected_docs = []
+
+    for doc_name, score in sorted_docs:
+        prob = score / total_score
+        cumulative_prob += prob
+        selected_docs.append(doc_name)
+
+        # Stop when we reach the TOP-P threshold
+        if cumulative_prob >= top_p:
+            break
+
+    logger.info(f"🔍 Found {len(selected_docs)} relevant documents for query: '{query}' (ANNOY TOP-P={top_p})")
+    logger.info(f"📊 Cumulative probability: {cumulative_prob:.3f}")
+
+    for i, doc_name in enumerate(selected_docs):
+        score = doc_scores[doc_name]
+        prob = score / total_score
+        logger.info(f"  {i+1}. {doc_name} (similarity: {score:.3f}, prob: {prob:.3f})")
+
+    return selected_docs
+
+
+def find_relevant_documents_annoy_threshold(query: str, model: SentenceTransformer,
+                                            annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
+                                            similarity_threshold: float = 0.5, search_neighbors: int = 50) -> List[str]:
+    """Find all documents above a similarity threshold using ANNOY acceleration."""
+    query_embedding = create_text_embedding(model, query)
+
+    # Use ANNOY to find similar tags quickly
+    similar_tags, distances = annoy_manager.search_tags(
+        query_embedding, n_neighbors=search_neighbors, include_distances=True
+    )
+
+    # Convert angular distances to cosine similarities
+    tag_similarities = {}
+    for tag, distance in zip(similar_tags, distances):
+        similarity = convert_angular_distance_to_cosine_similarity(distance)
+        tag_similarities[tag] = similarity
+
+    # Find documents that contain the most similar tags
+    doc_scores = {}
+    for pdf_name, doc_info in doc_tag_mapping.items():
+        doc_tags = doc_info['tags']
+
+        # Calculate document score using weighted average
+        if doc_tags:
+            similarities = [tag_similarities.get(tag, 0) for tag in doc_tags]
+            avg_similarity = np.mean(similarities)
+            max_similarity = max(similarities)
+            # Weighted combination: 70% average (overall relevance) + 30% max (strongest match)
+            doc_score = avg_similarity * 0.7 + max_similarity * 0.3
+            if doc_score >= similarity_threshold:
+                doc_scores[pdf_name] = doc_score
+
+    # Sort by similarity score
+    sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
+    relevant_docs = [doc_name for doc_name, score in sorted_docs]
+
+    logger.info(f"🔍 Found {len(relevant_docs)} relevant documents for query: '{query}' (ANNOY threshold={similarity_threshold})")
+    for i, doc_name in enumerate(relevant_docs):
+        score = doc_scores[doc_name]
+        logger.info(f"  {i+1}. {doc_name} (similarity: {score:.3f})")
+
+    return relevant_docs
+
+
+def find_relevant_documents_annoy(query: str, model: SentenceTransformer,
+                                  annoy_manager: AnnoyIndexManager, doc_tag_mapping: Dict,
+                                  strategy: str = "top_k", **kwargs) -> List[str]:
+    """Unified interface for ANNOY-accelerated document retrieval with different strategies."""
+    if strategy == "top_k":
+        top_k = kwargs.get("top_k", 3)
+        search_neighbors = kwargs.get("search_neighbors", 20)
+        return find_relevant_documents_annoy_top_k(query, model, annoy_manager, doc_tag_mapping, top_k, search_neighbors)
+
+    elif strategy == "top_p":
+        top_p = kwargs.get("top_p", 0.6)
+        min_similarity = kwargs.get("min_similarity", 0.5)
+        search_neighbors = kwargs.get("search_neighbors", 30)
+        return find_relevant_documents_annoy_top_p(query, model, annoy_manager, doc_tag_mapping, top_p, min_similarity, search_neighbors)
+
+    elif strategy == "threshold":
+        similarity_threshold = kwargs.get("similarity_threshold", 0.5)
+        search_neighbors = kwargs.get("search_neighbors", 50)
+        return find_relevant_documents_annoy_threshold(query, model, annoy_manager, doc_tag_mapping, similarity_threshold, search_neighbors)
+
+    else:
+        raise ValueError(f"Unknown strategy: {strategy}. Use 'top_k', 'top_p', or 'threshold'")
+
+
+def find_relevant_documents_with_fallback(query: str, model: SentenceTransformer,
+                                          tag_embeddings: Dict, doc_tag_mapping: Dict,
+                                          annoy_manager: Optional[AnnoyIndexManager] = None,
+                                          strategy: str = "top_k", **kwargs) -> List[str]:
+    """
+    Find relevant documents with ANNOY acceleration and fallback to original method.
+
+    This function automatically uses ANNOY if available, otherwise falls back to original search.
+    """
+    if annoy_manager is not None:
+        try:
+            logger.info("🚀 Using ANNOY-accelerated document retrieval")
+            return find_relevant_documents_annoy(query, model, annoy_manager, doc_tag_mapping, strategy, **kwargs)
+        except Exception as e:
+            logger.warning(f"⚠️ ANNOY retrieval failed, falling back to original method: {e}")

+    # Fallback to original method
+    logger.info("🔄 Using original document retrieval method")
+    return find_relevant_documents(query, model, tag_embeddings, doc_tag_mapping, strategy, **kwargs)
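Note on convert_angular_distance_to_cosine_similarity, which every function above relies on: Annoy's "angular" metric reports, for unit-normalized vectors, the distance sqrt(2 * (1 - cos(u, v))). Assuming that definition, the conversion simply inverts it — angular_to_cosine below is an illustrative stand-in for the imported helper, not the module's code:

    import math

    def angular_to_cosine(distance: float) -> float:
        # Invert d = sqrt(2 * (1 - cos)) to recover cos = 1 - d**2 / 2
        return 1.0 - (distance ** 2) / 2.0

    # Sanity checks: identical vectors (d = 0) map to similarity 1.0,
    # and orthogonal vectors (d = sqrt(2)) map to similarity 0.0.
    assert angular_to_cosine(0.0) == 1.0
    assert abs(angular_to_cosine(math.sqrt(2.0))) < 1e-9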
{src/pdf-version → customization/src}/utils/__init__.py
RENAMED
File without changes

{src/pdf-version → customization/src}/utils/helpers.py
RENAMED
File without changes
customization/test/test_pipeline.py
ADDED
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""Test script to verify the customization pipeline with ANNOY indices."""
+
+import sys
+from pathlib import Path
+
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from customization_pipeline import retrieve_document_chunks
+
+
+def test_pipeline():
+    """Test the complete pipeline with different queries."""
+    print("🧪 Testing Customization Pipeline with ANNOY Indices")
+    print("=" * 60)
+
+    # Test queries
+    test_queries = [
+        "chest pain and shortness of breath",
+        "pregnancy bleeding emergency",
+        "atrial fibrillation treatment",
+        "fever of unknown origin",
+        "dizziness diagnostic approach"
+    ]
+
+    for query in test_queries:
+        print(f"\n🔍 Query: '{query}'")
+        print("-" * 60)
+
+        try:
+            # Retrieve chunks
+            results = retrieve_document_chunks(query, top_k=3)
+
+            if results:
+                print(f"✅ Found {len(results)} relevant chunks:\n")
+
+                for i, result in enumerate(results, 1):
+                    print(f"Result {i}:")
+                    print(f"  📄 Document: {result['document']}")
+                    print(f"  📊 Score: {result['score']:.4f}")
+                    print(f"  📍 Chunk ID: {result['metadata']['chunk_id']}")
+                    print(f"  📝 Text Preview: {result['chunk_text'][:150]}...")
+                    print()
+            else:
+                print("❌ No results found")
+
+        except Exception as e:
+            print(f"❌ Error processing query: {e}")
+            import traceback
+            traceback.print_exc()
+
+    print("\n" + "=" * 60)
+    print("✅ Pipeline test completed!")
+
+
+def test_specific_medical_cases():
+    """Test specific medical scenarios."""
+    print("\n\n🏥 Testing Specific Medical Cases")
+    print("=" * 60)
+
+    medical_cases = {
+        "Cardiac Emergency": "acute coronary syndrome ST elevation",
+        "Neurological": "stroke symptoms thrombolysis window",
+        "Respiratory": "pulmonary embolism Wells score",
+        "Obstetric Emergency": "eclampsia magnesium sulfate",
+        "Pediatric": "pediatric seizure management"
+    }
+
+    for case_type, query in medical_cases.items():
+        print(f"\n📋 {case_type}: '{query}'")
+        print("-" * 60)
+
+        results = retrieve_document_chunks(query, top_k=2)
+
+        if results:
+            for result in results:
+                print(f"📄 {result['document']}")
+                print(f"   Score: {result['score']:.4f}")
+                print(f"   Relevant content found in chunk {result['metadata']['chunk_id']}")
+        else:
+            print("   No specific guidance found")
+
+
+def test_performance():
+    """Test retrieval performance."""
+    import time
+
+    print("\n\n⚡ Testing Retrieval Performance")
+    print("=" * 60)
+
+    queries = [
+        "chest pain",
+        "headache emergency",
+        "fever neutropenia",
+        "pneumonia antibiotics",
+        "atrial fibrillation"
+    ]
+
+    total_time = 0
+    for query in queries:
+        start_time = time.time()
+        results = retrieve_document_chunks(query, top_k=5)
+        elapsed = time.time() - start_time
+        total_time += elapsed
+
+        print(f"Query: '{query}' - Retrieved {len(results)} chunks in {elapsed:.3f}s")
+
+    avg_time = total_time / len(queries)
+    print(f"\n📊 Average retrieval time: {avg_time:.3f}s per query")
+
+
+if __name__ == "__main__":
+    # Run all tests
+    test_pipeline()
+    test_specific_medical_cases()
+    test_performance()
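The suite is a plain script rather than a pytest module, so it can be run directly (python test/test_pipeline.py from inside customization/) once the hospital embeddings and ANNOY indices have been generated. A hedged sketch of the single entry point it exercises — the query string is arbitrary, and the result fields follow the accesses the tests make:

    from customization_pipeline import retrieve_document_chunks

    results = retrieve_document_chunks("chest pain and shortness of breath", top_k=3)
    for r in results:
        print(r['document'], f"{r['score']:.4f}", r['metadata']['chunk_id'])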
customization_requirements.txt
ADDED
@@ -0,0 +1,188 @@
+# Customization Pipeline Requirements
+# Generated from rag_env environment for hospital-specific document processing
+#
+# Key libraries:
+# - sentence-transformers: Medical domain embeddings (BGE-Large-Medical)
+# - torch: Deep learning framework
+# - annoy: Fast vector similarity search indices
+# - pdfplumber: PDF text and table extraction
+# - llama-index: Document chunking and processing
+# - transformers: Hugging Face model support
+# - openai: LLM integration (optional)
+#
+# Install with: pip install -r customization_requirements.txt
+#
+accelerate==1.9.0
+acres==0.5.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
+aiosqlite==0.21.0
+annotated-types==0.7.0
+annoy==1.17.3
+anyio==4.9.0
+appnope==0.1.4
+asttokens==3.0.0
+attrs==25.3.0
+banks==2.2.0
+beautifulsoup4==4.13.4
+bm25s==0.2.13
+certifi==2025.7.14
+cffi==1.17.1
+charset-normalizer==3.4.2
+ci-info==0.3.0
+click==8.2.1
+colorama==0.4.6
+comm==0.2.2
+configobj==5.0.9
+configparser==7.2.0
+cryptography==45.0.5
+dataclasses-json==0.6.7
+debugpy==1.8.15
+decorator==5.2.1
+defusedxml==0.7.1
+Deprecated==1.2.18
+dirtyjson==1.0.8
+distro==1.9.0
+easyocr==1.7.2
+etelemetry==0.3.1
+executing==2.2.0
+filelock==3.18.0
+filetype==1.2.0
+fitz==0.0.1.dev2
+frozenlist==1.7.0
+fsspec==2025.7.0
+greenlet==3.2.3
+griffe==1.7.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
+httplib2==0.22.0
+httpx==0.28.1
+huggingface-hub==0.33.4
+idna==3.10
+imageio==2.37.0
+ipykernel==6.30.0
+ipython==9.4.0
+ipython_pygments_lexers==1.1.1
+jedi==0.19.2
+Jinja2==3.1.6
+jiter==0.10.0
+joblib==1.5.1
+jpype1==1.6.0
+jupyter_client==8.6.3
+jupyter_core==5.8.1
+lazy_loader==0.4
+llama-cloud==0.1.32
+llama-cloud-services==0.6.43
+llama-index==0.12.50
+llama-index-agent-openai==0.4.12
+llama-index-cli==0.4.4
+llama-index-core==0.12.50
+llama-index-embeddings-huggingface==0.5.5
+llama-index-embeddings-openai==0.3.1
+llama-index-indices-managed-llama-cloud==0.7.10
+llama-index-instrumentation==0.3.0
+llama-index-llms-huggingface==0.5.0
+llama-index-llms-openai==0.4.7
+llama-index-llms-openai-like==0.4.0
+llama-index-llms-openrouter==0.3.2
+llama-index-multi-modal-llms-openai==0.5.3
+llama-index-program-openai==0.3.2
+llama-index-question-gen-openai==0.3.1
+llama-index-readers-file==0.4.11
+llama-index-readers-llama-parse==0.4.0
+llama-index-retrievers-bm25==0.5.2
+llama-index-workflows==1.1.0
+llama-parse==0.6.43
+looseversion==1.3.0
+lxml==6.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+matplotlib-inline==0.1.7
+mpmath==1.3.0
+multidict==6.6.3
+mypy_extensions==1.1.0
+nest-asyncio==1.6.0
+networkx==3.5
+nibabel==5.3.2
+ninja==1.11.1.4
+nipype==1.10.0
+nltk==3.9.1
+numpy==2.2.6
+openai==1.97.0
+opencv-python-headless==4.12.0.88
+packaging==25.0
+pandas==2.2.3
+parso==0.8.4
+pathlib==1.0.1
+pdfminer.six==20250506
+pdfplumber==0.11.7
+pexpect==4.9.0
+pillow==11.3.0
+platformdirs==4.3.8
+prompt_toolkit==3.0.51
+propcache==0.3.2
+prov==2.1.1
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+puremagic==1.30
+pyclipper==1.3.0.post6
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pydot==4.0.1
+Pygments==2.19.2
+PyMuPDF==1.26.3
+pyparsing==3.2.3
+pypdf==5.8.0
+pypdfium2==4.30.0
+PyStemmer==2.2.0.3
+python-bidi==0.6.6
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.1
+pytz==2025.2
+pyxnat==1.6.3
+PyYAML==6.0.2
+pyzmq==27.0.0
+rdflib==7.1.4
+regex==2024.11.6
+requests==2.32.4
+safetensors==0.5.3
+scikit-image==0.25.2
+scikit-learn==1.7.1
+scipy==1.16.0
+sentence-transformers==5.0.0
+setuptools==80.9.0
+shapely==2.1.1
+simplejson==3.20.1
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.7
+SQLAlchemy==2.0.41
+stack-data==0.6.3
+striprtf==0.0.26
+sympy==1.14.0
+tabula-py==2.10.0
+tabulate==0.9.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tifffile==2025.6.11
+tiktoken==0.9.0
+tokenizers==0.21.2
+torch==2.7.1
+torchvision==0.22.1
+tornado==6.5.1
+tqdm==4.67.1
+traitlets==5.14.3
+traits==7.0.2
+transformers==4.53.2
+typing-inspect==0.9.0
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+tzdata==2025.2
+urllib3==2.5.0
+wcwidth==0.2.13
+wrapt==1.17.2
+yarl==1.20.1
src/pdf-version/data/__init__.py
DELETED
@@ -1,15 +0,0 @@
-"""Data loading and PDF processing."""
-
-from .loaders import load_annotations, filter_pdf_files
-from .pdf_processing import (
-    extract_pdf_text,
-    extract_tables_from_pdf,
-    extract_images_ocr_from_pdf,
-    extract_pdf_content_enhanced
-)
-
-__all__ = [
-    'load_annotations', 'filter_pdf_files',
-    'extract_pdf_text', 'extract_tables_from_pdf',
-    'extract_images_ocr_from_pdf', 'extract_pdf_content_enhanced'
-]
src/pdf-version/main.py
DELETED
@@ -1,83 +0,0 @@
-#!/usr/bin/env python3
-"""OnCall AI - Medical RAG System
-
-Main entry point for the medical RAG system.
-"""
-
-import sys
-from pathlib import Path
-
-# Add pdf-version directory to Python path
-sys.path.insert(0, str(Path(__file__).parent))
-
-from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
-
-
-def main():
-    """Main program entry point."""
-    try:
-        # Build the system with chunk embeddings
-        build_medical_rag_system(enable_chunk_embeddings=True)
-
-        # Demo chunk-based retrieval
-        print("\n" + "="*80)
-        print("🧩 CHUNK-BASED RETRIEVAL DEMO")
-        print("="*80)
-        demo_rag_query("chest pain and shortness of breath",
-                       strategy="top_p", use_chunks=True, top_p=0.8)
-
-    except KeyboardInterrupt:
-        print("\n\n👋 User interrupted, program exiting")
-    except Exception as e:
-        print(f"\n❌ Program execution error: {e}")
-        import traceback
-        traceback.print_exc()
-
-
-def interactive_demo():
-    """Interactive demo mode."""
-    print("🏥 OnCall AI - Interactive Demo Mode")
-    print("=" * 50)
-
-    while True:
-        print("\nOptions:")
-        print("1. Build/rebuild system")
-        print("2. Query with TOP-P strategy")
-        print("3. Query with TOP-K strategy")
-        print("4. Compare all strategies")
-        print("5. Custom query")
-        print("6. Exit")
-
-        choice = input("\nSelect option (1-6): ").strip()
-
-        if choice == "1":
-            build_medical_rag_system(enable_chunk_embeddings=True)
-        elif choice == "2":
-            query = input("Enter your query: ").strip()
-            if query:
-                demo_rag_query(query, strategy="top_p", use_chunks=True)
-        elif choice == "3":
-            query = input("Enter your query: ").strip()
-            if query:
-                demo_rag_query(query, strategy="top_k", use_chunks=True, top_k=3)
-        elif choice == "4":
-            query = input("Enter your query: ").strip()
-            if query:
-                demo_all_strategies(query)
-        elif choice == "5":
-            query = input("Enter your query: ").strip()
-            strategy = input("Enter strategy (top_k/top_p/threshold): ").strip()
-            if query and strategy:
-                demo_rag_query(query, strategy=strategy, use_chunks=True)
-        elif choice == "6":
-            print("👋 Goodbye!")
-            break
-        else:
-            print("❌ Invalid option. Please select 1-6.")
-
-
-if __name__ == "__main__":
-    if len(sys.argv) > 1 and sys.argv[1] == "--interactive":
-        interactive_demo()
-    else:
-        main()
src/pdf-version/oncall_ai.py
DELETED
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-"""OnCall AI - Medical RAG System (Backward Compatibility)
-
-This file provides backward compatibility with the original rag.py interface.
-Import everything from the new modular structure.
-"""
-
-import sys
-from pathlib import Path
-
-# Add pdf-version directory to Python path
-sys.path.insert(0, str(Path(__file__).parent))
-
-# Import all functions for backward compatibility
-from models.embedding_models import load_biomedbert_model, load_meditron_model
-from data.loaders import load_annotations, filter_pdf_files
-from data.pdf_processing import (
-    extract_pdf_text, extract_tables_from_pdf,
-    extract_images_ocr_from_pdf, extract_pdf_content_enhanced
-)
-from indexing.document_indexer import build_document_index, split_text_into_chunks
-from indexing.embedding_creator import create_text_embedding, create_tag_embeddings, create_chunk_embeddings
-from indexing.storage import save_document_system, load_document_system
-from retrieval.document_retriever import (
-    find_relevant_documents_top_k, find_relevant_documents_top_p,
-    find_relevant_documents_threshold, find_relevant_documents,
-    create_document_tag_mapping
-)
-from retrieval.chunk_retriever import find_relevant_chunks, get_documents_for_rag, get_chunks_for_rag
-from demos.demo_runner import build_medical_rag_system, demo_rag_query, demo_all_strategies
-
-# Main function for backward compatibility
-def main():
-    """Main program entry compatible with original rag.py."""
-    try:
-        # Build the system with chunk embeddings
-        build_medical_rag_system(enable_chunk_embeddings=True)
-
-        # Demo chunk-based retrieval
-        print("\n" + "="*80)
-        print("🧩 CHUNK-BASED RETRIEVAL DEMO")
-        print("="*80)
-        demo_rag_query("chest pain and shortness of breath",
-                       strategy="top_p", use_chunks=True, top_p=0.8)
-
-    except KeyboardInterrupt:
-        print("\n\n👋 User interrupted, program exiting")
-    except Exception as e:
-        print(f"\n❌ Program execution error: {e}")
-        import traceback
-        traceback.print_exc()
-
-
-if __name__ == "__main__":
-    main()