File size: 5,007 Bytes
d520909 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
"""
Example: RAG Pipeline
Demonstrates:
1. Indexing documents into vector store
2. Semantic search
3. Question answering with citations
"""
from pathlib import Path
from loguru import logger
# Import RAG components
from src.rag import (
VectorStoreConfig,
EmbeddingConfig,
RetrieverConfig,
GeneratorConfig,
get_document_indexer,
get_document_retriever,
get_grounded_generator,
)
def example_indexing():
    """Index the sample PDF and print a summary of the outcome.

    Returns:
        True when the sample document was indexed; False when the file is
        missing or the indexer reports a failure.
    """
    banner = "=" * 50
    print(banner)
    print("Document Indexing")
    print(banner)

    # The indexer is obtained first, before the sample file is checked.
    doc_indexer = get_document_indexer()

    pdf_path = Path("./data/sample.pdf")
    if not pdf_path.exists():
        print(f"Sample document not found: {pdf_path}")
        print("Create a sample PDF at ./data/sample.pdf")
        return False

    outcome = doc_indexer.index_document(pdf_path)

    # Bail out early on failure so the success path reads straight through.
    if not outcome.success:
        print(f"Indexing failed: {outcome.error}")
        return False

    print(f"\nIndexed: {outcome.source_path}")
    print(f" Document ID: {outcome.document_id}")
    print(f" Chunks indexed: {outcome.num_chunks_indexed}")
    print(f" Chunks skipped: {outcome.num_chunks_skipped}")

    # Report the state of the index after the document has been added.
    index_stats = doc_indexer.get_index_stats()
    print("\nIndex Stats:")
    print(f" Total chunks: {index_stats['total_chunks']}")
    print(f" Documents: {index_stats['num_documents']}")
    print(f" Embedding model: {index_stats['embedding_model']}")
    return True
def example_search():
    """Run a few sample queries and print the top matches for each one."""
    divider = "=" * 50
    print("\n" + divider)
    print("Semantic Search")
    print(divider)

    doc_retriever = get_document_retriever()

    sample_queries = (
        "What is the main topic?",
        "key findings",
        "conclusions and recommendations",
    )

    for query_text in sample_queries:
        print(f"\nQuery: '{query_text}'")
        hits = doc_retriever.retrieve(query_text, top_k=3)

        if hits:
            # Results are numbered from 1 for display.
            for rank, hit in enumerate(hits, start=1):
                print(f"\n [{rank}] Similarity: {hit.similarity:.3f}")
                # The page line is only printed when the chunk has a page;
                # the displayed number is the stored value plus one.
                if hit.page is not None:
                    print(f" Page: {hit.page + 1}")
                print(f" Text: {hit.text[:150]}...")
        else:
            print(" No results found")
def example_question_answering():
    """Ask sample questions and print each answer with its citations."""
    divider = "=" * 50
    print("\n" + divider)
    print("Question Answering with Citations")
    print(divider)

    qa_generator = get_grounded_generator()

    sample_questions = (
        "What is the main purpose of this document?",
        "What are the key findings?",
        "What recommendations are made?",
    )

    for prompt in sample_questions:
        print(f"\nQuestion: {prompt}")
        print("-" * 40)

        response = qa_generator.answer_question(prompt, top_k=5)
        print(f"\nAnswer: {response.answer}")
        print(f"\nConfidence: {response.confidence:.2f}")

        # When the generator abstained, show the reason it gave.
        if response.abstained:
            print(f"Note: {response.abstain_reason}")

        # Nothing more to print for answers without citations.
        if not response.citations:
            continue

        print(f"\nCitations ({len(response.citations)}):")
        for cite in response.citations:
            # The page label is empty when the citation has no page;
            # otherwise the displayed number is the stored value plus one.
            if cite.page is None:
                page_label = ""
            else:
                page_label = f"Page {cite.page + 1}"
            print(f" [{cite.index}] {page_label}: {cite.text_snippet[:60]}...")
def example_filtered_search():
    """Demonstrate retrieval restricted by chunk type and by page range.

    Runs two filtered queries through the document retriever and prints a
    short preview of every hit:

    1. table chunks only, via ``retrieve_tables``;
    2. chunks from a page range, via ``retrieve_by_page``.
    """
    print("\n" + "=" * 50)
    print("Filtered Search")
    print("=" * 50)

    retriever = get_document_retriever()

    # Search only in tables
    print("\nSearching for tables only...")
    table_chunks = retriever.retrieve_tables("data values", top_k=3)
    if table_chunks:
        print(f"Found {len(table_chunks)} table chunks:")
        _print_chunk_previews(table_chunks)
    else:
        print("No table chunks found")

    # Search specific page range
    # NOTE(review): (0, 2) is presumably a 0-based, inclusive range, which is
    # why the messages say "pages 1-3" -- confirm against retrieve_by_page.
    print("\nSearching pages 1-3...")
    page_chunks = retriever.retrieve_by_page(
        "introduction",
        page_range=(0, 2),
        top_k=3,
    )
    if page_chunks:
        print(f"Found {len(page_chunks)} chunks in pages 1-3:")
        _print_chunk_previews(page_chunks)
    else:
        print("No chunks found in specified pages")


def _print_chunk_previews(chunks, preview_chars=100):
    """Print one bullet per chunk: its page label and the start of its text."""
    for chunk in chunks:
        # example_search and example_question_answering both treat ``page``
        # as possibly None, so guard here too instead of letting
        # ``None + 1`` raise TypeError. Pages are shown as stored value + 1.
        if chunk.page is not None:
            page_label = f"Page {chunk.page + 1}"
        else:
            page_label = "Page n/a"
        print(f" - {page_label}: {chunk.text[:preview_chars]}...")
def example_full_pipeline():
    """Run indexing, search and question answering back to back.

    The demo stops right after step 1 when indexing reports failure, so the
    later steps are not run in that case.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Full RAG Pipeline Demo")
    print(rule)

    # Step 1: build the index; the remaining steps are skipped on failure.
    print("\n[Step 1] Indexing documents...")
    indexed = example_indexing()
    if not indexed:
        return

    # Step 2: semantic search over the indexed chunks.
    print("\n[Step 2] Testing search...")
    example_search()

    # Step 3: grounded question answering.
    print("\n[Step 3] Question answering...")
    example_question_answering()

    print("\n" + rule)
    print("Pipeline demo complete!")
    print(rule)
if __name__ == "__main__":
    # Script entry point: run the end-to-end demo (index, search, Q&A).
    example_full_pipeline()
|