|
|
|
|
|
""" |
|
|
Document Intelligence RAG End-to-End Example |
|
|
|
|
|
Demonstrates the complete RAG workflow: |
|
|
1. Parse documents into semantic chunks |
|
|
2. Index chunks into vector store |
|
|
3. Semantic retrieval with filters |
|
|
4. Grounded question answering with evidence |
|
|
5. Evidence visualization |
|
|
|
|
|
Requirements: |
|
|
- ChromaDB: pip install chromadb |
|
|
- Ollama running with nomic-embed-text model: ollama pull nomic-embed-text |
|
|
- PyMuPDF: pip install pymupdf |
|
|
""" |
|
|
|
|
|
import sys

from pathlib import Path

# Make the project root importable so that `src.document_intelligence`
# resolves when this example is run directly as a script (rather than
# as an installed package).
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
|
|
|
|
def check_dependencies():
    """Check that required dependencies are available.

    Hard requirements (chromadb, pymupdf) abort the demo when missing;
    Ollama availability is only probed and reported as a warning, since
    the example can fall back to mock embeddings.

    Returns:
        bool: False if a required package is missing, True otherwise
        (even when the Ollama server is unreachable).
    """
    import importlib.util

    # Map pip install name -> importable module name.  find_spec checks
    # availability without actually importing (and initializing) the package.
    required = {"chromadb": "chromadb", "pymupdf": "fitz"}
    missing = [pkg for pkg, mod in required.items()
               if importlib.util.find_spec(mod) is None]

    if missing:
        print("Missing dependencies:")
        for dep in missing:
            print(f" - {dep}")
        print("\nInstall with: pip install " + " ".join(missing))
        return False

    try:
        import requests
        response = requests.get("http://localhost:11434/api/tags", timeout=2)
        if response.status_code != 200:
            print("Warning: Ollama server not responding")
            print("Start Ollama with: ollama serve")
            print("Then pull the embedding model: ollama pull nomic-embed-text")
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.  Covers both `requests` being uninstalled and
        # any connection failure -- non-fatal either way.
        print("Warning: Could not connect to Ollama server")
        print("The example will still work but with mock embeddings")

    return True
|
|
|
|
|
|
|
|
def demo_parse_and_index(doc_paths: list):
    """
    Demo: Parse documents and index into vector store.

    Args:
        doc_paths: List of document file paths

    Returns:
        list[dict]: One entry per successfully indexed document, with
        "path", "doc_id", and "chunks" keys.  Documents that fail to
        parse or index are reported on stdout and skipped.
    """
    print("\n" + "=" * 60)
    print("STEP 1: PARSE AND INDEX DOCUMENTS")
    print("=" * 60)

    from src.document_intelligence import DocumentParser, ParserConfig
    from src.document_intelligence.tools import get_rag_tool

    index_tool = get_rag_tool("index_document")

    # The parser configuration is identical for every document, so build
    # the config and parser once instead of per loop iteration.
    config = ParserConfig(render_dpi=200, max_pages=10)
    parser = DocumentParser(config=config)

    results = []
    for doc_path in doc_paths:
        print(f"\nProcessing: {doc_path}")

        try:
            parse_result = parser.parse(doc_path)
            print(f" Parsed: {len(parse_result.chunks)} chunks, {parse_result.num_pages} pages")

            result = index_tool.execute(parse_result=parse_result)

            if result.success:
                print(f" Indexed: {result.data['chunks_indexed']} chunks")
                print(f" Document ID: {result.data['document_id']}")
                results.append({
                    "path": doc_path,
                    "doc_id": result.data['document_id'],
                    "chunks": result.data['chunks_indexed'],
                })
            else:
                print(f" Error: {result.error}")

        except Exception as e:
            # Best-effort demo: report the failure and continue with the
            # remaining documents rather than aborting the whole run.
            print(f" Failed: {e}")

    return results
|
|
|
|
|
|
|
|
def demo_semantic_retrieval(query: str, document_id: str = None):
    """
    Demo: Semantic retrieval from vector store.

    Args:
        query: Search query
        document_id: Optional document filter
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 2: SEMANTIC RETRIEVAL")
    print(banner)

    from src.document_intelligence.tools import get_rag_tool

    retrieve_tool = get_rag_tool("retrieve_chunks")

    print(f"\nQuery: \"{query}\"")
    if document_id:
        print(f"Document filter: {document_id}")

    result = retrieve_tool.execute(
        query=query,
        top_k=5,
        document_id=document_id,
        include_evidence=True,
    )

    # Guard clause: report the failure and bail out with no results.
    if not result.success:
        print(f"Error: {result.error}")
        return []

    chunks = result.data.get("chunks", [])
    print(f"\nFound {len(chunks)} relevant chunks:\n")

    for rank, chunk in enumerate(chunks, start=1):
        print(f"{rank}. [similarity={chunk['similarity']:.3f}]")
        print(f" Page {chunk.get('page', '?')}, Type: {chunk.get('chunk_type', 'unknown')}")
        print(f" Text: {chunk['text'][:150]}...")
        print()

    # Show at most the first three evidence references, if any.
    if result.evidence:
        print("Evidence references:")
        for ev in result.evidence[:3]:
            print(f" - Chunk {ev['chunk_id'][:12]}... Page {ev.get('page', '?')}")

    return chunks
|
|
|
|
|
|
|
|
def demo_grounded_qa(question: str, document_id: str = None):
    """
    Demo: Grounded question answering with evidence.

    Args:
        question: Question to answer
        document_id: Optional document filter
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 3: GROUNDED QUESTION ANSWERING")
    print(banner)

    from src.document_intelligence.tools import get_rag_tool

    qa_tool = get_rag_tool("rag_answer")

    print(f"\nQuestion: \"{question}\"")

    result = qa_tool.execute(
        question=question,
        document_id=document_id,
        top_k=5,
    )

    # Guard clause: on failure report the error and return nothing.
    if not result.success:
        print(f"Error: {result.error}")
        return None

    data = result.data
    print(f"\nAnswer: {data.get('answer', 'No answer')}")
    print(f"Confidence: {data.get('confidence', 0):.2f}")

    if data.get('abstained'):
        print("Note: System abstained due to low confidence")

    citations = data.get('citations', [])
    if citations:
        print("\nCitations:")
        for cit in citations:
            print(f" [{cit['index']}] {cit.get('text', '')[:80]}...")

    if result.evidence:
        print("\nEvidence locations:")
        for ev in result.evidence:
            print(f" - Page {ev.get('page', '?')}: {ev.get('snippet', '')[:60]}...")

    return data
|
|
|
|
|
|
|
|
def demo_filtered_retrieval():
    """
    Demo: Retrieval with various filters.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 4: FILTERED RETRIEVAL")
    print(banner)

    from src.document_intelligence.tools import get_rag_tool

    retrieve_tool = get_rag_tool("retrieve_chunks")

    def _report(result, label):
        # Shared display logic for both filtered queries below.
        if result.success:
            chunks = result.data.get("chunks", [])
            print(f"Found {len(chunks)} {label}")
            for chunk in chunks:
                print(f" - Page {chunk.get('page', '?')}: {chunk['text'][:80]}...")

    print("\n--- Retrieving only table chunks ---")
    _report(
        retrieve_tool.execute(
            query="data values",
            top_k=3,
            chunk_types=["table"],
        ),
        "table chunks",
    )

    print("\n--- Retrieving from pages 1-3 only ---")
    _report(
        retrieve_tool.execute(
            query="content",
            top_k=3,
            page_range=(1, 3),
        ),
        "chunks from pages 1-3",
    )
|
|
|
|
|
|
|
|
def demo_index_stats():
    """
    Demo: Show index statistics.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("INDEX STATISTICS")
    print(banner)

    from src.document_intelligence.tools import get_rag_tool

    result = get_rag_tool("get_index_stats").execute()

    # Guard clause: surface the error and stop.
    if not result.success:
        print(f"Error: {result.error}")
        return

    data = result.data
    print(f"\nTotal chunks indexed: {data.get('total_chunks', 0)}")
    print(f"Embedding model: {data.get('embedding_model', 'unknown')}")
    print(f"Embedding dimension: {data.get('embedding_dimension', 'unknown')}")
|
|
|
|
|
|
|
|
def main():
    """Run the complete RAG demo.

    Resolves the documents to process (command-line arguments first,
    bundled sample PDFs as a fallback), then runs the four demo steps:
    parse+index, retrieval, grounded QA, filtered retrieval, and stats.
    """
    print("=" * 60)
    print("SPARKNET Document Intelligence RAG Demo")
    print("=" * 60)

    if not check_dependencies():
        print("\nPlease install missing dependencies and try again.")
        return

    # Bug fix: explicit command-line paths now take precedence over the
    # bundled samples.  Previously, if any sample PDF existed on disk the
    # user-supplied arguments were silently ignored.
    if len(sys.argv) > 1:
        doc_paths = sys.argv[1:]
    else:
        sample_paths = [
            Path("Dataset/Patent_1.pdf"),
            Path("data/sample.pdf"),
            Path("tests/fixtures/sample.pdf"),
        ]
        # Use only the first sample that exists, mirroring the original
        # "break on first match" behavior.
        doc_paths = [str(p) for p in sample_paths if p.exists()][:1]

    if not doc_paths:
        print("\nNo sample documents found.")
        print("Please provide a PDF file path as argument.")
        print("\nUsage: python document_rag_end_to_end.py [path/to/document.pdf]")
        return

    print(f"\nUsing documents: {doc_paths}")

    try:
        indexed_docs = demo_parse_and_index(doc_paths)

        if not indexed_docs:
            print("\nNo documents were indexed. Exiting.")
            return

        # Scope retrieval/QA demos to the first indexed document.
        first_doc_id = indexed_docs[0]["doc_id"]

        demo_semantic_retrieval(
            query="main topic content",
            document_id=first_doc_id,
        )

        demo_grounded_qa(
            question="What is this document about?",
            document_id=first_doc_id,
        )

        demo_filtered_retrieval()

        demo_index_stats()

        print("\n" + "=" * 60)
        print("Demo complete!")
        print("=" * 60)

        print("\nNext steps:")
        print(" 1. Try the CLI: sparknet docint index your_document.pdf")
        print(" 2. Query the index: sparknet docint retrieve 'your query'")
        print(" 3. Ask questions: sparknet docint ask doc.pdf 'question' --use-rag")

    except ImportError as e:
        print(f"\nImport error: {e}")
        print("Make sure all dependencies are installed:")
        print(" pip install pymupdf pillow numpy pydantic chromadb")

    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
|
|
|
|
# Standard script entry-point guard: run the demo only when executed
# directly, not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|