"""Document processing with LlamaIndex.

Handles PDF parsing, indexing, and querying with citation tracking.
"""

import json
import os
from typing import Any, Dict, List

from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import NodeWithScore
from llama_index.llms.openai import OpenAI

# Citation text previews are truncated to this many characters.
_PREVIEW_CHARS = 350


class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations."""

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key.

        Args:
            api_key: OpenAI API key, used for the LLM (embeddings fall back
                to the OpenAI default configured by llama_index).
        """
        # GPT-4o-mini: cheap and fast. Low temperature for factual extraction.
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1,
        )

        # Set the global LLM (embeddings will use the OpenAI default).
        Settings.llm = self.llm

        # Node parser to chunk documents while preserving metadata.
        # Larger chunks capture complete financial statements/tables.
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=2048,
            chunk_overlap=400,
        )

        # Populated by load_pdf().
        self.index = None
        self.documents = None

    def load_pdf(self, pdf_path: str) -> None:
        """Load and index a single PDF document.

        Args:
            pdf_path: Filesystem path to the PDF to load.
        """
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True,
        )
        self.documents = reader.load_data()

        # Ensure every document carries a page label for citation extraction.
        for doc in self.documents:
            if 'page_label' not in doc.metadata:
                # SimpleDirectoryReader should add page info; fall back to the
                # raw 'page' key (or 'Unknown') when it did not.
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')

        # Create the vector index.
        # BUG FIX: `from_documents` does not accept a `node_parser` keyword in
        # current llama_index; the chunking configuration must be supplied via
        # `transformations`, otherwise the 2048/400 settings are ignored.
        self.index = VectorStoreIndex.from_documents(
            self.documents,
            transformations=[self.node_parser],
            show_progress=True,
        )

    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """Analyze the loaded document against investment criteria.

        Args:
            criteria_prompt: Prompt describing the criteria; the LLM is
                expected (but not guaranteed) to answer with JSON.

        Returns:
            The parsed analysis dict (or a ``raw_response`` wrapper when the
            response is not a JSON object), augmented with ``citations`` and
            ``source_nodes_count``.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # Create a query engine with citation tracking.
        query_engine = self.index.as_query_engine(
            similarity_top_k=20,      # increased to get more diverse context
            response_mode="compact",  # more focused on relevant chunks
        )

        response = query_engine.query(criteria_prompt)

        # Extract citations from the retrieved source nodes.
        citations = self._extract_citations(response.source_nodes)

        # Always a dict, so attaching extra keys below is safe.
        analysis_result = self._parse_json_response(str(response))
        analysis_result['citations'] = citations
        analysis_result['source_nodes_count'] = len(response.source_nodes)
        return analysis_result

    @staticmethod
    def _parse_json_response(text: str) -> Dict[str, Any]:
        """Parse an LLM response that is expected to be JSON.

        Strips markdown code fences (``` / ```json) that models frequently
        wrap JSON in, and guarantees a dict return so the caller can attach
        additional keys without type errors.

        Args:
            text: Raw response text from the query engine.

        Returns:
            The decoded JSON object, or a wrapper dict with ``raw_response``
            (and ``parse_error=True`` when decoding failed outright).
        """
        candidate = text.strip()
        # BUG FIX: fenced responses previously failed json.loads and were
        # misreported as parse errors.
        if candidate.startswith("```"):
            # Drop the opening fence line (e.g. "```json") and closing fence.
            candidate = candidate[candidate.find("\n") + 1:] if "\n" in candidate else ""
            if candidate.rstrip().endswith("```"):
                candidate = candidate.rstrip()[:-3]
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # Not JSON at all: wrap in a structure.
            return {"raw_response": text, "parse_error": True}
        if not isinstance(parsed, dict):
            # BUG FIX: a top-level JSON array/scalar used to crash the caller
            # when it assigned citation keys onto a non-dict.
            return {"raw_response": text, "parsed_value": parsed, "parse_error": False}
        return parsed

    def _extract_citations(self, source_nodes: List[NodeWithScore]) -> List[Dict[str, Any]]:
        """Extract citation information from retrieved source nodes.

        Args:
            source_nodes: Scored nodes returned by a query.

        Returns:
            One dict per node with page, score, preview/full text, truncation
            flag, and source file name.
        """
        citations = []
        for idx, scored in enumerate(source_nodes, start=1):
            node = scored.node
            page = node.metadata.get('page_label', node.metadata.get('page', 'Unknown'))
            text = node.text
            truncated = len(text) > _PREVIEW_CHARS
            citations.append({
                "index": idx,
                "page": page,
                "score": scored.score,
                "text_preview": text[:_PREVIEW_CHARS] + "..." if truncated else text,
                "full_text": text,
                "is_truncated": truncated,
                "file_name": node.metadata.get('file_name', 'Unknown'),
            })
        return citations

    def get_document_summary(self) -> Dict[str, Any]:
        """Return basic information about the loaded document.

        Returns:
            Page count, file name, and total character count — or an
            ``error`` dict when no document has been loaded.
        """
        if self.documents is None:
            return {"error": "No document loaded"}

        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents),
        }

    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Perform a quick retrieval-only search in the document.

        Useful for finding specific sections or terms without LLM synthesis.

        Args:
            query: Search text.
            top_k: Maximum number of nodes to return.

        Returns:
            A list of dicts with ``page``, ``text``, and ``score`` per hit.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text",  # just return nodes, no generation
        )
        response = query_engine.query(query)

        results = []
        for scored in response.source_nodes:
            metadata = scored.node.metadata
            results.append({
                "page": metadata.get('page_label', metadata.get('page', 'Unknown')),
                "text": scored.node.text,
                "score": scored.score,
            })
        return results