"""
Document processing with LlamaIndex.
Handles PDF parsing, indexing, and querying with citation tracking.
"""
import os
import json
from typing import Dict, Any, List
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import NodeWithScore
class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations.

    Typical flow: construct with an API key, call ``load_pdf()``, then query
    via ``analyze_with_criteria()`` or ``quick_search()``.
    """

    # Citation text previews are truncated to this many characters.
    PREVIEW_CHARS = 350

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key."""
        # GPT-4o-mini: cheap and fast. Low temperature for factual extraction.
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1,
        )
        # Set the global LLM (embeddings will use the OpenAI default).
        Settings.llm = self.llm
        # Chunk documents while preserving metadata. Larger chunks help
        # capture complete financial statements/tables in one node.
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=2048,
            chunk_overlap=400,
        )
        self.index = None       # VectorStoreIndex, set by load_pdf()
        self.documents = None   # loaded Documents, set by load_pdf()

    def load_pdf(self, pdf_path: str) -> None:
        """Load and index a PDF document.

        Args:
            pdf_path: Filesystem path to the PDF.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not point to a file.
        """
        # Fail early with a clear error instead of a reader-specific one.
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"PDF not found: {pdf_path}")
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True,
        )
        self.documents = reader.load_data()
        # SimpleDirectoryReader should add page info; fall back if missing.
        for doc in self.documents:
            if 'page_label' not in doc.metadata:
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')
        # BUG FIX: ``from_documents`` takes the node parser via the
        # ``transformations`` keyword; the previous ``node_parser=`` keyword
        # is not part of the API, so the chunk_size/chunk_overlap settings
        # were silently ignored.
        self.index = VectorStoreIndex.from_documents(
            self.documents,
            transformations=[self.node_parser],
            show_progress=True,
        )

    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """Analyze the document against investment criteria.

        Args:
            criteria_prompt: Prompt describing the criteria; the LLM is
                expected (but not required) to answer in JSON.

        Returns:
            The parsed JSON analysis — or a ``{"raw_response", "parse_error"}``
            wrapper when the reply is not a JSON object — augmented with
            ``citations`` and ``source_nodes_count``.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")
        # Larger top_k yields more diverse context; "compact" keeps the
        # synthesis focused on the relevant chunks.
        query_engine = self.index.as_query_engine(
            similarity_top_k=20,
            response_mode="compact",
        )
        response = query_engine.query(criteria_prompt)
        citations = self._extract_citations(response.source_nodes)
        analysis_result = self._parse_json_response(str(response))
        analysis_result['citations'] = citations
        analysis_result['source_nodes_count'] = len(response.source_nodes)
        return analysis_result

    @staticmethod
    def _parse_json_response(text: str) -> Dict[str, Any]:
        """Parse an LLM reply as a JSON object, tolerating markdown fences.

        BUG FIX over the original inline parsing: (1) replies wrapped in
        ```` ```json ... ``` ```` fences are now parsed instead of being
        treated as parse errors; (2) valid JSON that is not an object (e.g.
        a list) is wrapped instead of crashing the caller's key assignment.

        Returns:
            The parsed dict, or ``{"raw_response": text, "parse_error": True}``.
        """
        cleaned = text.strip()
        # LLMs frequently wrap JSON in ```json ... ``` fences; strip them.
        if cleaned.startswith("```"):
            cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[:-3]
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        return {"raw_response": text, "parse_error": True}

    @staticmethod
    def _page_of(metadata: Dict[str, Any]) -> Any:
        """Return the page from node metadata: 'page_label', then 'page'."""
        return metadata.get('page_label', metadata.get('page', 'Unknown'))

    def _extract_citations(self, source_nodes: "List[NodeWithScore]") -> List[Dict[str, Any]]:
        """Build citation dicts (page, score, preview, full text) from nodes."""
        citations = []
        for idx, node in enumerate(source_nodes):
            text = node.node.text
            truncated = len(text) > self.PREVIEW_CHARS
            citations.append({
                "index": idx + 1,
                "page": self._page_of(node.node.metadata),
                "score": node.score,
                "text_preview": text[:self.PREVIEW_CHARS] + "..." if truncated else text,
                "full_text": text,
                "is_truncated": truncated,
                "file_name": node.node.metadata.get('file_name', 'Unknown'),
            })
        return citations

    def get_document_summary(self) -> Dict[str, Any]:
        """Return basic document info (page count, file name, total chars)."""
        if self.documents is None:
            return {"error": "No document loaded"}
        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents),
        }

    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Retrieve matching chunks without LLM synthesis.

        Useful for finding specific sections or terms.

        Args:
            query: Search text.
            top_k: Maximum number of chunks to return.

        Returns:
            A list of ``{"page", "text", "score"}`` dicts.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text",  # just return nodes, no generation
        )
        response = query_engine.query(query)
        return [
            {
                "page": self._page_of(node.node.metadata),
                "text": node.node.text,
                "score": node.score,
            }
            for node in response.source_nodes
        ]