File size: 5,539 Bytes
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76cdde2
a921556
76cdde2
 
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76cdde2
 
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76cdde2
cdadb63
76cdde2
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Document processing with LlamaIndex.
Handles PDF parsing, indexing, and querying with citation tracking.
"""

import os
import json
from typing import Dict, Any, List
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import NodeWithScore


class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations.

    Typical usage:
        proc = InvestmentDocumentProcessor(api_key)
        proc.load_pdf("deck.pdf")
        result = proc.analyze_with_criteria(prompt)
    """

    # Max characters included in a citation's text preview before truncation.
    _PREVIEW_CHARS = 350

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key.

        Args:
            api_key: OpenAI API key used for both the LLM and (by default)
                embeddings.
        """
        # GPT-4o-mini: cheap and fast; low temperature for factual extraction.
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1
        )

        # Set the global LLM; embeddings fall back to the OpenAI default.
        Settings.llm = self.llm

        # Chunk documents while preserving metadata. Larger chunks help
        # capture complete financial statements/tables in one node.
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=2048,
            chunk_overlap=400
        )

        self.index = None       # VectorStoreIndex, set by load_pdf()
        self.documents = None   # loaded Document list, set by load_pdf()

    def load_pdf(self, pdf_path: str) -> None:
        """Load a PDF from *pdf_path* and build a vector index over it."""
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True
        )

        self.documents = reader.load_data()

        # SimpleDirectoryReader normally adds per-page metadata; ensure a
        # 'page_label' key exists so downstream citation lookups never miss.
        for doc in self.documents:
            if 'page_label' not in doc.metadata:
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')

        self.index = VectorStoreIndex.from_documents(
            self.documents,
            node_parser=self.node_parser,
            show_progress=True
        )

    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """Analyze the loaded document against investment criteria.

        Args:
            criteria_prompt: Prompt describing the criteria; the LLM is
                expected (but not required) to answer in JSON.

        Returns:
            A dict containing the parsed analysis (or a ``raw_response`` /
            ``parse_error`` wrapper if the answer was not a JSON object),
            plus ``citations`` and ``source_nodes_count`` keys.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # Wide top_k for diverse context; "compact" keeps the synthesis
        # focused on the most relevant chunks.
        query_engine = self.index.as_query_engine(
            similarity_top_k=20,
            response_mode="compact"
        )

        response = query_engine.query(criteria_prompt)

        citations = self._extract_citations(response.source_nodes)

        # json.loads can succeed yet return a list/str/number; only a dict
        # can accept the keys added below, so wrap anything non-dict too
        # (previously this raised TypeError on item assignment).
        try:
            parsed = json.loads(str(response))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            analysis_result = parsed
        else:
            analysis_result = {
                "raw_response": str(response),
                "parse_error": True
            }

        analysis_result['citations'] = citations
        analysis_result['source_nodes_count'] = len(response.source_nodes)

        return analysis_result

    @staticmethod
    def _page_of(node: NodeWithScore) -> Any:
        """Best-effort page lookup: prefer 'page_label', fall back to 'page'."""
        meta = node.node.metadata
        return meta.get('page_label', meta.get('page', 'Unknown'))

    def _extract_citations(self, source_nodes: List[NodeWithScore]) -> List[Dict[str, Any]]:
        """Build citation dicts (page, score, preview, full text) from source nodes."""
        citations = []

        for idx, node in enumerate(source_nodes):
            text = node.node.text
            truncated = len(text) > self._PREVIEW_CHARS

            citations.append({
                "index": idx + 1,  # 1-based for display
                "page": self._page_of(node),
                "score": node.score,
                "text_preview": text[:self._PREVIEW_CHARS] + "..." if truncated else text,
                "full_text": text,
                "is_truncated": truncated,
                "file_name": node.node.metadata.get('file_name', 'Unknown')
            })

        return citations

    def get_document_summary(self) -> Dict[str, Any]:
        """Return basic info (page count, file name, char count) for the loaded document."""
        if self.documents is None:
            return {"error": "No document loaded"}

        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents)
        }

    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Retrieve the *top_k* most relevant chunks for *query* without LLM synthesis.

        Args:
            query: Search text.
            top_k: Number of chunks to return.

        Returns:
            List of dicts with ``page``, ``text`` and ``score`` keys.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # "no_text" skips response generation and just returns the nodes.
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text"
        )

        response = query_engine.query(query)

        return [
            {
                "page": self._page_of(node),
                "text": node.node.text,
                "score": node.score
            }
            for node in response.source_nodes
        ]