File size: 5,539 Bytes
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76cdde2
a921556
76cdde2
 
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76cdde2
 
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76cdde2
cdadb63
76cdde2
a921556
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Document processing with LlamaIndex.
Handles PDF parsing, indexing, and querying with citation tracking.
"""

import os
import json
from typing import Dict, Any, List
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import NodeWithScore


class InvestmentDocumentProcessor:
    """Process investment documents (PDFs) and extract information with citations.

    Typical usage:
        proc = InvestmentDocumentProcessor(api_key)
        proc.load_pdf("deck.pdf")
        result = proc.analyze_with_criteria(prompt)
    """

    # Max characters included in a citation's text preview before truncation.
    _PREVIEW_CHARS = 350

    def __init__(self, api_key: str):
        """Initialize the processor with an OpenAI API key.

        Args:
            api_key: OpenAI API key used for both the LLM and (by default)
                embeddings.
        """
        # GPT-4o-mini: cheap and fast; low temperature for factual extraction.
        self.llm = OpenAI(
            model="gpt-4o-mini",
            api_key=api_key,
            temperature=0.1
        )

        # Set the global LLM; embeddings fall back to the OpenAI default.
        Settings.llm = self.llm

        # Chunk documents while preserving metadata. Larger chunks help
        # capture complete financial statements/tables in one node.
        self.node_parser = SimpleNodeParser.from_defaults(
            chunk_size=2048,
            chunk_overlap=400
        )

        self.index = None       # VectorStoreIndex, set by load_pdf()
        self.documents = None   # loaded Document list, set by load_pdf()

    def load_pdf(self, pdf_path: str) -> None:
        """Load a PDF from *pdf_path* and build a vector index over it."""
        reader = SimpleDirectoryReader(
            input_files=[pdf_path],
            filename_as_id=True
        )

        self.documents = reader.load_data()

        # SimpleDirectoryReader normally adds per-page metadata; ensure a
        # 'page_label' key exists so downstream citation lookups never miss.
        for doc in self.documents:
            if 'page_label' not in doc.metadata:
                doc.metadata['page_label'] = doc.metadata.get('page', 'Unknown')

        self.index = VectorStoreIndex.from_documents(
            self.documents,
            node_parser=self.node_parser,
            show_progress=True
        )

    def analyze_with_criteria(self, criteria_prompt: str) -> Dict[str, Any]:
        """Analyze the loaded document against investment criteria.

        Args:
            criteria_prompt: Prompt describing the criteria; the LLM is
                expected (but not required) to answer in JSON.

        Returns:
            A dict containing the parsed analysis (or a ``raw_response`` /
            ``parse_error`` wrapper if the answer was not a JSON object),
            plus ``citations`` and ``source_nodes_count`` keys.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # Wide top_k for diverse context; "compact" keeps the synthesis
        # focused on the most relevant chunks.
        query_engine = self.index.as_query_engine(
            similarity_top_k=20,
            response_mode="compact"
        )

        response = query_engine.query(criteria_prompt)

        citations = self._extract_citations(response.source_nodes)

        # json.loads can succeed yet return a list/str/number; only a dict
        # can accept the keys added below, so wrap anything non-dict too
        # (previously this raised TypeError on item assignment).
        try:
            parsed = json.loads(str(response))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            analysis_result = parsed
        else:
            analysis_result = {
                "raw_response": str(response),
                "parse_error": True
            }

        analysis_result['citations'] = citations
        analysis_result['source_nodes_count'] = len(response.source_nodes)

        return analysis_result

    @staticmethod
    def _page_of(node: NodeWithScore) -> Any:
        """Best-effort page lookup: prefer 'page_label', fall back to 'page'."""
        meta = node.node.metadata
        return meta.get('page_label', meta.get('page', 'Unknown'))

    def _extract_citations(self, source_nodes: List[NodeWithScore]) -> List[Dict[str, Any]]:
        """Build citation dicts (page, score, preview, full text) from source nodes."""
        citations = []

        for idx, node in enumerate(source_nodes):
            text = node.node.text
            truncated = len(text) > self._PREVIEW_CHARS

            citations.append({
                "index": idx + 1,  # 1-based for display
                "page": self._page_of(node),
                "score": node.score,
                "text_preview": text[:self._PREVIEW_CHARS] + "..." if truncated else text,
                "full_text": text,
                "is_truncated": truncated,
                "file_name": node.node.metadata.get('file_name', 'Unknown')
            })

        return citations

    def get_document_summary(self) -> Dict[str, Any]:
        """Return basic info (page count, file name, char count) for the loaded document."""
        if self.documents is None:
            return {"error": "No document loaded"}

        return {
            "num_pages": len(self.documents),
            "file_name": self.documents[0].metadata.get('file_name', 'Unknown'),
            "total_chars": sum(len(doc.text) for doc in self.documents)
        }

    def quick_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Retrieve the *top_k* most relevant chunks for *query* without LLM synthesis.

        Args:
            query: Search text.
            top_k: Number of chunks to return.

        Returns:
            List of dicts with ``page``, ``text`` and ``score`` keys.

        Raises:
            ValueError: If no document has been loaded yet.
        """
        if self.index is None:
            raise ValueError("No document loaded. Call load_pdf() first.")

        # "no_text" skips response generation and just returns the nodes.
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="no_text"
        )

        response = query_engine.query(query)

        return [
            {
                "page": self._page_of(node),
                "text": node.node.text,
                "score": node.score
            }
            for node in response.source_nodes
        ]