File size: 4,297 Bytes
4e71548
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import asyncio
import json
from typing import Dict, Any, Optional
from openai import AsyncOpenAI
from huggingface_hub import AsyncInferenceClient
from src.config.config import settings


class GroqClient:
    """Async client for the Groq API, accessed through the OpenAI SDK.

    The instance can be used as an async context manager; leaving the
    ``async with`` block closes the underlying ``AsyncOpenAI`` client.
    """

    def __init__(self) -> None:
        # Endpoint and credentials are read from the project settings object.
        self.client = AsyncOpenAI(
            base_url=settings.groq_base_url,
            api_key=settings.groq_api_key,
        )

    async def __aenter__(self) -> "GroqClient":
        return self

    async def __aexit__(self, exc_type, exc_value, traceback) -> None:
        # Always release the client, whether or not the block raised.
        await self.client.close()

    async def extract_account_details(self, text: str) -> str:
        """Ask the LLM to extract account details from bank statement text.

        Args:
            text: Statement text, sent verbatim as the user message.

        Returns:
            The raw text of the first completion choice (the prompt asks the
            model for a JSON object, but the reply is not parsed or validated
            here). An empty string is returned when the model supplies no
            text content, so callers always receive a ``str``.

        Raises:
            IndexError: If the response contains no choices.
        """
        system_prompt = """
        You are a financial document parser that extracts structured data from bank statements.

        Your task is to extract the following fields and return only valid JSON:

        - Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
        - Ending balance can also be referred with "Balance this statement" in pdfs.

        {
        "bank_name": "string",
        "account_holder": "string",
        "accounts": [{
            "account_name": "string",
            "account_number": "string",
            "starting_balance": float,
            "ending_balance": float,
            "statement_start_date": "YYYY-MM-DD",
            "statement_end_date": "YYYY-MM-DD"
        }]
        }

        Guidelines:
        - Return strictly valid JSON (no markdown, comments, or extra explanation).
        - `starting_balance` and `ending_balance` must be `float` (no currency symbol).
        - Dates must follow the format `"YYYY-MM-DD"`.
        - Do not respond with anything other than the JSON object.
        - If multiple account are there then include all the account list in a list.
        """

        response = await self.client.chat.completions.create(
            model=settings.llm_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
        )

        # `message.content` is optional in the SDK's response model; normalise
        # a missing value so the declared `str` return type always holds.
        content = response.choices[0].message.content
        return content if content is not None else ""


class HuggingFaceClient:
    """Async client for the HuggingFace Inference API.

    The instance can be used as an async context manager; leaving the
    ``async with`` block closes the underlying ``AsyncInferenceClient``.
    Extraction itself is not implemented yet: ``extract_account_details``
    returns a fixed placeholder document.
    """

    # Prompt reserved for the future HuggingFace implementation. It is the same
    # text GroqClient sends as its system message and is not used yet.
    _SYSTEM_PROMPT = """
        You are a financial document parser that extracts structured data from bank statements.

        Your task is to extract the following fields and return only valid JSON:

        - Starting balance can also be referred with "Balance last statement" or "Balance previous statement" in pdfs.
        - Ending balance can also be referred with "Balance this statement" in pdfs.

        {
        "bank_name": "string",
        "account_holder": "string",
        "accounts": [{
            "account_name": "string",
            "account_number": "string",
            "starting_balance": float,
            "ending_balance": float,
            "statement_start_date": "YYYY-MM-DD",
            "statement_end_date": "YYYY-MM-DD"
        }]
        }

        Guidelines:
        - Return strictly valid JSON (no markdown, comments, or extra explanation).
        - `starting_balance` and `ending_balance` must be `float` (no currency symbol).
        - Dates must follow the format `"YYYY-MM-DD"`.
        - Do not respond with anything other than the JSON object.
        - If multiple account are there then include all the account list in a list.
        """

    def __init__(self) -> None:
        # Provider and credentials are read from the project settings object.
        self.client = AsyncInferenceClient(
            provider=settings.huggingface_provider,
            api_key=settings.huggingface_api_key,
        )

    async def __aenter__(self) -> "HuggingFaceClient":
        return self

    async def __aexit__(self, exc_type, exc_value, traceback) -> None:
        # Release the inference client on exit, matching GroqClient, instead
        # of leaving it open.
        await self.client.close()

    async def extract_account_details(self, text: str) -> str:
        """Return placeholder account details; no model is called yet.

        Args:
            text: Statement text. Currently ignored.

        Returns:
            A JSON string with "Unknown" bank name and account holder and an
            empty ``accounts`` list.
        """
        # TODO: implement against the chosen HuggingFace model using
        # _SYSTEM_PROMPT; until then a fixed placeholder is returned.
        return '{"bank_name": "Unknown", "account_holder": "Unknown", "accounts": []}'