File size: 4,366 Bytes
aa8e38b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Google Gemini-based fallback parser for when regex extraction fails.
"""
import json
from typing import Optional

from app.config import settings
from app.utils.logging import get_logger
from app.utils.exceptions import PDFExtractionError

# Module-level logger obtained from the project's logging helper, keyed by
# this module's name.
logger = get_logger(__name__)


class GeminiParser:
    """
    Uses Google Gemini models to extract T1 tax line values when regex fails.

    The Gemini client is created lazily-but-eagerly in ``__init__``: if no API
    key is configured, the package is missing, or initialization fails, the
    parser is still constructed but ``is_available()`` returns False.
    """

    # Maximum number of characters of the PDF text embedded in the prompt.
    MAX_PROMPT_CHARS = 8000

    def __init__(self, api_key: Optional[str] = None):
        """
        Set up the Gemini client if an API key is available.

        Args:
            api_key: Explicit API key; falls back to ``settings.google_api_key``
                when omitted or empty.

        Initialization problems (missing ``google-generativeai`` package or any
        other error while configuring the client) are logged as warnings and
        leave the parser unavailable instead of raising.
        """
        self.api_key = api_key or settings.google_api_key
        self.model_name = settings.gemini_model
        self._model = None

        if self.api_key:
            try:
                # Imported here so the module can be loaded without the
                # optional dependency installed.
                import google.generativeai as genai
                genai.configure(api_key=self.api_key)
                self._model = genai.GenerativeModel(self.model_name)
                logger.info(f"Gemini client initialized with model: {self.model_name}")
            except ImportError:
                logger.warning("google-generativeai package not installed")
            except Exception as e:
                # Deliberate best-effort: the fallback parser is optional.
                logger.warning(f"Failed to initialize Gemini client: {e}")

    def is_available(self) -> bool:
        """Check if Gemini parsing is available."""
        return self._model is not None

    @staticmethod
    def _strip_code_fence(raw: str) -> str:
        """
        Return the JSON payload of a model reply, without markdown fencing.

        Handles a reply that is bare JSON, a reply that is a fenced block
        (optionally tagged ``json``), and a fenced block preceded or followed
        by explanatory text.
        """
        stripped = raw.strip()
        # Bare JSON object (or no fence at all): nothing to unwrap.
        if stripped.startswith("{") or "```" not in stripped:
            return stripped

        # Take whatever sits between the first fence and the next one.
        _, _, remainder = stripped.partition("```")
        body, _, _ = remainder.partition("```")
        body = body.strip()
        # Drop the optional language tag that follows the opening fence.
        if body[:4].lower() == "json":
            body = body[4:].strip()
        return body

    @staticmethod
    def _normalize_value(value: object) -> Optional[str]:
        """
        Convert one extracted value to a clean numeric string.

        Returns:
            The value as a string without ``$`` or commas, or None when the
            value is not a finite number (booleans, NaN/Infinity, non-numeric
            strings and any other type are rejected).
        """
        import math

        # bool is a subclass of int; "True"/"False" are not dollar amounts.
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return str(value)
        if isinstance(value, float):
            return str(value) if math.isfinite(value) else None
        if isinstance(value, str):
            candidate = value.replace(",", "").replace("$", "").strip()
            try:
                number = float(candidate)
            except ValueError:
                return None
            # float() accepts "nan" / "inf"; those are not valid amounts.
            return candidate if math.isfinite(number) else None
        return None

    def _build_prompt(self, text: str, line_numbers: Optional[list[str]]) -> str:
        """Build the extraction prompt for the given text and target lines."""
        if line_numbers:
            lines_str = ", ".join(line_numbers)
            target_lines = f"Extract values for these specific lines: {lines_str}"
        else:
            target_lines = "Extract all T1 tax line values you can find"

        # Only the leading part of the document is sent to bound prompt size.
        excerpt = text[:self.MAX_PROMPT_CHARS]

        return f"""You are a Canadian T1 tax form data extractor.

Given the following text extracted from a T1 tax return PDF, extract the line values.

{target_lines}

T1 tax lines are 5-digit numbers (like 15000, 23600, 26000) followed by dollar amounts.
The format may vary:
- "15000 Total Income: $50,000.00"
- "Line 15000  50000"
- "Total income (line 15000) 50,000"

Return ONLY a valid JSON object mapping line numbers to their numeric values (without $ or commas).
Example: {{"15000": "50000.00", "23600": "45000.00"}}

If a line is not found, omit it from the response.

TEXT:
{excerpt}
"""

    def extract_line_values(
        self,
        text: str,
        line_numbers: Optional[list[str]] = None
    ) -> dict[str, Optional[str]]:
        """
        Use Gemini to extract T1 tax line values from text.

        Args:
            text: Text content from PDF. Only the first ``MAX_PROMPT_CHARS``
                characters are sent to the model.
            line_numbers: Specific line numbers to extract. When omitted, the
                model is asked for every line value it can find.

        Returns:
            Dictionary mapping line numbers to numeric strings. Values that
            are not finite numbers are dropped. Empty or whitespace-only
            ``text`` yields an empty dictionary without calling the model.

        Raises:
            PDFExtractionError: If the Gemini client is not available, the
                request fails, or the reply is not a valid JSON object.
        """
        if not self._model:
            raise PDFExtractionError("Gemini client not available")

        # Nothing to extract; avoid an API call that could only hallucinate.
        if not text or not text.strip():
            logger.warning("Empty text passed to Gemini extraction; nothing to extract")
            return {}

        prompt = self._build_prompt(text, line_numbers)

        try:
            response = self._model.generate_content(
                prompt,
                generation_config={
                    "temperature": 0.1,
                    "max_output_tokens": 1000,
                }
            )
            # Accessing .text can itself raise (e.g. blocked/empty responses),
            # so it stays inside the same try as the request.
            raw_reply = response.text or ""
        except Exception as e:
            logger.error(f"Gemini extraction failed: {e}")
            raise PDFExtractionError(f"Gemini extraction failed: {e}") from e

        try:
            extracted = json.loads(self._strip_code_fence(raw_reply))
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            raise PDFExtractionError(f"Gemini returned invalid JSON: {e}") from e

        if not isinstance(extracted, dict):
            kind = type(extracted).__name__
            logger.error(f"Gemini returned JSON of type {kind}, expected an object")
            raise PDFExtractionError(
                f"Gemini returned invalid JSON: expected an object, got {kind}"
            )

        logger.info(f"Gemini extracted {len(extracted)} line values")

        # Clean and validate values
        cleaned = {}
        for line_num, value in extracted.items():
            if value is None:
                # A null simply means "not found"; skip it quietly.
                continue
            normalized = self._normalize_value(value)
            if normalized is None:
                logger.warning(f"Invalid value for line {line_num}: {value}")
            else:
                cleaned[line_num] = normalized

        return cleaned


# Global parser instance, constructed once at import time with no explicit
# api_key, so GeminiParser falls back to settings.google_api_key. Callers
# should check gemini_parser.is_available() before extract_line_values().
gemini_parser = GeminiParser()