Spaces:
Build error
Build error
File size: 4,366 Bytes
"""
Google Gemini-based fallback parser for when regex extraction fails.
"""
import json
import math
from typing import Optional

from app.config import settings
from app.utils.logging import get_logger
from app.utils.exceptions import PDFExtractionError
# Module-level logger for this file, obtained via the project's get_logger helper.
logger = get_logger(__name__)
class GeminiParser:
    """
    Uses Google Gemini models to extract T1 tax line values when regex fails.

    The Gemini SDK is imported lazily inside ``__init__``. When no API key is
    configured, the SDK is not installed, or client setup fails, the instance
    is still created but ``is_available()`` returns ``False``.
    """

    # Upper bound on how many characters of PDF text are embedded in the prompt.
    _MAX_TEXT_CHARS = 8000

    def __init__(self, api_key: Optional[str] = None):
        """
        Set up the Gemini client when an API key is available.

        Args:
            api_key: Explicit API key. Falls back to ``settings.google_api_key``
                when omitted or empty.
        """
        self.api_key = api_key or settings.google_api_key
        self.model_name = settings.gemini_model
        self._model = None

        if not self.api_key:
            return

        try:
            # Imported here so a missing SDK only disables this parser.
            import google.generativeai as genai

            genai.configure(api_key=self.api_key)
            self._model = genai.GenerativeModel(self.model_name)
            logger.info(f"Gemini client initialized with model: {self.model_name}")
        except ImportError:
            logger.warning("google-generativeai package not installed")
        except Exception as e:
            # Deliberately best-effort: a broken client must not prevent the
            # module from loading; callers check is_available() instead.
            logger.warning(f"Failed to initialize Gemini client: {e}")

    def is_available(self) -> bool:
        """Check if Gemini parsing is available."""
        return self._model is not None

    def extract_line_values(
        self,
        text: str,
        line_numbers: Optional[list[str]] = None
    ) -> dict[str, Optional[str]]:
        """
        Use Gemini to extract T1 tax line values from text.

        Args:
            text: Text content from PDF. Only the first 8000 characters are
                sent to the model.
            line_numbers: Specific line numbers to extract. When empty or
                ``None``, the model is asked for every line it can find.

        Returns:
            Dictionary mapping line numbers to values. Values are numeric
            strings with ``$`` and ``,`` removed; entries whose value is not a
            finite number are dropped.

        Raises:
            PDFExtractionError: If the client is unavailable, the request
                fails, or the response is not a JSON object.
        """
        if not self._model:
            raise PDFExtractionError("Gemini client not available")

        prompt = self._build_prompt(text, line_numbers)

        # Boundary with the SDK: any failure while calling the model or reading
        # its text is reported as a PDFExtractionError.
        try:
            response = self._model.generate_content(
                prompt,
                generation_config={
                    "temperature": 0.1,
                    "max_output_tokens": 1000,
                }
            )
            result_text = self._strip_code_fence(response.text)
        except Exception as e:
            logger.error(f"Gemini extraction failed: {e}")
            raise PDFExtractionError(f"Gemini extraction failed: {e}") from e

        try:
            extracted = json.loads(result_text)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            raise PDFExtractionError(f"Gemini returned invalid JSON: {e}") from e

        # json.loads happily returns lists, strings or numbers; only an object
        # can map line numbers to values.
        if not isinstance(extracted, dict):
            kind = type(extracted).__name__
            logger.error(f"Gemini extraction failed: expected a JSON object, got {kind}")
            raise PDFExtractionError(
                f"Gemini extraction failed: expected a JSON object, got {kind}"
            )

        logger.info(f"Gemini extracted {len(extracted)} line values")

        # Clean and validate values
        cleaned: dict[str, Optional[str]] = {}
        for line_num, value in extracted.items():
            normalized = self._normalize_value(value)
            if normalized is not None:
                cleaned[line_num] = normalized
            elif value is not None:
                # null means "not found" and is skipped quietly; anything else
                # that failed validation is worth surfacing.
                logger.warning(f"Invalid value for line {line_num}: {value}")
        return cleaned

    @classmethod
    def _build_prompt(cls, text: str, line_numbers: Optional[list[str]]) -> str:
        """Build the extraction prompt, embedding a truncated copy of ``text``."""
        if line_numbers:
            lines_str = ", ".join(line_numbers)
            target_lines = f"Extract values for these specific lines: {lines_str}"
        else:
            target_lines = "Extract all T1 tax line values you can find"

        return f"""You are a Canadian T1 tax form data extractor.
Given the following text extracted from a T1 tax return PDF, extract the line values.
{target_lines}
T1 tax lines are 5-digit numbers (like 15000, 23600, 26000) followed by dollar amounts.
The format may vary:
- "15000 Total Income: $50,000.00"
- "Line 15000 50000"
- "Total income (line 15000) 50,000"
Return ONLY a valid JSON object mapping line numbers to their numeric values (without $ or commas).
Example: {{"15000": "50000.00", "23600": "45000.00"}}
If a line is not found, omit it from the response.
TEXT:
{text[:cls._MAX_TEXT_CHARS]}
"""

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return the JSON payload of ``text`` with any markdown code fence removed."""
        stripped = text.strip()
        # Bare JSON (or no fence at all) is passed through untouched.
        if stripped.startswith("{") or "```" not in stripped:
            return stripped
        # Take the content after the first fence, even if prose precedes it.
        fenced = stripped.split("```")[1]
        if fenced[:4].lower() == "json":
            fenced = fenced[4:]
        return fenced.strip()

    @staticmethod
    def _normalize_value(value: object) -> Optional[str]:
        """Return ``value`` as a clean numeric string, or ``None`` if it is not a finite number."""
        # bool is a subclass of int, so JSON true/false must be rejected first.
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return str(value)
        if isinstance(value, float):
            # json.loads accepts NaN/Infinity; neither is a dollar amount.
            return str(value) if math.isfinite(value) else None
        if isinstance(value, str):
            candidate = value.replace(",", "").replace("$", "").strip()
            try:
                parsed = float(candidate)
            except ValueError:
                return None
            return candidate if math.isfinite(parsed) else None
        return None
# Global parser instance, constructed at import time with no explicit key, so
# GeminiParser falls back to settings.google_api_key. Check
# gemini_parser.is_available() before calling extract_line_values(), which
# raises PDFExtractionError when no model was initialized.
gemini_parser = GeminiParser()
|