"""
Google Gemini-based fallback parser for when regex extraction fails.
"""

import json
import math
from typing import Optional

from app.config import settings
from app.utils.logging import get_logger
from app.utils.exceptions import PDFExtractionError

logger = get_logger(__name__)


class GeminiParser:
    """
    Uses Google Gemini models to extract T1 tax line values when regex fails.

    The Gemini client is created eagerly in ``__init__`` on a best-effort
    basis: a missing API key, a missing ``google-generativeai`` package or
    any initialization error leaves the parser unavailable instead of
    raising. Callers should check ``is_available()`` before extracting.
    """

    # Maximum number of characters of PDF text embedded in the prompt.
    MAX_TEXT_CHARS = 8000

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Gemini client if an API key is available.

        Args:
            api_key: Explicit API key; falls back to ``settings.google_api_key``.
        """
        self.api_key = api_key or settings.google_api_key
        self.model_name = settings.gemini_model
        self._model = None

        if self.api_key:
            try:
                # Imported lazily so the module still loads when the optional
                # google-generativeai dependency is not installed.
                import google.generativeai as genai

                genai.configure(api_key=self.api_key)
                self._model = genai.GenerativeModel(self.model_name)
                logger.info(f"Gemini client initialized with model: {self.model_name}")
            except ImportError:
                logger.warning("google-generativeai package not installed")
            except Exception as e:
                # Deliberately broad: construction is best-effort and must
                # never break module import (see the global instance below).
                logger.warning(f"Failed to initialize Gemini client: {e}")

    def is_available(self) -> bool:
        """Check if Gemini parsing is available."""
        return self._model is not None

    def extract_line_values(
        self,
        text: str,
        line_numbers: Optional[list[str]] = None
    ) -> dict[str, Optional[str]]:
        """
        Use Gemini to extract T1 tax line values from text.

        Args:
            text: Text content from PDF. Only the first ``MAX_TEXT_CHARS``
                characters are sent to the model.
            line_numbers: Specific line numbers to extract. When omitted, the
                model is asked for every line value it can find.

        Returns:
            Dictionary mapping line numbers to values. Values are numeric
            strings with ``$`` and thousands separators removed; entries whose
            value is missing, boolean, non-numeric or non-finite are dropped.
            Empty or whitespace-only ``text`` yields an empty dictionary
            without calling the model.

        Raises:
            PDFExtractionError: If the client is unavailable, the model call
                fails, or the response is not a JSON object.
        """
        if not self._model:
            raise PDFExtractionError("Gemini client not available")

        # Nothing to extract from: skip the model call rather than risk
        # hallucinated values for an empty document.
        if not text or not text.strip():
            logger.warning("No text supplied for Gemini extraction")
            return {}

        prompt = self._build_prompt(text, line_numbers)

        try:
            response = self._model.generate_content(
                prompt,
                generation_config={
                    "temperature": 0.1,
                    "max_output_tokens": 1000,
                }
            )
            # Accessing ``.text`` is kept inside the try because a failure
            # there must also surface as PDFExtractionError.
            result_text = response.text.strip()
        except Exception as e:
            logger.error(f"Gemini extraction failed: {e}")
            raise PDFExtractionError(f"Gemini extraction failed: {e}") from e

        try:
            extracted = json.loads(self._strip_code_fence(result_text))
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            raise PDFExtractionError(f"Gemini returned invalid JSON: {e}") from e

        if not isinstance(extracted, dict):
            # Valid JSON but not the requested mapping (e.g. a list or scalar).
            detail = f"expected an object, got {type(extracted).__name__}"
            logger.error(f"Failed to parse Gemini response as JSON: {detail}")
            raise PDFExtractionError(f"Gemini returned invalid JSON: {detail}")

        cleaned = self._clean_values(extracted)
        logger.info(f"Gemini extracted {len(cleaned)} line values")
        return cleaned

    def _build_prompt(self, text: str, line_numbers: Optional[list[str]]) -> str:
        """Build the extraction prompt for the given text and target lines."""
        if line_numbers:
            # str() lets callers pass line numbers as ints as well as strings.
            lines_str = ", ".join(str(number) for number in line_numbers)
            target_lines = f"Extract values for these specific lines: {lines_str}"
        else:
            target_lines = "Extract all T1 tax line values you can find"

        return f"""You are a Canadian T1 tax form data extractor. Given the following text extracted from a T1 tax return PDF, extract the line values.

{target_lines}

T1 tax lines are 5-digit numbers (like 15000, 23600, 26000) followed by dollar amounts.
The format may vary:
- "15000 Total Income: $50,000.00"
- "Line 15000 50000"
- "Total income (line 15000) 50,000"

Return ONLY a valid JSON object mapping line numbers to their numeric values (without $ or commas).
Example: {{"15000": "50000.00", "23600": "45000.00"}}

If a line is not found, omit it from the response.

TEXT:
{text[:self.MAX_TEXT_CHARS]}
"""

    @staticmethod
    def _strip_code_fence(raw: str) -> str:
        """Return the payload of the first Markdown code fence, if any."""
        body = raw.strip()
        if "```" in body:
            # Take what follows the first fence, even when the model put
            # prose in front of it.
            body = body.split("```")[1]
            # Drop an optional language tag such as "json" or "JSON".
            if body[:4].lower() == "json":
                body = body[4:]
        return body.strip()

    @staticmethod
    def _normalize_amount(value: str) -> Optional[str]:
        """Strip ``$`` and commas; return None unless a finite number remains."""
        candidate = value.replace(",", "").replace("$", "").strip()
        try:
            number = float(candidate)
        except ValueError:
            return None
        # float() accepts "nan" / "inf", which are not valid tax amounts.
        return candidate if math.isfinite(number) else None

    @staticmethod
    def _clean_values(extracted: dict) -> dict[str, Optional[str]]:
        """Keep only entries whose value is a finite number or numeric string."""
        cleaned: dict[str, Optional[str]] = {}
        for line_num, value in extracted.items():
            if value is None:
                # The model reported the line as absent; nothing to warn about.
                continue

            if isinstance(value, bool):
                # bool is a subclass of int; "True"/"False" are not amounts.
                normalized = None
            elif isinstance(value, int):
                normalized = str(value)
            elif isinstance(value, float):
                # json.loads accepts NaN/Infinity; reject them here.
                normalized = str(value) if math.isfinite(value) else None
            elif isinstance(value, str):
                normalized = GeminiParser._normalize_amount(value)
            else:
                normalized = None

            if normalized is None:
                logger.warning(f"Invalid value for line {line_num}: {value}")
                continue
            cleaned[line_num] = normalized
        return cleaned


# Global parser instance
gemini_parser = GeminiParser()