Spaces:
Build error
Build error
| """ | |
| Google Gemini-based fallback parser for when regex extraction fails. | |
| """ | |
| import json | |
| from typing import Optional | |
| from app.config import settings | |
| from app.utils.logging import get_logger | |
| from app.utils.exceptions import PDFExtractionError | |
| logger = get_logger(__name__) | |
class GeminiParser:
    """
    Uses Google Gemini models to extract T1 tax line values when regex fails.

    The Gemini client is created eagerly in ``__init__``. If no API key is
    configured, the ``google-generativeai`` package is missing, or client
    construction fails, the parser stays in an "unavailable" state
    (``is_available()`` returns False) instead of raising.
    """

    # Maximum number of characters of PDF text embedded in the prompt.
    MAX_PROMPT_CHARS = 8000

    def __init__(self, api_key: Optional[str] = None) -> None:
        """
        Configure the Gemini client.

        Args:
            api_key: Explicit API key; falls back to ``settings.google_api_key``
                when omitted or empty.
        """
        self.api_key = api_key or settings.google_api_key
        self.model_name = settings.gemini_model
        self._model = None

        if self.api_key:
            try:
                # Imported lazily so the module still loads when the optional
                # dependency is not installed.
                import google.generativeai as genai

                genai.configure(api_key=self.api_key)
                self._model = genai.GenerativeModel(self.model_name)
                logger.info(f"Gemini client initialized with model: {self.model_name}")
            except ImportError:
                logger.warning("google-generativeai package not installed")
            except Exception as e:
                # Best effort: a broken client must not prevent module import.
                logger.warning(f"Failed to initialize Gemini client: {e}")

    def is_available(self) -> bool:
        """Check if Gemini parsing is available."""
        return self._model is not None

    def _build_prompt(self, text: str, line_numbers: Optional[list[str]] = None) -> str:
        """Build the extraction prompt, embedding at most MAX_PROMPT_CHARS of text."""
        if line_numbers:
            lines_str = ", ".join(line_numbers)
            target_lines = f"Extract values for these specific lines: {lines_str}"
        else:
            target_lines = "Extract all T1 tax line values you can find"

        excerpt = text[: self.MAX_PROMPT_CHARS]
        return f"""You are a Canadian T1 tax form data extractor.
Given the following text extracted from a T1 tax return PDF, extract the line values.
{target_lines}
T1 tax lines are 5-digit numbers (like 15000, 23600, 26000) followed by dollar amounts.
The format may vary:
- "15000 Total Income: $50,000.00"
- "Line 15000 50000"
- "Total income (line 15000) 50,000"
Return ONLY a valid JSON object mapping line numbers to their numeric values (without $ or commas).
Example: {{"15000": "50000.00", "23600": "45000.00"}}
If a line is not found, omit it from the response.
TEXT:
{excerpt}
"""

    @staticmethod
    def _extract_json_payload(raw: str) -> str:
        """Strip markdown fences and surrounding prose from a model reply."""
        payload = raw.strip()

        # Reply is a fenced block: keep the contents of the first fence and
        # drop an optional "json" language tag.
        if payload.startswith("```"):
            payload = payload.split("```")[1]
            if payload[:4].lower() == "json":
                payload = payload[4:]
            payload = payload.strip()

        # Models sometimes wrap the object in explanatory text; keep only the
        # outermost {...} span when one exists.
        start = payload.find("{")
        end = payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start:end + 1]
        return payload

    @staticmethod
    def _normalize_value(value: object) -> Optional[str]:
        """Return *value* as a plain numeric string, or None if it is not a finite number."""
        import math

        # bool is a subclass of int; True/False are never valid amounts.
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return str(value)
        if isinstance(value, float):
            return str(value) if math.isfinite(value) else None
        if isinstance(value, str):
            candidate = value.replace(",", "").replace("$", "").strip()
            try:
                number = float(candidate)
            except ValueError:
                return None
            # Rejects "nan", "inf" and overflowing literals such as "1e999".
            return candidate if math.isfinite(number) else None
        return None

    def extract_line_values(
        self,
        text: str,
        line_numbers: Optional[list[str]] = None
    ) -> dict[str, Optional[str]]:
        """
        Use Gemini to extract T1 tax line values from text.

        Args:
            text: Text content from PDF. Only the first MAX_PROMPT_CHARS
                characters are sent to the model.
            line_numbers: Specific line numbers to extract. When empty or
                None, the model is asked for every line it can find.

        Returns:
            Dictionary mapping line numbers to numeric value strings
            (no ``$`` or thousands separators). Entries whose value is not a
            finite number are dropped.

        Raises:
            PDFExtractionError: If the client is unavailable, the API call
                fails, or the reply is not a JSON object.
        """
        if not self._model:
            raise PDFExtractionError("Gemini client not available")

        if len(text) > self.MAX_PROMPT_CHARS:
            # Make the truncation visible: lines beyond the cut-off cannot be found.
            logger.warning(
                f"Input text truncated from {len(text)} to "
                f"{self.MAX_PROMPT_CHARS} characters for Gemini prompt"
            )

        prompt = self._build_prompt(text, line_numbers)

        try:
            response = self._model.generate_content(
                prompt,
                generation_config={
                    "temperature": 0.1,
                    "max_output_tokens": 1000,
                }
            )
            extracted = json.loads(self._extract_json_payload(response.text))
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            raise PDFExtractionError(f"Gemini returned invalid JSON: {e}") from e
        except Exception as e:
            logger.error(f"Gemini extraction failed: {e}")
            raise PDFExtractionError(f"Gemini extraction failed: {e}") from e

        if not isinstance(extracted, dict):
            # Valid JSON, but not the line-number -> value mapping we asked for.
            logger.error(
                f"Gemini returned {type(extracted).__name__} instead of a JSON object"
            )
            raise PDFExtractionError(
                f"Gemini extraction failed: expected a JSON object, "
                f"got {type(extracted).__name__}"
            )

        logger.info(f"Gemini extracted {len(extracted)} line values")

        # Clean and validate values
        cleaned: dict[str, Optional[str]] = {}
        for line_num, value in extracted.items():
            if value is None:
                # An explicit null means "line not found"; nothing to report.
                continue
            normalized = self._normalize_value(value)
            if normalized is None:
                logger.warning(f"Invalid value for line {line_num}: {value}")
                continue
            cleaned[line_num] = normalized
        return cleaned
# Shared module-level parser, built once at import time. No API key is passed,
# so the constructor falls back to the key from application settings.
gemini_parser: GeminiParser = GeminiParser()