# Repository path: t1-tax-pdf-processor/app/utils/gemini_parser.py
# Hosting-page metadata: Hamza4100 · "Upload 23 files" · aa8e38b (verified)
"""
Google Gemini-based fallback parser for when regex extraction fails.
"""
import json
import math
from typing import Optional

from app.config import settings
from app.utils.logging import get_logger
from app.utils.exceptions import PDFExtractionError
# Logger for this module, obtained from the project's logging helper and
# keyed by the module's import name.
logger = get_logger(__name__)
class GeminiParser:
    """
    Uses Google Gemini models to extract T1 tax line values when regex fails.

    The Gemini client is created in ``__init__``. When no API key is
    configured, the ``google-generativeai`` package is missing, or client
    setup fails, the instance is still constructed but ``is_available()``
    returns False and ``extract_line_values`` raises ``PDFExtractionError``.
    """

    # Maximum number of characters of PDF text embedded in the prompt.
    MAX_TEXT_CHARS = 8000
    # Generation settings forwarded to ``generate_content``.
    GENERATION_CONFIG = {"temperature": 0.1, "max_output_tokens": 1000}

    def __init__(self, api_key: Optional[str] = None):
        """
        Configure the Gemini client.

        Args:
            api_key: API key to use; falls back to ``settings.google_api_key``
                when omitted or empty.
        """
        self.api_key = api_key or settings.google_api_key
        self.model_name = settings.gemini_model
        self._model = None

        if self.api_key:
            try:
                # Imported lazily so the module still loads when the optional
                # package is not installed.
                import google.generativeai as genai

                genai.configure(api_key=self.api_key)
                self._model = genai.GenerativeModel(self.model_name)
                logger.info(f"Gemini client initialized with model: {self.model_name}")
            except ImportError:
                logger.warning("google-generativeai package not installed")
            except Exception as e:
                # Best effort: the parser is a fallback, so a setup failure
                # only disables it instead of breaking the import.
                logger.warning(f"Failed to initialize Gemini client: {e}")

    def is_available(self) -> bool:
        """Check if Gemini parsing is available."""
        return self._model is not None

    @staticmethod
    def _extract_json_payload(raw: str) -> str:
        """
        Isolate the JSON object inside a model reply.

        Handles replies wrapped in Markdown code fences (with any language
        tag, in any letter case) and replies with prose before or after the
        JSON. When no ``{...}`` span is found the stripped text is returned
        unchanged so that ``json.loads`` reports the problem.

        Args:
            raw: Raw text returned by the model.

        Returns:
            The substring expected to hold the JSON object.
        """
        payload = raw.strip()

        # Text between the first pair of fences (or after an unterminated one).
        parts = payload.split("```")
        if len(parts) > 1:
            payload = parts[1]

        # Slice from the first "{" to the last "}"; this also drops a leading
        # language tag such as "json" and any trailing commentary.
        start = payload.find("{")
        end = payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start:end + 1]

        return payload.strip()

    @staticmethod
    def _clean_value(value) -> Optional[str]:
        """
        Normalize one extracted value to a numeric string.

        Args:
            value: A value decoded from the model's JSON reply.

        Returns:
            The value as a string without ``$`` or thousands separators, or
            None when it is not a finite number.
        """
        # bool is a subclass of int; True/False are not amounts.
        if isinstance(value, bool):
            return None
        if isinstance(value, int):
            return str(value)
        if isinstance(value, float):
            # json.loads accepts NaN/Infinity; those are not usable amounts.
            return str(value) if math.isfinite(value) else None
        if isinstance(value, str):
            candidate = value.replace(",", "").replace("$", "").strip()
            try:
                number = float(candidate)
            except ValueError:
                return None
            # float() also parses "nan"/"inf"; reject them.
            return candidate if math.isfinite(number) else None
        return None

    def extract_line_values(
        self,
        text: str,
        line_numbers: Optional[list[str]] = None
    ) -> dict[str, Optional[str]]:
        """
        Use Gemini to extract T1 tax line values from text.

        Args:
            text: Text content from PDF. Only the first ``MAX_TEXT_CHARS``
                characters are sent to the model.
            line_numbers: Specific line numbers to extract. When omitted the
                model is asked for every line value it can find.

        Returns:
            Dictionary mapping line numbers to values (numeric strings).
            Entries whose value is missing or not a finite number are dropped.
            Empty or whitespace-only ``text`` yields an empty dictionary.

        Raises:
            PDFExtractionError: If the Gemini client is not available, the
                request fails, or the reply is not a JSON object.
        """
        if self._model is None:
            raise PDFExtractionError("Gemini client not available")

        # Nothing to extract from, so skip the API round trip.
        if not text or not text.strip():
            return {}

        if line_numbers:
            lines_str = ", ".join(line_numbers)
            target_lines = f"Extract values for these specific lines: {lines_str}"
        else:
            target_lines = "Extract all T1 tax line values you can find"

        prompt = f"""You are a Canadian T1 tax form data extractor.
Given the following text extracted from a T1 tax return PDF, extract the line values.
{target_lines}
T1 tax lines are 5-digit numbers (like 15000, 23600, 26000) followed by dollar amounts.
The format may vary:
- "15000 Total Income: $50,000.00"
- "Line 15000 50000"
- "Total income (line 15000) 50,000"
Return ONLY a valid JSON object mapping line numbers to their numeric values (without $ or commas).
Example: {{"15000": "50000.00", "23600": "45000.00"}}
If a line is not found, omit it from the response.
TEXT:
{text[:self.MAX_TEXT_CHARS]}
"""

        try:
            response = self._model.generate_content(
                prompt,
                # Copy so the shared class-level dict can never be mutated.
                generation_config=dict(self.GENERATION_CONFIG),
            )
            extracted = json.loads(self._extract_json_payload(response.text))
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse Gemini response as JSON: {e}")
            raise PDFExtractionError(f"Gemini returned invalid JSON: {e}") from e
        except Exception as e:
            logger.error(f"Gemini extraction failed: {e}")
            raise PDFExtractionError(f"Gemini extraction failed: {e}") from e

        # Valid JSON is not necessarily an object (could be a list or string).
        if not isinstance(extracted, dict):
            got = type(extracted).__name__
            logger.error(f"Failed to parse Gemini response as JSON: expected an object, got {got}")
            raise PDFExtractionError(
                f"Gemini returned invalid JSON: expected an object, got {got}"
            )

        logger.info(f"Gemini extracted {len(extracted)} line values")

        # Clean and validate values
        cleaned = {}
        for line_num, value in extracted.items():
            if value is None:
                # A null means "not found"; drop it quietly.
                continue
            clean_value = self._clean_value(value)
            if clean_value is None:
                logger.warning(f"Invalid value for line {line_num}: {value}")
                continue
            cleaned[line_num.strip()] = clean_value

        return cleaned
# Global parser instance, created when this module is imported.
# GeminiParser.__init__ logs client-initialization failures instead of
# raising them, so callers should check gemini_parser.is_available() before
# calling extract_line_values().
gemini_parser = GeminiParser()