# financial_analyst / ocr_parser.py
# Dmitry Beresnev
# fix max tickers limit
# 41f5ef9
"""
OCR and portfolio parsing module.
Handles:
- Text extraction from portfolio screenshots using Tesseract OCR
- Parsing tickers and amounts using regex
- JSON validation for user-edited portfolio data
- Image preprocessing for better OCR accuracy
"""
import re
import json
from typing import Dict, Tuple, Optional
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import numpy as np
# Largest number of tickers accepted by validate_portfolio_json.
MAX_TICKERS = 100

# Generic ticker/amount regexes for screenshots that are not in Revolut layout.
# Patterns 1-4 capture two groups (ticker and amount, in either order);
# pattern 5 captures the ticker only.
TICKER_PATTERNS = [
    # 1. Ticker, then amount: "AAPL 5000" / "AAPL $5,000.00"
    r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)',

    # 2. Amount, then ticker: "$5,000 AAPL"
    r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})',

    # 3. Ticker and amount split across two consecutive lines
    r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)',

    # 4. Ticker and amount joined by ":" or "|": "AAPL | $5,000.00"
    r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)',

    # 5. Revolut-style row "<shares> <TICKER> - <price>",
    #    e.g. "8,31 MU - 411,50'$ 4123,26%" (single capture group: the ticker)
    r'[\d,]+\.?\d*\s+([A-Z]{2,5})\s*[-–]\s*[\d,]+',
]

# Revolut two-line layout: an amount with a currency symbol ends one line and
# a "<TICKER> -" row follows on the next, e.g.
#   "@ Micron Technology 3420,14$"
#   "8,31 MU - 411,50'$ 4123,26%"
# Group 1 is the amount, group 2 the ticker.
REVOLUT_PATTERN = r'([\d,]+\.?\d*)\s*[\$€£]\s*\n.*?\s+([A-Z]{2,5})\s*[-–]'
def is_dark_theme(image: Image.Image) -> bool:
    """
    Detect if image uses dark theme (dark background, light text).

    The image is converted to grayscale and the mean brightness of its
    central region (the middle half of each dimension) is compared with 128.
    When that central region would be empty (an image only one pixel wide or
    tall), the whole image is sampled instead.

    Args:
        image: PIL Image object

    Returns:
        True if the sampled mean brightness is below 128, False otherwise
        (including for an image with no pixels at all)
    """
    gray = image.convert('L')
    width, height = gray.size

    # Central region: skip the outer quarter on every side.
    left, upper = width // 4, height // 4
    right, lower = 3 * width // 4, 3 * height // 4

    # Integer division collapses the box for 1-pixel dimensions; cropping to
    # an empty box would make the mean NaN, so keep the full image then.
    if right > left and lower > upper:
        gray = gray.crop((left, upper, right, lower))

    pixels = np.array(gray)
    if pixels.size == 0:
        return False

    # Convert the NumPy boolean to a plain bool to honour the annotation.
    return bool(pixels.mean() < 128)
def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Prepare a screenshot for OCR.

    Steps, in order:
    1. Grayscale conversion; screenshots that is_dark_theme() reports as dark
       are colour-inverted (via RGB) first
    2. Contrast enhancement with factor 2.0
    3. Sharpening with the SHARPEN filter
    4. LANCZOS upscaling by max(800 / width, 800 / height) when either
       side is shorter than 800 pixels

    No separate noise-reduction step is applied.

    Args:
        image: PIL Image object

    Returns:
        Preprocessed PIL Image object
    """
    if not is_dark_theme(image):
        prepared = image.convert('L')
    else:
        # Light text on a dark background: invert before going to grayscale
        from PIL import ImageOps
        prepared = ImageOps.invert(image.convert('RGB')).convert('L')

    prepared = ImageEnhance.Contrast(prepared).enhance(2.0)
    prepared = prepared.filter(ImageFilter.SHARPEN)

    # Large enough already: nothing more to do
    w, h = prepared.size
    if w >= 800 and h >= 800:
        return prepared

    # Scale so that the shorter side reaches the 800 px target
    factor = max(800 / w, 800 / h)
    target_size = (int(w * factor), int(h * factor))
    return prepared.resize(target_size, Image.Resampling.LANCZOS)
def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
    """
    Run Tesseract OCR on a portfolio screenshot.

    The image goes through preprocess_image() and is then recognised with up
    to two Tesseract configurations; the first one that yields non-blank
    text wins.

    Args:
        image: PIL Image object

    Returns:
        Tuple of (extracted_text, error_message)
        - On success: (text, None)
        - On failure: (None, error_message)
    """
    # Tried in order, both with --oem 3 (default OCR engine mode):
    #   --psm 6: assume a single uniform block of text
    #   --psm 4: assume a single column of text of variable sizes
    tesseract_configs = (
        r'--oem 3 --psm 6',
        r'--oem 3 --psm 4',
    )

    try:
        # Raises TesseractNotFoundError when the binary is missing
        pytesseract.get_tesseract_version()

        prepared = preprocess_image(image)

        for config in tesseract_configs:
            recognised = pytesseract.image_to_string(prepared, config=config)
            if recognised.strip():
                return recognised, None

        return None, "No text detected in image. Please upload a clearer screenshot or enter data manually."
    except pytesseract.TesseractNotFoundError:
        return None, "OCR engine (Tesseract) not available. Please check installation."
    except Exception as e:
        return None, f"OCR failed: {str(e)}"
# Ticker row: "[shares][TICKER]" followed by a separator or the end of the
# line, e.g. "8,31 MU» 386,56 $", "52,03 AMKR: 51$", "0,94LLY -1080", "2,5 NVDA".
_REVOLUT_TICKER_LINE_RE = re.compile(r'^[\d,]+[.,]?\d*\s*[A-Z]{2,5}(?:[\s\-–:*«»]|$)')
# Same shape anywhere in a line, capturing the ticker.
_REVOLUT_TICKER_RE = re.compile(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})(?:[\s\-–:*«»]|$)')
# Amount followed by a currency symbol, optionally with a stray ":" or "'"
# in between ("3 120,52: $", "240,92'$"). The lookbehinds stop a match from
# starting in the middle of a number.
_REVOLUT_VALUE_RE = re.compile(
    r"(?<!\d)(?<!\d[.,])(\d[\d\s,]*(?:[.,]\d{1,2})?)[:']?\s*[\$€£]"
)
# A sign directly in front of the number marks a negative change value.
_REVOLUT_MINUS_SIGNS = ('-', '–', '−')
# Upper-case tokens that can follow a number but are not tickers.
_REVOLUT_NON_TICKERS = frozenset({'AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF'})
# Smaller amounts are treated as percentages, share counts or similar noise.
_REVOLUT_MIN_POSITION = 50


def _parse_revolut_amount(raw: str) -> Optional[float]:
    """Convert a matched amount string to a float, or None if unparsable."""
    digits = re.sub(r'\s+', '', raw)  # "3 256,40" -> "3256,40"
    separators = [idx for idx, ch in enumerate(digits) if ch in '.,']

    if not separators:
        # OCR dropped the decimal separator: "172312" -> "1723.12"
        if len(digits) > 2:
            digits = f'{digits[:-2]}.{digits[-2:]}'
    elif len(separators) == 1:
        # European decimal comma: "3256,40" -> "3256.40"
        digits = digits.replace(',', '.')
    else:
        # Several separators: the last one is the decimal point when it is
        # followed by 1-2 digits ("3,212.85"); otherwise all of them are
        # thousands separators ("1,234,567").
        last = separators[-1]
        whole = re.sub(r'[.,]', '', digits[:last])
        fraction = digits[last + 1:]
        if not fraction:
            return None
        digits = f'{whole}.{fraction}' if len(fraction) <= 2 else whole + fraction

    try:
        return float(digits)
    except ValueError:
        return None


def _find_revolut_value(line: str) -> Optional[float]:
    """Return the first non-negative currency amount in *line*, else None."""
    for match in _REVOLUT_VALUE_RE.finditer(line):
        if line[:match.start()].endswith(_REVOLUT_MINUS_SIGNS):
            continue  # negative change such as "-1080,46$"
        return _parse_revolut_amount(match.group(1))
    return None


def parse_revolut_format(text: str) -> Dict[str, float]:
    """
    Parse Revolut-specific format.

    Revolut format (typically 2 lines per stock):
        Line 1: [icon] Company Name [portfolio_value]$
        Line 2: [shares] TICKER[separator] [price_per_share]$ [change%]

    Example:
        Line 1: "@ Micron Technology 3 212,85 $"
        Line 2: "8,31 MU» 386,56 $ 4 109,73%"

    Each value line (an amount of at least 50 followed by $, € or £) is
    paired with the ticker found on one of the next two lines.

    Handles variations:
    - Whitespace inside numbers: "3 256,40"
    - Separators after the ticker: "-", "–", ":", "*", "»", "«", whitespace,
      or nothing at all when the ticker ends the line
    - Numbers without decimals: "172312" is read as 1723.12
    - Mixed separators: "3,212.85" is read as 3212.85
    - Amounts with a minus sign directly in front are never taken as values

    Args:
        text: Extracted text from OCR

    Returns:
        Dictionary mapping tickers to amounts; when a ticker occurs more
        than once the first amount is kept
    """
    portfolio: Dict[str, float] = {}
    lines = text.split('\n')

    i = 0
    while i < len(lines):
        current_line = lines[i].strip()

        # Blank lines carry nothing; ticker rows are only consumed as the
        # lookahead of a preceding value line.
        if not current_line or _REVOLUT_TICKER_LINE_RE.match(current_line):
            i += 1
            continue

        amount = _find_revolut_value(current_line)
        if amount is None or amount < _REVOLUT_MIN_POSITION:
            i += 1
            continue

        # Look for the ticker on the next one or two lines.
        advance = 1
        for lookahead in (1, 2):
            if i + lookahead >= len(lines):
                break
            ticker_match = _REVOLUT_TICKER_RE.search(lines[i + lookahead].strip())
            if ticker_match is None:
                continue
            ticker = ticker_match.group(1)
            if ticker in _REVOLUT_NON_TICKERS:
                continue
            # Keep the first amount seen for a ticker.
            portfolio.setdefault(ticker, amount)
            advance = lookahead + 1  # resume after the ticker row
            break
        i += advance

    return portfolio
def _parse_generic_amount(amount_str: str) -> float:
    """
    Convert an amount captured by a generic pattern to a float.

    Currency symbols and whitespace are dropped. A final comma followed by
    one or two digits is a decimal comma ("411,50" -> 411.5) unless the
    string also contains a dot; every other comma is a thousands separator
    ("5,000.00" -> 5000.0, "5,000" -> 5000.0).

    Raises:
        ValueError: if the cleaned string is not a number
    """
    cleaned = re.sub(r'[\$€£\s]', '', amount_str)
    if ',' in cleaned:
        head, _, tail = cleaned.rpartition(',')
        if '.' not in cleaned and 1 <= len(tail) <= 2:
            cleaned = f"{head.replace(',', '')}.{tail}"
        else:
            cleaned = cleaned.replace(',', '')
    return float(cleaned)


def parse_portfolio(text: str) -> Dict[str, float]:
    """
    Parse portfolio from extracted text using multiple regex patterns.

    The Revolut-specific parser runs first and its result is returned when
    it finds anything. Otherwise every regex in TICKER_PATTERNS is applied
    (case-insensitively) to handle formats such as:
    - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00"
    - Amount followed by ticker: "$5,000 AAPL"
    - Multi-line format: ticker on one line, amount on next
    - With separators: "AAPL | $5,000.00"

    Patterns that do not capture exactly two groups (ticker and amount)
    cannot yield an amount and are ignored. When a ticker matches more than
    once, the largest amount is kept. Amounts of 1 or less, currency codes,
    month abbreviations, "AM"/"PM"/"ID" and tickers longer than five letters
    are dropped as false positives.

    Args:
        text: Extracted text from OCR

    Returns:
        Dictionary mapping tickers to amounts: {ticker: amount}
        Returns empty dict if no valid tickers found
    """
    if not text:
        return {}

    # Revolut layout takes priority over the generic patterns.
    revolut_portfolio = parse_revolut_format(text)
    if revolut_portfolio:
        return revolut_portfolio

    # Exact tokens that look like tickers but are not.
    false_positives = frozenset({
        'ID', 'USD', 'EUR', 'GBP', 'JPY', 'CHF',  # Currency codes
        'AM', 'PM',  # Time indicators
        'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
        'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',  # Months
    })

    portfolio: Dict[str, float] = {}
    for pattern in TICKER_PATTERNS:
        for match in re.findall(pattern, text, re.MULTILINE | re.IGNORECASE):
            # findall yields plain strings for single-group patterns; those
            # carry no amount, so there is nothing to record.
            if not isinstance(match, tuple) or len(match) != 2:
                continue

            # Whichever group is purely numeric is the amount.
            first, second = match
            if re.fullmatch(r'[\d,.]+', first):
                amount_str, ticker = first, second.upper()
            else:
                ticker, amount_str = first.upper(), second

            if not re.fullmatch(r'[A-Z]{1,5}', ticker) or ticker in false_positives:
                continue

            try:
                amount = _parse_generic_amount(amount_str)
            except ValueError:
                continue

            # Amounts <= 1 are likely percentages; keep the largest per ticker.
            if amount > 1 and amount > portfolio.get(ticker, 0.0):
                portfolio[ticker] = amount

    return portfolio
def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]:
    """
    Validate user-edited portfolio JSON.

    Expected format: {"AAPL": 5000, "GOOGL": 3000, ...}

    Rules:
    - The document must be a JSON object with at least one entry and no
      more than MAX_TICKERS entries
    - Tickers are upper-case strings of 1-10 characters
    - Amounts must convert to a finite float greater than zero; JSON
      booleans and the non-standard NaN / Infinity constants are rejected

    Args:
        json_str: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_dict, error_message)
        - If valid: (True, portfolio_dict, "")
        - If invalid: (False, None, error_message)
    """
    import math  # local import: not part of the module's import block

    if not json_str or not json_str.strip():
        return False, None, "JSON is empty"

    try:
        data = json.loads(json_str)

        if not isinstance(data, dict):
            return False, None, "JSON must be a dictionary/object, not a list or other type"

        portfolio = {}
        for ticker, amount in data.items():
            if not isinstance(ticker, str):
                return False, None, f"Ticker '{ticker}' must be a string"

            # Length first, so an empty key is not reported as a casing issue
            if len(ticker) < 1 or len(ticker) > 10:
                return False, None, f"Ticker '{ticker}' length should be between 1-10 characters"

            if not ticker.isupper():
                return False, None, f"Ticker '{ticker}' should be uppercase (e.g., 'AAPL' not 'aapl')"

            # bool is an int subclass, so float(True) would otherwise pass
            if isinstance(amount, bool):
                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

            try:
                amount_float = float(amount)
            except (TypeError, ValueError):
                return False, None, f"Amount for {ticker} must be a number, got: {amount}"
            except OverflowError:
                # Integer too large to be represented as a float
                return False, None, f"Amount for {ticker} must be a finite number, got: {amount}"

            # json.loads accepts NaN/Infinity, and NaN passes a "<= 0" test
            if not math.isfinite(amount_float):
                return False, None, f"Amount for {ticker} must be a finite number, got: {amount_float}"

            if amount_float <= 0:
                return False, None, f"Amount for {ticker} must be positive, got: {amount_float}"

            portfolio[ticker] = amount_float

        if not portfolio:
            return False, None, "Portfolio must contain at least one ticker"

        if len(portfolio) > MAX_TICKERS:
            return False, None, f"Portfolio exceeds maximum of {MAX_TICKERS} tickers"

        return True, portfolio, ""
    except json.JSONDecodeError as e:
        return False, None, f"Invalid JSON format: {str(e)}"
    except Exception as e:
        # Boundary for user input: report anything unexpected, never raise
        return False, None, f"Validation error: {str(e)}"
def merge_portfolios(portfolios: list[Dict[str, float]]) -> Dict[str, float]:
    """
    Combine several portfolios into one.

    Amounts of a ticker that occurs in more than one portfolio are added up;
    tickers keep the order in which they are first seen.

    Args:
        portfolios: List of portfolio dictionaries

    Returns:
        New dictionary with the summed amount per ticker
    """
    totals = {}
    holdings = (
        entry
        for single in portfolios
        for entry in single.items()
    )
    for symbol, value in holdings:
        try:
            totals[symbol] += value
        except KeyError:
            # First occurrence of this ticker
            totals[symbol] = value
    return totals
def format_portfolio_json(portfolio: Dict[str, float], indent: int = 2) -> str:
    """
    Serialise a portfolio to human-readable JSON.

    Keys are emitted in sorted order.

    Args:
        portfolio: Dictionary of {ticker: amount}
        indent: Number of spaces for indentation

    Returns:
        Formatted JSON string
    """
    dump_options = {"indent": indent, "sort_keys": True}
    return json.dumps(portfolio, **dump_options)