Spaces:

ResearchEngineering
/

financial_analyst

Running

financial_analyst / ocr_parser.py

Dmitry Beresnev

fix max tickers limit

41f5ef9 1 day ago

16.1 kB

	"""
	OCR and portfolio parsing module.

	Handles:
	- Text extraction from portfolio screenshots using Tesseract OCR
	- Parsing tickers and amounts using regex
	- JSON validation for user-edited portfolio data
	- Image preprocessing for better OCR accuracy
	"""

	import re
	import json
	from typing import Dict, Tuple, Optional
	from PIL import Image, ImageEnhance, ImageFilter
	import pytesseract
	import numpy as np


	# Upper bound on the number of tickers accepted by validate_portfolio_json.
	MAX_TICKERS = 100


	# Generic regex patterns, tried in order, to handle different screenshot
	# layouts. Patterns 1-4 capture two groups (ticker and amount, in either
	# order); pattern 5 captures the ticker only.
	TICKER_PATTERNS = [
	    # Pattern 1: Ticker followed by amount (AAPL 5000 or AAPL $5,000.00)
	    r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)',
	    # Pattern 2: Amount followed by ticker ($5,000 AAPL)
	    r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})',
	    # Pattern 3: Ticker on one line, amount on next (multi-line)
	    r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)',
	    # Pattern 4: With separators (AAPL | $5,000.00 or AAPL: 5000)
	    r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)',
	    # Pattern 5: Revolut format - line with ticker and dash
	    # Example: "8,31 MU - 411,50'$ 4123,26%"
	    r'[\d,]+\.?\d*\s+([A-Z]{2,5})\s*[-–]\s*[\d,]+',
	]

	# Revolut-specific pattern: portfolio value with a currency sign at the end
	# of one line, followed on the next line by "[shares] TICKER -".
	# Example: "@ Micron Technology 3420,14$" followed by "8,31 MU - 411,50'$ 4123,26%"
	# Captures (portfolio_value, ticker).
	REVOLUT_PATTERN = r'([\d,]+\.?\d*)\s*[\$€£]\s*\n.*?\s+([A-Z]{2,5})\s*[-–]'


	def is_dark_theme(image: Image.Image) -> bool:
	    """
	    Detect if image uses dark theme (dark background, light text).

	    The image is converted to grayscale and the mean brightness of its
	    central region (the middle half in each dimension) is compared
	    against the mid-point of the 0-255 range.

	    Args:
	        image: PIL Image object

	    Returns:
	        True if the sampled mean brightness is below 128, False otherwise
	        (including when the image contains no pixels)
	    """
	    # Convert to grayscale so every pixel is a single brightness value
	    gray = image.convert('L')

	    # Sample pixels from center region (avoid edges)
	    width, height = gray.size
	    sample_region = gray.crop((
	        width // 4,
	        height // 4,
	        3 * width // 4,
	        3 * height // 4,
	    ))
	    pixels = np.array(sample_region)

	    # For very small images the integer-divided crop box can be empty;
	    # fall back to the whole image instead of averaging nothing.
	    if pixels.size == 0:
	        pixels = np.array(gray)
	    if pixels.size == 0:
	        return False

	    # Wrap in bool() so callers get a plain Python bool, not numpy.bool_
	    return bool(np.mean(pixels) < 128)


	def preprocess_image(image: Image.Image) -> Image.Image:
	    """
	    Preprocess image for better OCR accuracy.

	    Applies, in order:
	    - Dark theme detection and color inversion if needed
	    - Grayscale conversion
	    - Contrast enhancement (factor 2.0)
	    - Sharpening
	    - Upscaling so that both sides are at least 800 px

	    Args:
	        image: PIL Image object

	    Returns:
	        Preprocessed PIL Image object (grayscale, mode 'L')
	    """
	    # Detect dark theme and invert if necessary
	    if is_dark_theme(image):
	        # Invert colors for dark theme (makes OCR more accurate).
	        # The image is converted to RGB before inversion and to
	        # grayscale afterwards.
	        from PIL import ImageOps
	        image = ImageOps.invert(image.convert('RGB')).convert('L')
	    else:
	        # Convert to grayscale
	        image = image.convert('L')

	    # Increase contrast
	    enhancer = ImageEnhance.Contrast(image)
	    image = enhancer.enhance(2.0)

	    # Sharpen
	    image = image.filter(ImageFilter.SHARPEN)

	    # Resize if image is too small (helps with OCR). The larger of the two
	    # ratios is used so that both dimensions reach at least 800 px while
	    # the aspect ratio is preserved.
	    width, height = image.size
	    if width < 800 or height < 800:
	        scale = max(800 / width, 800 / height)
	        new_size = (int(width * scale), int(height * scale))
	        image = image.resize(new_size, Image.Resampling.LANCZOS)

	    return image


	def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
	    """
	    Extract text from uploaded portfolio screenshot using Tesseract OCR.

	    Uses image preprocessing and custom Tesseract config for better accuracy.
	    OCR is attempted with page segmentation mode 6 first and retried with
	    mode 4 when the first attempt yields only whitespace.

	    Args:
	        image: PIL Image object

	    Returns:
	        Tuple of (extracted_text, error_message)
	        - If successful: (text, None)
	        - If failed: (None, error_message)

	    No exception escapes this function: every failure is converted into
	    the (None, error_message) form.
	    """
	    try:
	        # Verify tesseract is available (raises if the binary is missing)
	        pytesseract.get_tesseract_version()

	        # Preprocess image for better OCR
	        processed_image = preprocess_image(image)

	        # Custom Tesseract configuration for better accuracy
	        # --psm 6: Assume a single uniform block of text
	        # --oem 3: Use default OCR Engine mode
	        custom_config = r'--oem 3 --psm 6'

	        # Extract text with custom config
	        text = pytesseract.image_to_string(processed_image, config=custom_config)

	        # If first attempt fails, try with different PSM mode
	        if not text.strip():
	            # PSM 4: Assume a single column of text of variable sizes
	            custom_config = r'--oem 3 --psm 4'
	            text = pytesseract.image_to_string(processed_image, config=custom_config)

	        # Check if any text was detected
	        if not text.strip():
	            return None, "No text detected in image. Please upload a clearer screenshot or enter data manually."

	        return text, None

	    except pytesseract.TesseractNotFoundError:
	        return None, "OCR engine (Tesseract) not available. Please check installation."
	    except Exception as e:
	        # Boundary handler: report any other OCR/preprocessing failure
	        # to the caller as a message instead of raising.
	        return None, f"OCR failed: {str(e)}"


	def parse_revolut_format(text: str) -> Dict[str, float]:
	    """
	    Parse Revolut-specific format.

	    Revolut format (typically 2 lines per stock):
	    Line 1: [icon] Company Name [portfolio_value]$
	    Line 2: [shares] TICKER[separator] [price_per_share]$ [change%]

	    Examples:
	    Line 1: "@ Micron Technology 3 212,85 $"
	    Line 2: "8,31 MU» 386,56 $ 4 109,73%"

	    Handles variations:
	    - Spaces in numbers: "3 256,40"
	    - Different separators after ticker: "-", ":", "*", "»", "«"
	    - Numbers without decimals: "172312"
	    - Negative values in change column (never used as a portfolio value)

	    Args:
	        text: Extracted text from OCR

	    Returns:
	        Dictionary mapping tickers to amounts. When a ticker appears more
	        than once, the first amount found is kept.
	    """
	    if not text:
	        return {}

	    # Ticker line: "[shares] [TICKER][separator]", anchored at line start.
	    # Example: "8,31 MU» 386,56 $" or "52,03 AMKR: 51$" or "0,94LLY -1080"
	    # (the space between shares and ticker is optional because OCR may drop it).
	    ticker_line_re = re.compile(r'^[\d,]+[.,]?\d*\s*[A-Z]{2,5}[\s\-–:*«»]')
	    # Same shape, unanchored, capturing the ticker for the lookahead step.
	    ticker_re = re.compile(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})[\s\-–:*«»]')
	    # Amount followed by a currency sign, with an optional colon/apostrophe
	    # in between: "3 120,52: $" or "240,92'$".
	    # Group 1 captures a leading minus/en dash so that negative change
	    # values such as "-1080,46$" are consumed as a whole token and rejected,
	    # instead of being re-matched from their second digit. The lookbehind
	    # stops a match from starting in the middle of a number.
	    value_re = re.compile(
	        r'(?<![\d,])([\-–]?)(\d[\d\s,]*(?:[.,]\d{1,2})?)[:\']?\s*[\$€£]'
	    )
	    # Uppercase tokens that look like tickers but are not.
	    not_tickers = {'AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF'}

	    portfolio: Dict[str, float] = {}
	    lines = text.split('\n')

	    i = 0
	    while i < len(lines):
	        current_line = lines[i].strip()

	        # Skip empty lines and ticker lines; ticker lines are only ever
	        # consumed as the lookahead of a preceding value line.
	        if not current_line or ticker_line_re.match(current_line):
	            i += 1
	            continue

	        # First non-negative currency amount on the line, if any
	        value_match = next(
	            (m for m in value_re.finditer(current_line) if not m.group(1)),
	            None,
	        )
	        if value_match is None:
	            i += 1
	            continue

	        # Clean portfolio value:
	        # 1. Remove all whitespace (including non-breaking spaces):
	        #    "3 256,40" -> "3256,40"
	        clean_value = re.sub(r'\s+', '', value_match.group(2))

	        if not re.search(r'[.,]', clean_value) and len(clean_value) > 2:
	            # 2. No decimal separator and more than 2 digits: assume the
	            #    last 2 digits are cents. Example: "172312" -> "1723.12"
	            clean_value = clean_value[:-2] + '.' + clean_value[-2:]
	        else:
	            # 3. Comma as decimal separator: "3256,40" -> "3256.40"
	            clean_value = clean_value.replace(',', '.')

	        try:
	            amount = float(clean_value)
	        except ValueError:
	            i += 1
	            continue

	        # Filter out small amounts (likely percentages, share counts, or
	        # other data); only values of at least 50 are treated as positions.
	        if amount < 50:
	            i += 1
	            continue

	        # Look ahead 1-2 lines for the ticker belonging to this value
	        step = 1
	        for lookahead in (1, 2):
	            if i + lookahead >= len(lines):
	                break

	            ticker_match = ticker_re.search(lines[i + lookahead].strip())
	            if ticker_match is None:
	                continue

	            ticker = ticker_match.group(1)
	            if ticker in not_tickers:
	                continue

	            # Only add if not already present (avoid duplicates)
	            portfolio.setdefault(ticker, amount)
	            step = lookahead + 1  # Skip to line after ticker line
	            break

	        i += step

	    return portfolio


	def _parse_amount(amount_str: str) -> float:
	    """Convert a matched amount string to float; raises ValueError if not numeric."""
	    # Remove currency symbols and whitespace
	    cleaned = re.sub(r'[\$€£\s]', '', amount_str)
	    if '.' not in cleaned and re.search(r',\d{1,2}$', cleaned):
	        # European decimal format: the final comma is the decimal
	        # separator ("3256,40" -> "3256.40"); earlier commas are grouping.
	        whole, _, fraction = cleaned.rpartition(',')
	        cleaned = whole.replace(',', '') + '.' + fraction
	    else:
	        # Commas are thousands separators: "5,000.00" -> "5000.00"
	        cleaned = cleaned.replace(',', '')
	    return float(cleaned)


	def parse_portfolio(text: str) -> Dict[str, float]:
	    """
	    Parse portfolio from extracted text using multiple regex patterns.

	    Tries various patterns to handle different screenshot formats:
	    - Revolut format (priority)
	    - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00"
	    - Amount followed by ticker: "$5,000 AAPL"
	    - Multi-line format: ticker on one line, amount on next
	    - With separators: "AAPL | $5,000.00"

	    Args:
	        text: Extracted text from OCR

	    Returns:
	        Dictionary mapping tickers to amounts: {ticker: amount}
	        Returns empty dict if no valid tickers found
	    """
	    if not text:
	        return {}

	    # First, try Revolut-specific parser; any result from it wins outright
	    revolut_portfolio = parse_revolut_format(text)
	    if revolut_portfolio:
	        return revolut_portfolio

	    # Common false positives, compared as whole tokens so that real
	    # tickers merely starting with one of them (e.g. a month prefix) are
	    # not discarded.
	    false_positives = {
	        'ID', 'USD', 'EUR', 'GBP', 'JPY', 'CHF',  # Currency codes
	        'AM', 'PM',  # Time indicators
	        'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
	        'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',  # Months
	    }

	    # Fall back to generic patterns
	    portfolio: Dict[str, float] = {}

	    for pattern in TICKER_PATTERNS:
	        for match in re.findall(pattern, text, re.MULTILINE | re.IGNORECASE):
	            # Patterns with a single capture group yield plain strings and
	            # carry no amount; only (group1, group2) pairs are usable here.
	            if not isinstance(match, tuple) or len(match) != 2:
	                continue
	            group1, group2 = match

	            # Determine which group is the ticker and which is the amount:
	            # a purely numeric first group means "amount first" format.
	            if re.match(r'^[\d,.]+$', group1):
	                amount_str, ticker = group1, group2.upper()
	            else:
	                ticker, amount_str = group1.upper(), group2

	            # Validate ticker (1-10 uppercase letters)
	            if not re.match(r'^[A-Z]{1,10}$', ticker):
	                continue

	            try:
	                amount = _parse_amount(amount_str)
	            except ValueError:
	                # Skip invalid matches
	                continue

	            # Only include positive amounts > 1 (filter out percentages, etc.)
	            if amount <= 1:
	                continue

	            # If ticker already exists, keep the larger amount
	            if ticker not in portfolio or amount > portfolio[ticker]:
	                portfolio[ticker] = amount

	    # Additional heuristics: drop known false positives and very long
	    # "tickers" (6+ letters, likely parts of words).
	    return {
	        ticker: amount
	        for ticker, amount in portfolio.items()
	        if ticker not in false_positives and len(ticker) < 6
	    }


	def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]:
	    """
	    Validate user-edited portfolio JSON.

	    Expected format: {"AAPL": 5000, "GOOGL": 3000, ...}

	    Validation stops at the first problem found. Amounts must be positive,
	    finite numbers; JSON booleans and the non-standard NaN/Infinity
	    literals accepted by json.loads are rejected.

	    Args:
	        json_str: JSON string to validate

	    Returns:
	        Tuple of (is_valid, parsed_dict, error_message)
	        - If valid: (True, portfolio_dict, "")
	        - If invalid: (False, None, error_message)
	    """
	    import math  # local import: only needed for the finiteness check below

	    if not json_str or not json_str.strip():
	        return False, None, "JSON is empty"

	    try:
	        # Parse JSON
	        data = json.loads(json_str)

	        # Validate it's a dictionary
	        if not isinstance(data, dict):
	            return False, None, "JSON must be a dictionary/object, not a list or other type"

	        # Validate all keys are strings and all values are numbers
	        portfolio = {}
	        for ticker, amount in data.items():
	            # Check ticker is string
	            if not isinstance(ticker, str):
	                return False, None, f"Ticker '{ticker}' must be a string"

	            # Check ticker is uppercase
	            if not ticker.isupper():
	                return False, None, f"Ticker '{ticker}' should be uppercase (e.g., 'AAPL' not 'aapl')"

	            # Check ticker length (1-10 characters allowed)
	            if len(ticker) < 1 or len(ticker) > 10:
	                return False, None, f"Ticker '{ticker}' length should be between 1-10 characters"

	            # bool is a subclass of int, so float(True) would silently
	            # succeed; treat JSON true/false as non-numeric.
	            if isinstance(amount, bool):
	                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

	            # Check amount is numeric
	            try:
	                amount_float = float(amount)
	            except (TypeError, ValueError, OverflowError):
	                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

	            # NaN compares False against everything and would slip through
	            # the positivity check; infinity is not a usable amount either.
	            if not math.isfinite(amount_float):
	                return False, None, f"Amount for {ticker} must be a finite number, got: {amount}"

	            # Check amount is positive
	            if amount_float <= 0:
	                return False, None, f"Amount for {ticker} must be positive, got: {amount_float}"

	            portfolio[ticker] = amount_float

	        # Check we have at least one ticker
	        if not portfolio:
	            return False, None, "Portfolio must contain at least one ticker"

	        # Check we don't exceed maximum tickers
	        if len(portfolio) > MAX_TICKERS:
	            return False, None, f"Portfolio exceeds maximum of {MAX_TICKERS} tickers"

	        return True, portfolio, ""

	    except json.JSONDecodeError as e:
	        return False, None, f"Invalid JSON format: {str(e)}"
	    except Exception as e:
	        # Boundary handler: report anything unexpected as a validation error
	        return False, None, f"Validation error: {str(e)}"


	def merge_portfolios(portfolios: list[Dict[str, float]]) -> Dict[str, float]:
	    """
	    Merge multiple portfolio dictionaries.

	    If the same ticker appears in multiple portfolios, amounts are summed.
	    The input dictionaries are not modified.

	    Args:
	        portfolios: List of portfolio dictionaries

	    Returns:
	        Merged portfolio dictionary with summed amounts
	        (empty dict when the list is empty)
	    """
	    merged: Dict[str, float] = {}

	    for portfolio in portfolios:
	        for ticker, amount in portfolio.items():
	            if ticker in merged:
	                merged[ticker] += amount
	            else:
	                merged[ticker] = amount

	    return merged


	def format_portfolio_json(portfolio: Dict[str, float], indent: int = 2) -> str:
	    """
	    Format portfolio dictionary as pretty-printed JSON.

	    Keys are emitted in sorted order.

	    Args:
	        portfolio: Dictionary of {ticker: amount}
	        indent: Number of spaces for indentation

	    Returns:
	        Formatted JSON string
	    """
	    return json.dumps(portfolio, indent=indent, sort_keys=True)