""" OCR and portfolio parsing module. Handles: - Text extraction from portfolio screenshots using Tesseract OCR - Parsing tickers and amounts using regex - JSON validation for user-edited portfolio data - Image preprocessing for better OCR accuracy """ import re import json from typing import Dict, Tuple, Optional from PIL import Image, ImageEnhance, ImageFilter import pytesseract import numpy as np MAX_TICKERS = 100 # Multiple regex patterns to handle different formats TICKER_PATTERNS = [ # Pattern 1: Ticker followed by amount (AAPL 5000 or AAPL $5,000.00) r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)', # Pattern 2: Amount followed by ticker ($5,000 AAPL) r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})', # Pattern 3: Ticker on one line, amount on next (multi-line) r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)', # Pattern 4: With separators (AAPL | $5,000.00) r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)', # Pattern 5: Revolut format - line with ticker and dash # Example: "8,31 MU - 411,50'$ 4123,26%" r'[\d,]+\.?\d*\s+([A-Z]{2,5})\s*[-–]\s*[\d,]+', ] # Revolut-specific pattern: Company name followed by portfolio value # Example: "@ Micron Technology 3420,14$" followed by "8,31 MU - 411,50'$ 4123,26%" REVOLUT_PATTERN = r'([\d,]+\.?\d*)\s*[\$€£]\s*\n.*?\s+([A-Z]{2,5})\s*[-–]' def is_dark_theme(image: Image.Image) -> bool: """ Detect if image uses dark theme (dark background, light text). Args: image: PIL Image object Returns: True if dark theme detected, False otherwise """ # Convert to grayscale gray = image.convert('L') # Sample pixels from center region (avoid edges) width, height = gray.size sample_region = gray.crop(( width // 4, height // 4, 3 * width // 4, 3 * height // 4 )) # Calculate average brightness pixels = np.array(sample_region) avg_brightness = np.mean(pixels) # If average brightness < 128, it's likely a dark theme return avg_brightness < 128 def preprocess_image(image: Image.Image) -> Image.Image: """ Preprocess image for better OCR accuracy. Applies: - Dark theme detection and inversion if needed - Grayscale conversion - Contrast enhancement - Sharpening - Noise reduction - Upscaling for small images Args: image: PIL Image object Returns: Preprocessed PIL Image object """ # Detect dark theme and invert if necessary if is_dark_theme(image): # Invert colors for dark theme (makes OCR more accurate) from PIL import ImageOps image = ImageOps.invert(image.convert('RGB')).convert('L') else: # Convert to grayscale image = image.convert('L') # Increase contrast enhancer = ImageEnhance.Contrast(image) image = enhancer.enhance(2.0) # Sharpen image = image.filter(ImageFilter.SHARPEN) # Resize if image is too small (helps with OCR) width, height = image.size if width < 800 or height < 800: scale = max(800 / width, 800 / height) new_size = (int(width * scale), int(height * scale)) image = image.resize(new_size, Image.Resampling.LANCZOS) return image def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]: """ Extract text from uploaded portfolio screenshot using Tesseract OCR. Uses image preprocessing and custom Tesseract config for better accuracy. Args: image: PIL Image object Returns: Tuple of (extracted_text, error_message) - If successful: (text, None) - If failed: (None, error_message) """ try: # Verify tesseract is available pytesseract.get_tesseract_version() # Preprocess image for better OCR processed_image = preprocess_image(image) # Custom Tesseract configuration for better accuracy # --psm 6: Assume a single uniform block of text # --oem 3: Use default OCR Engine mode custom_config = r'--oem 3 --psm 6' # Extract text with custom config text = pytesseract.image_to_string(processed_image, config=custom_config) # If first attempt fails, try with different PSM mode if not text.strip(): # PSM 4: Assume a single column of text of variable sizes custom_config = r'--oem 3 --psm 4' text = pytesseract.image_to_string(processed_image, config=custom_config) # Check if any text was detected if not text.strip(): return None, "No text detected in image. Please upload a clearer screenshot or enter data manually." return text, None except pytesseract.TesseractNotFoundError: return None, "OCR engine (Tesseract) not available. Please check installation." except Exception as e: return None, f"OCR failed: {str(e)}" def parse_revolut_format(text: str) -> Dict[str, float]: """ Parse Revolut-specific format. Revolut format (typically 2 lines per stock): Line 1: [icon] Company Name [portfolio_value]$ Line 2: [shares] TICKER[separator] [price_per_share]$ [change%] Examples: Line 1: "@ Micron Technology 3 212,85 $" Line 2: "8,31 MU» 386,56 $ 4 109,73%" Handles variations: - Spaces in numbers: "3 256,40" - Different separators after ticker: "-", ":", "*", "»", "«" - Numbers without decimals: "172312" - Negative values in change column Args: text: Extracted text from OCR Returns: Dictionary mapping tickers to amounts """ portfolio = {} lines = text.split('\n') # Process lines i = 0 while i < len(lines): current_line = lines[i].strip() # Skip empty lines if not current_line: i += 1 continue # Check if this is a TICKER line (not a value line) # Ticker lines start with: [shares] [TICKER][separator] # Example: "8,31 MU» 386,56 $" or "52,03 AMKR: 51$" or "0,94LLY -1080" is_ticker_line = re.match(r'^[\d,]+[.,]?\d*\s*[A-Z]{2,5}[\s\-–:*«»]', current_line) if is_ticker_line: # This is a ticker line, skip it (it's already been processed as lookahead) i += 1 continue # Look for portfolio value line (contains amount with $, €, £) # IMPORTANT: Match dollar amounts that are NOT preceded by a negative sign # Avoid matching negative change values like "-1080,46$" # Allow optional colon/apostrophe before currency: "3 120,52: $" or "240,92'$" value_match = re.search(r'(? "3256,40" clean_value = portfolio_value_str.replace(' ', '') # 2. Handle numbers without decimal separators # If no decimal (. or ,) and more than 2 digits, assume last 2 are cents # Example: "172312" -> "1723.12" if not re.search(r'[.,]', clean_value) and len(clean_value) > 2: # Insert decimal before last 2 digits clean_value = clean_value[:-2] + '.' + clean_value[-2:] else: # 3. Replace comma with dot for European format: "3256,40" -> "3256.40" clean_value = clean_value.replace(',', '.') try: amount = float(clean_value) # Filter out very small amounts (likely percentages, share counts, or other data) # Portfolio positions are typically > 50 (even small positions) if amount < 50: i += 1 continue except ValueError: i += 1 continue # Look ahead 1-2 lines for ticker ticker_found = False for lookahead in range(1, 3): # Check next 1-2 lines if i + lookahead >= len(lines): break check_line = lines[i + lookahead].strip() # Match ticker patterns: [shares] [TICKER][separator] # Examples: "8,31 MU -" or "52,03 AMKR:" or "5,06 GOOGL*" or "5,06 TSM «" # Also handles OCR errors with missing space: "0,94LLY" # Ticker can be followed by: -, :, *, », «, space, or end of significant text ticker_match = re.search(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})[\s\-–:*«»]', check_line) if ticker_match: ticker = ticker_match.group(1) # Validate ticker (not a word fragment or common false positive) if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF']: # Only add if not already present (avoid duplicates) if ticker not in portfolio: portfolio[ticker] = amount ticker_found = True i += lookahead + 1 # Skip to line after ticker line break if not ticker_found: i += 1 else: i += 1 return portfolio def parse_portfolio(text: str) -> Dict[str, float]: """ Parse portfolio from extracted text using multiple regex patterns. Tries various patterns to handle different screenshot formats: - Revolut format (priority) - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00" - Amount followed by ticker: "$5,000 AAPL" - Multi-line format: ticker on one line, amount on next - With separators: "AAPL | $5,000.00" Args: text: Extracted text from OCR Returns: Dictionary mapping tickers to amounts: {ticker: amount} Returns empty dict if no valid tickers found """ if not text: return {} # First, try Revolut-specific parser revolut_portfolio = parse_revolut_format(text) if revolut_portfolio: return revolut_portfolio # Fall back to generic patterns portfolio = {} # Try each pattern for pattern in TICKER_PATTERNS: matches = re.findall(pattern, text, re.MULTILINE | re.IGNORECASE) for match in matches: try: # Determine which group is ticker and which is amount # Check which one looks like a number group1, group2 = match # Check if group1 is a number (amount first format) if re.match(r'^[\d,.]+$', group1): amount_str = group1 ticker = group2.upper() else: ticker = group1.upper() amount_str = group2 # Validate ticker (1-10 uppercase letters) if not re.match(r'^[A-Z]{1,10}$', ticker): continue # Clean and parse amount # Remove currency symbols, commas, spaces clean_amount = re.sub(r'[\$€£,\s]', '', amount_str) # Handle European decimal format (comma as decimal separator) clean_amount = clean_amount.replace(',', '.') # Convert to float amount = float(clean_amount) # Only include positive amounts > 1 (filter out percentages, etc.) if amount > 1: # If ticker already exists, keep the larger amount if ticker not in portfolio or amount > portfolio[ticker]: portfolio[ticker] = amount except (ValueError, IndexError, AttributeError): # Skip invalid matches continue # Additional heuristics: filter out common false positives # Remove entries that look like dates, IDs, etc. false_positive_patterns = [ r'^ID$', r'^USD$', r'^EUR$', r'^GBP$', r'^JPY$', r'^CHF$', # Currency codes r'^AM$', r'^PM$', # Time indicators r'^JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC$', # Months r'^[A-Z]{6,}$', # Very long "tickers" (likely parts of words) ] filtered_portfolio = {} for ticker, amount in portfolio.items(): is_false_positive = any(re.match(pattern, ticker) for pattern in false_positive_patterns) if not is_false_positive: filtered_portfolio[ticker] = amount return filtered_portfolio def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]: """ Validate user-edited portfolio JSON. Expected format: {"AAPL": 5000, "GOOGL": 3000, ...} Args: json_str: JSON string to validate Returns: Tuple of (is_valid, parsed_dict, error_message) - If valid: (True, portfolio_dict, "") - If invalid: (False, None, error_message) """ if not json_str or not json_str.strip(): return False, None, "JSON is empty" try: # Parse JSON data = json.loads(json_str) # Validate it's a dictionary if not isinstance(data, dict): return False, None, "JSON must be a dictionary/object, not a list or other type" # Validate all keys are strings and all values are numbers portfolio = {} for ticker, amount in data.items(): # Check ticker is string if not isinstance(ticker, str): return False, None, f"Ticker '{ticker}' must be a string" # Check ticker is uppercase (optional validation) if not ticker.isupper(): return False, None, f"Ticker '{ticker}' should be uppercase (e.g., 'AAPL' not 'aapl')" # Check ticker length (1-5 characters is typical) if len(ticker) < 1 or len(ticker) > 10: return False, None, f"Ticker '{ticker}' length should be between 1-10 characters" # Check amount is numeric try: amount_float = float(amount) except (TypeError, ValueError): return False, None, f"Amount for {ticker} must be a number, got: {amount}" # Check amount is positive if amount_float <= 0: return False, None, f"Amount for {ticker} must be positive, got: {amount_float}" portfolio[ticker] = amount_float # Check we have at least one ticker if len(portfolio) == 0: return False, None, "Portfolio must contain at least one ticker" # Check we don't exceed maximum tickers (optional limit) if len(portfolio) > MAX_TICKERS: return False, None, f"Portfolio exceeds maximum of {MAX_TICKERS} tickers" return True, portfolio, "" except json.JSONDecodeError as e: return False, None, f"Invalid JSON format: {str(e)}" except Exception as e: return False, None, f"Validation error: {str(e)}" def merge_portfolios(portfolios: list[Dict[str, float]]) -> Dict[str, float]: """ Merge multiple portfolio dictionaries. If the same ticker appears in multiple portfolios, amounts are summed. Args: portfolios: List of portfolio dictionaries Returns: Merged portfolio dictionary with summed amounts """ merged = {} for portfolio in portfolios: for ticker, amount in portfolio.items(): if ticker in merged: merged[ticker] += amount else: merged[ticker] = amount return merged def format_portfolio_json(portfolio: Dict[str, float], indent: int = 2) -> str: """ Format portfolio dictionary as pretty-printed JSON. Args: portfolio: Dictionary of {ticker: amount} indent: Number of spaces for indentation Returns: Formatted JSON string """ return json.dumps(portfolio, indent=indent, sort_keys=True)