# Spaces: ResearchEngineering / financial_analyst (Running)
#
# File size: 16,085 Bytes

"""
OCR and portfolio parsing module.

Handles:
- Text extraction from portfolio screenshots using Tesseract OCR
- Parsing tickers and amounts using regex
- JSON validation for user-edited portfolio data
- Image preprocessing for better OCR accuracy
"""

import json
import math
import re
from typing import Dict, Optional, Tuple

import numpy as np
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter


# Upper bound on the number of tickers validate_portfolio_json accepts.
MAX_TICKERS = 100


# Generic (non-Revolut) regex patterns, tried in order by parse_portfolio.
# Patterns 1-4 each expose two capture groups (a ticker and an amount, in
# either order); parse_portfolio decides which group is which by checking
# which one looks numeric.
TICKER_PATTERNS = [
    # Pattern 1: Ticker followed by amount (AAPL 5000 or AAPL $5,000.00)
    r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)',
    # Pattern 2: Amount followed by ticker ($5,000 AAPL)
    r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})',
    # Pattern 3: Ticker on one line, amount on next (multi-line)
    r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)',
    # Pattern 4: With separators (AAPL | $5,000.00)
    r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)',
    # Pattern 5: Revolut format - line with ticker and dash
    # Example: "8,31 MU - 411,50'$ 4123,26%"
    # NOTE(review): this pattern has a single capture group (the ticker), so
    # it yields no amount; parse_portfolio needs a (ticker, amount) pair and
    # therefore gets nothing usable from it.
    r'[\d,]+\.?\d*\s+([A-Z]{2,5})\s*[-–]\s*[\d,]+',
]

# Revolut-specific pattern: Company name followed by portfolio value
# Example: "@ Micron Technology 3420,14$" followed by "8,31 MU - 411,50'$ 4123,26%"
# Group 1 is the amount before the currency symbol, group 2 the ticker on the
# following line.
# NOTE(review): not referenced anywhere in the visible source;
# parse_revolut_format uses its own line-by-line regexes instead.
REVOLUT_PATTERN = r'([\d,]+\.?\d*)\s*[\$€£]\s*\n.*?\s+([A-Z]{2,5})\s*[-–]'


def is_dark_theme(image: Image.Image) -> bool:
    """
    Detect if image uses dark theme (dark background, light text).

    The image is converted to grayscale and the mean brightness of its
    central region (the middle half in each dimension) is compared against
    128, the midpoint of the 0-255 grayscale range.

    Args:
        image: PIL Image object

    Returns:
        True if the sampled region's mean brightness is below 128,
        False otherwise (including for an image with no pixels)
    """
    gray = image.convert('L')

    # Sample the centre of the image so borders and window chrome do not
    # skew the estimate.
    width, height = gray.size
    left, top = width // 4, height // 4
    right, bottom = 3 * width // 4, 3 * height // 4

    if right > left and bottom > top:
        sample_region = gray.crop((left, top, right, bottom))
    else:
        # For images one pixel wide or tall the centre box is empty;
        # averaging it would produce NaN, so sample the whole image instead.
        sample_region = gray

    pixels = np.array(sample_region)
    if pixels.size == 0:
        return False

    # np.mean yields a NumPy scalar; convert the comparison result to a plain
    # bool so the return value matches the annotation.
    return bool(np.mean(pixels) < 128)


def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Prepare a screenshot for OCR.

    Steps, in order:
    - Dark-theme images (as reported by is_dark_theme) are inverted via RGB
      and converted to grayscale; all other images are converted to
      grayscale directly
    - Contrast is enhanced by a factor of 2.0
    - The SHARPEN filter is applied
    - If either side is shorter than 800 px, the image is upscaled with
      LANCZOS resampling so that its shorter side reaches 800 px

    Args:
        image: PIL Image object

    Returns:
        Preprocessed grayscale PIL Image object
    """
    if is_dark_theme(image):
        # Light-on-dark text is flipped to dark-on-light before OCR.
        from PIL import ImageOps
        grayscale = ImageOps.invert(image.convert('RGB')).convert('L')
    else:
        grayscale = image.convert('L')

    boosted = ImageEnhance.Contrast(grayscale).enhance(2.0)
    sharpened = boosted.filter(ImageFilter.SHARPEN)

    width, height = sharpened.size
    shorter_side = min(width, height)
    if shorter_side >= 800:
        return sharpened

    # Scaling the shorter side up to 800 px keeps the aspect ratio and
    # guarantees both sides end up at least 800 px.
    scale = 800 / shorter_side
    target_size = (int(width * scale), int(height * scale))
    return sharpened.resize(target_size, Image.Resampling.LANCZOS)


def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
    """
    Run Tesseract OCR on a portfolio screenshot.

    The image is preprocessed with preprocess_image and then recognised with
    up to two Tesseract configurations; the first one that produces
    non-blank text wins.

    Args:
        image: PIL Image object

    Returns:
        Tuple of (extracted_text, error_message)
        - If successful: (text, None)
        - If failed: (None, error_message)
    """
    # Tried in order:
    #   --psm 6: assume a single uniform block of text
    #   --psm 4: assume a single column of text of variable sizes
    # --oem 3 selects the default OCR engine mode in both cases.
    tesseract_configs = (
        r'--oem 3 --psm 6',
        r'--oem 3 --psm 4',
    )

    try:
        # Raises TesseractNotFoundError early if the binary is missing.
        pytesseract.get_tesseract_version()

        prepared = preprocess_image(image)

        for config in tesseract_configs:
            recognised = pytesseract.image_to_string(prepared, config=config)
            if recognised.strip():
                return recognised, None

        return None, "No text detected in image. Please upload a clearer screenshot or enter data manually."

    except pytesseract.TesseractNotFoundError:
        return None, "OCR engine (Tesseract) not available. Please check installation."
    except Exception as e:
        # Boundary handler: any other failure is reported as a message.
        return None, f"OCR failed: {e!s}"


# Ticker line: "<shares><TICKER><separator>", e.g. "8,31 MU» 386,56 $",
# "52,03 AMKR: 51$" or (OCR dropped the space) "0,94LLY -1080". The ticker
# may also be the last thing on the line.
_REVOLUT_TICKER_RE = re.compile(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})(?:[\s\-–:*«»]|$)')

# Amount followed by a currency symbol, with an optional ":" or "'" OCR
# artefact in between ("3 120,52: $", "240,92'$"). Group 1 captures a minus
# sign directly attached to the number so negative values can be rejected as
# a whole instead of being matched from their second digit onwards.
_REVOLUT_VALUE_RE = re.compile(r'([\-–]?)(\d[\d\s,]*(?:[.,]\d{1,2})?)[:\']?\s*[\$€£]')

# Upper-case tokens that look like tickers but are not.
_REVOLUT_NON_TICKERS = frozenset({'AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF'})

# Amounts below this are treated as percentages, share counts or other noise.
_REVOLUT_MIN_POSITION = 50


def _revolut_position_value(line: str) -> Optional[float]:
    """Return the position value found on a line, or None if there is none."""
    for match in _REVOLUT_VALUE_RE.finditer(line):
        if match.group(1):
            # Negative amount (e.g. a change column "-1 080,46 $"): skip it.
            continue

        # Drop thousands spacing of any kind: "3 256,40" -> "3256,40".
        digits = ''.join(match.group(2).split())

        if '.' not in digits and ',' not in digits and len(digits) > 2:
            # OCR lost the decimal separator: assume the last two digits are
            # cents, e.g. "172312" -> "1723.12".
            digits = digits[:-2] + '.' + digits[-2:]
        else:
            # European decimal comma: "3256,40" -> "3256.40".
            digits = digits.replace(',', '.')

        try:
            amount = float(digits)
        except ValueError:
            return None
        return amount if amount >= _REVOLUT_MIN_POSITION else None

    return None


def parse_revolut_format(text: str) -> Dict[str, float]:
    """
    Parse Revolut-specific format.

    Revolut format (typically 2 lines per stock):
    Line 1: [icon] Company Name [portfolio_value]$
    Line 2: [shares] TICKER[separator] [price_per_share]$ [change%]

    Examples:
    Line 1: "@ Micron Technology 3 212,85 $"
    Line 2: "8,31 MU» 386,56 $ 4 109,73%"

    Handles variations:
    - Spaces in numbers: "3 256,40"
    - Different separators after ticker: "-", ":", "*", "»", "«", or nothing
      (ticker at end of line)
    - Numbers without decimals: "172312"
    - Negative values (sign attached to the number) are never used as a
      position value

    Each value line is paired with the first ticker found on one of the next
    two lines. If a ticker occurs more than once, the first value wins.

    Args:
        text: Extracted text from OCR

    Returns:
        Dictionary mapping tickers to amounts (empty if nothing matched)
    """
    if not text:
        return {}

    portfolio: Dict[str, float] = {}
    lines = text.split('\n')

    i = 0
    while i < len(lines):
        current_line = lines[i].strip()

        # Blank lines carry nothing; ticker lines are only consumed as the
        # lookahead of a preceding value line, never as a value source.
        if not current_line or _REVOLUT_TICKER_RE.match(current_line):
            i += 1
            continue

        amount = _revolut_position_value(current_line)
        if amount is None:
            i += 1
            continue

        # Look ahead 1-2 lines for the ticker belonging to this value.
        consumed = 1
        for lookahead in (1, 2):
            if i + lookahead >= len(lines):
                break

            ticker_match = _REVOLUT_TICKER_RE.search(lines[i + lookahead].strip())
            if ticker_match and ticker_match.group(1) not in _REVOLUT_NON_TICKERS:
                # Keep the first value seen for a ticker (avoid duplicates).
                portfolio.setdefault(ticker_match.group(1), amount)
                consumed = lookahead + 1  # resume after the ticker line
                break

        i += consumed

    return portfolio


# Upper-case tokens the generic patterns commonly mistake for tickers:
# currency codes, time indicators and month abbreviations.
_FALSE_POSITIVE_TICKERS = frozenset({
    'ID', 'USD', 'EUR', 'GBP', 'JPY', 'CHF',
    'AM', 'PM',
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
})


def _parse_generic_amount(amount_str: str) -> float:
    """Convert a matched amount such as "$5,000.00" or "411,50" to a float.

    Raises:
        ValueError: if nothing numeric remains after cleaning.
    """
    cleaned = re.sub(r'[\$€£\s]', '', amount_str)
    # A single comma followed by one or two digits and no dot is a European
    # decimal comma ("411,50"). Every other comma is a thousands separator
    # ("5,000", "5,000.00").
    if '.' not in cleaned and re.fullmatch(r'\d+,\d{1,2}', cleaned):
        return float(cleaned.replace(',', '.'))
    return float(cleaned.replace(',', ''))


def parse_portfolio(text: str) -> Dict[str, float]:
    """
    Parse portfolio from extracted text using multiple regex patterns.

    Tries various patterns to handle different screenshot formats:
    - Revolut format (priority; if it yields anything, that result is returned)
    - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00"
    - Amount followed by ticker: "$5,000 AAPL"
    - Multi-line format: ticker on one line, amount on next
    - With separators: "AAPL | $5,000.00"

    Generic patterns are matched case-insensitively and tickers are
    upper-cased. Amounts of 1 or less are ignored, and when a ticker is
    matched more than once the largest amount is kept. Currency codes,
    AM/PM, month abbreviations and tokens of six or more letters are
    rejected as tickers.

    Args:
        text: Extracted text from OCR

    Returns:
        Dictionary mapping tickers to amounts: {ticker: amount}
        Returns empty dict if no valid tickers found
    """
    if not text:
        return {}

    # First, try Revolut-specific parser
    revolut_portfolio = parse_revolut_format(text)
    if revolut_portfolio:
        return revolut_portfolio

    # Fall back to generic patterns
    portfolio: Dict[str, float] = {}

    for pattern in TICKER_PATTERNS:
        for match in re.findall(pattern, text, re.MULTILINE | re.IGNORECASE):
            # Patterns with a single capture group make findall return plain
            # strings; without an amount there is nothing to record.
            if not isinstance(match, tuple) or len(match) != 2:
                continue

            first, second = match

            # Whichever group looks numeric is the amount (amount-first
            # format); the other one is the ticker.
            if re.match(r'^[\d,.]+$', first):
                amount_str, ticker = first, second.upper()
            else:
                ticker, amount_str = first.upper(), second

            if not re.match(r'^[A-Z]{1,10}$', ticker):
                continue

            # Exact-match filter: only the listed tokens themselves are
            # rejected, so tickers that merely start with a month
            # abbreviation are kept. Very long tokens are likely word parts.
            if ticker in _FALSE_POSITIVE_TICKERS or len(ticker) >= 6:
                continue

            try:
                amount = _parse_generic_amount(amount_str)
            except ValueError:
                # Skip invalid matches
                continue

            # Only include amounts > 1 (filters out percentages, etc.)
            if amount <= 1:
                continue

            # If ticker already exists, keep the larger amount
            if ticker not in portfolio or amount > portfolio[ticker]:
                portfolio[ticker] = amount

    return portfolio


def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]:
    """
    Validate user-edited portfolio JSON.

    Expected format: {"AAPL": 5000, "GOOGL": 3000, ...}

    Rules:
    - The document must be a non-empty JSON object
    - Tickers must be 1-10 characters long and uppercase
    - Amounts must be finite, positive numbers (numeric strings are
      accepted; booleans, NaN and Infinity are rejected)
    - At most MAX_TICKERS entries are allowed

    Args:
        json_str: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_dict, error_message)
        - If valid: (True, portfolio_dict, "")
        - If invalid: (False, None, error_message)
    """
    if not json_str or not json_str.strip():
        return False, None, "JSON is empty"

    try:
        data = json.loads(json_str)

        if not isinstance(data, dict):
            return False, None, "JSON must be a dictionary/object, not a list or other type"

        portfolio = {}
        for ticker, amount in data.items():
            # JSON object keys are always strings; kept as a defensive check.
            if not isinstance(ticker, str):
                return False, None, f"Ticker '{ticker}' must be a string"

            # Length first, so an empty ticker gets the length message
            # rather than the uppercase one.
            if len(ticker) < 1 or len(ticker) > 10:
                return False, None, f"Ticker '{ticker}' length should be between 1-10 characters"

            if not ticker.isupper():
                return False, None, f"Ticker '{ticker}' should be uppercase (e.g., 'AAPL' not 'aapl')"

            # bool is a subclass of int, so float(True) would silently
            # become 1.0; JSON true/false is not an amount.
            if isinstance(amount, bool):
                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

            try:
                amount_float = float(amount)
            except (TypeError, ValueError, OverflowError):
                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

            # json.loads accepts NaN/Infinity, and NaN slips past a "<= 0"
            # comparison, so finiteness has to be checked explicitly.
            if not math.isfinite(amount_float):
                return False, None, f"Amount for {ticker} must be a finite number, got: {amount}"

            if amount_float <= 0:
                return False, None, f"Amount for {ticker} must be positive, got: {amount_float}"

            portfolio[ticker] = amount_float

        if not portfolio:
            return False, None, "Portfolio must contain at least one ticker"

        if len(portfolio) > MAX_TICKERS:
            return False, None, f"Portfolio exceeds maximum of {MAX_TICKERS} tickers"

        return True, portfolio, ""

    except json.JSONDecodeError as e:
        return False, None, f"Invalid JSON format: {str(e)}"
    except Exception as e:
        # Boundary handler: validation reports failures instead of raising.
        return False, None, f"Validation error: {str(e)}"


def merge_portfolios(portfolios: list[Dict[str, float]]) -> Dict[str, float]:
    """
    Combine several portfolios into one.

    Amounts of tickers that occur in more than one portfolio are added up;
    tickers keep the order in which they are first seen.

    Args:
        portfolios: List of portfolio dictionaries

    Returns:
        Merged portfolio dictionary with summed amounts
    """
    totals = {}

    for holdings in portfolios:
        for symbol, value in holdings.items():
            try:
                totals[symbol] += value
            except KeyError:
                # First occurrence of this ticker: store the amount as is.
                totals[symbol] = value

    return totals


def format_portfolio_json(portfolio: Dict[str, float], indent: int = 2) -> str:
    """
    Serialise a portfolio as human-readable JSON with tickers sorted.

    Args:
        portfolio: Dictionary of {ticker: amount}
        indent: Number of spaces for indentation

    Returns:
        Formatted JSON string
    """
    dump_options = {"indent": indent, "sort_keys": True}
    return json.dumps(portfolio, **dump_options)