File size: 16,085 Bytes
e6b8a0f
 
 
 
 
 
 
bd3f2a3
e6b8a0f
 
 
 
 
bd3f2a3
e6b8a0f
bd3f2a3
e6b8a0f
 
41f5ef9
 
 
bd3f2a3
 
 
 
 
 
 
 
 
 
634a20f
 
 
bd3f2a3
 
634a20f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd3f2a3
 
 
 
 
 
634a20f
bd3f2a3
 
 
 
634a20f
bd3f2a3
 
 
 
 
 
 
634a20f
 
 
 
 
 
 
 
bd3f2a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6b8a0f
 
 
 
 
 
bd3f2a3
 
e6b8a0f
 
 
 
 
 
 
 
 
 
 
 
bd3f2a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6b8a0f
 
 
bd3f2a3
e6b8a0f
 
 
 
 
 
 
 
 
634a20f
 
 
 
deaa7ee
 
 
 
 
 
 
634a20f
c226f41
 
deaa7ee
c226f41
deaa7ee
634a20f
 
 
 
 
 
 
 
 
 
c226f41
634a20f
 
 
 
deaa7ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634a20f
deaa7ee
 
 
 
634a20f
c226f41
634a20f
 
c226f41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
deaa7ee
 
 
c226f41
 
 
 
 
634a20f
c226f41
 
 
 
 
634a20f
c226f41
634a20f
deaa7ee
 
 
 
 
634a20f
c226f41
 
634a20f
c226f41
deaa7ee
 
 
 
c226f41
deaa7ee
c226f41
 
 
 
 
 
634a20f
 
 
 
e6b8a0f
 
bd3f2a3
e6b8a0f
bd3f2a3
634a20f
bd3f2a3
 
 
 
e6b8a0f
 
 
 
 
 
 
 
 
 
 
634a20f
 
 
 
 
 
e6b8a0f
 
bd3f2a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634a20f
 
bd3f2a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634a20f
bd3f2a3
 
634a20f
bd3f2a3
 
 
 
 
 
 
 
 
e6b8a0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1f25c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6b8a0f
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
"""
OCR and portfolio parsing module.

Handles:
- Text extraction from portfolio screenshots using Tesseract OCR
- Parsing tickers and amounts using regex
- JSON validation for user-edited portfolio data
- Image preprocessing for better OCR accuracy
"""

import json
import math
import re
from typing import Dict, Tuple, Optional

import numpy as np
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter


# Upper bound on the number of tickers validate_portfolio_json accepts.
MAX_TICKERS = 100


# Generic regex patterns, tried in list order by parse_portfolio (which applies
# them with re.MULTILINE | re.IGNORECASE). Patterns 1-4 each have two capture
# groups: one for the ticker and one for the amount.
TICKER_PATTERNS = [
    # Pattern 1: Ticker followed by amount (AAPL 5000 or AAPL $5,000.00)
    r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)',
    # Pattern 2: Amount followed by ticker ($5,000 AAPL)
    r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})',
    # Pattern 3: Ticker on one line, amount on next (multi-line)
    r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)',
    # Pattern 4: With separators (AAPL | $5,000.00)
    r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)',
    # Pattern 5: Revolut format - line with ticker and dash
    # Example: "8,31 MU - 411,50'$ 4123,26%"
    # NOTE: unlike patterns 1-4 this one has a single capture group (the
    # ticker only), so re.findall yields plain strings for it, not 2-tuples.
    r'[\d,]+\.?\d*\s+([A-Z]{2,5})\s*[-–]\s*[\d,]+',
]

# Revolut-specific pattern: Company name followed by portfolio value
# Example: "@ Micron Technology 3420,14$" followed by "8,31 MU - 411,50'$ 4123,26%"
# Group 1 is the amount before the currency symbol, group 2 the ticker on the
# following line.
# NOTE(review): no function visible in this file references this constant;
# parse_revolut_format defines its own patterns - confirm whether it is still needed.
REVOLUT_PATTERN = r'([\d,]+\.?\d*)\s*[\$€£]\s*\n.*?\s+([A-Z]{2,5})\s*[-–]'


def is_dark_theme(image: Image.Image) -> bool:
    """
    Detect if image uses dark theme (dark background, light text).

    The decision is based on the mean grayscale brightness of the central
    half of the image (the outer quarter on each side is ignored).

    Args:
        image: PIL Image object

    Returns:
        True if the mean brightness of the sampled region is below 128,
        False otherwise (including when there are no pixels to sample).
        The result is always a plain Python bool.
    """
    # Brightness is judged on a single luminance channel
    gray = image.convert('L')

    # Sample pixels from center region (avoid edges)
    width, height = gray.size
    sample_region = gray.crop((
        width // 4,
        height // 4,
        3 * width // 4,
        3 * height // 4
    ))
    pixels = np.array(sample_region)

    # For images only 1 px wide or tall the crop box above is degenerate
    # (zero width/height). Fall back to the whole image so the mean is not
    # taken over an empty array (which would be NaN).
    if pixels.size == 0:
        pixels = np.array(gray)
    if pixels.size == 0:
        return False

    # Cast so callers get the annotated bool rather than numpy.bool_
    return bool(pixels.mean() < 128)


def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Prepare a screenshot for OCR.

    Steps, in order:
    - Convert to grayscale; dark-theme images (per is_dark_theme) are
      colour-inverted first so the text ends up dark on a light background
    - Double the contrast
    - Apply the SHARPEN filter
    - Upscale with LANCZOS resampling so that both sides are at least
      800 px when either side is smaller than that

    Args:
        image: PIL Image object

    Returns:
        Preprocessed PIL Image object (mode 'L')
    """
    if is_dark_theme(image):
        # Light-on-dark text: invert before going to grayscale
        from PIL import ImageOps
        grayscale = ImageOps.invert(image.convert('RGB')).convert('L')
    else:
        grayscale = image.convert('L')

    boosted = ImageEnhance.Contrast(grayscale).enhance(2.0)
    sharpened = boosted.filter(ImageFilter.SHARPEN)

    # Large enough already: nothing more to do
    width, height = sharpened.size
    if min(width, height) >= 800:
        return sharpened

    # Scale by the factor needed to bring the smaller side up to 800 px
    scale = max(800 / width, 800 / height)
    target_size = (int(width * scale), int(height * scale))
    return sharpened.resize(target_size, Image.Resampling.LANCZOS)


def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
    """
    Run Tesseract OCR on a portfolio screenshot.

    The image is preprocessed once, then recognised with page segmentation
    mode 6 (single uniform block of text). If that yields only whitespace,
    a second attempt is made with mode 4 (single column of variable sizes).

    Args:
        image: PIL Image object

    Returns:
        Tuple of (extracted_text, error_message)
        - On success: (text, None)
        - On failure: (None, error_message)
    """
    try:
        # Fails fast with TesseractNotFoundError when the binary is missing
        pytesseract.get_tesseract_version()

        prepared = preprocess_image(image)

        # --oem 3 selects the default OCR engine mode in both attempts
        for tesseract_config in (r'--oem 3 --psm 6', r'--oem 3 --psm 4'):
            recognized = pytesseract.image_to_string(prepared, config=tesseract_config)
            if recognized.strip():
                return recognized, None

        return None, "No text detected in image. Please upload a clearer screenshot or enter data manually."

    except pytesseract.TesseractNotFoundError:
        return None, "OCR engine (Tesseract) not available. Please check installation."
    except Exception as e:
        return None, f"OCR failed: {str(e)}"


# "[shares] TICKER" followed by a separator (space, -, –, :, *, «, ») or by the
# end of the line. Examples: "8,31 MU» 386,56 $", "52,03 AMKR: 51$",
# "0,94LLY -1080" (OCR dropped the space), "10 AAPL" (rest of line lost).
# Group 1 is the ticker.
_REVOLUT_TICKER_RE = re.compile(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})(?:[\s\-–:*«»]|$)')

# An amount followed by a currency symbol, optionally with a colon/apostrophe
# in between ("3 120,52: $", "240,92'$").
# Group 1 is a minus sign glued to the first digit (so the number is negative),
# group 2 is the number itself. Capturing the sign as part of the match, rather
# than using a lookbehind, prevents the regex engine from simply restarting one
# character later and returning the tail of a negative number.
_REVOLUT_VALUE_RE = re.compile(r'([\-–](?=\d))?([\d\s,]+(?:[.,]\d{1,2})?)[:\']?\s*[\$€£]')

# Upper-case tokens that satisfy the ticker regex but are not tickers.
_REVOLUT_NON_TICKERS = frozenset({'AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF'})

# Amounts below this are treated as percentages, share counts or other noise.
_REVOLUT_MIN_AMOUNT = 50


def _revolut_position_value(line: str) -> Optional[float]:
    """Return the first non-negative currency amount on *line*, or None."""
    raw = next(
        (m.group(2) for m in _REVOLUT_VALUE_RE.finditer(line) if not m.group(1)),
        None,
    )
    if raw is None:
        return None

    # "3 256,40" -> "3256,40"
    digits = raw.replace(' ', '')
    if len(digits) > 2 and not re.search(r'[.,]', digits):
        # OCR lost the decimal separator: assume the last two digits are
        # cents, e.g. "172312" -> "1723.12"
        digits = f"{digits[:-2]}.{digits[-2:]}"
    else:
        # European decimal comma: "3256,40" -> "3256.40"
        digits = digits.replace(',', '.')

    try:
        return float(digits)
    except ValueError:
        return None


def parse_revolut_format(text: str) -> Dict[str, float]:
    """
    Parse Revolut-specific format.

    Revolut format (typically 2 lines per stock):
    Line 1: [icon] Company Name [portfolio_value]$
    Line 2: [shares] TICKER[separator] [price_per_share]$ [change%]

    Examples:
    Line 1: "@ Micron Technology 3 212,85 $"
    Line 2: "8,31 MU» 386,56 $ 4 109,73%"

    For every value line (a line carrying a non-negative amount of at least
    50 followed by $, € or £) the next one or two lines are searched for a
    ticker; the first acceptable ticker is paired with that amount.

    Handles variations:
    - Spaces in numbers: "3 256,40"
    - Different separators after ticker: "-", ":", "*", "»", "«", or the
      ticker being the last token on its line
    - Numbers without decimals: "172312" is read as 1723.12
    - Negative values (e.g. "-1080,46$") are never taken as a position value

    Args:
        text: Extracted text from OCR

    Returns:
        Dictionary mapping tickers to amounts. When a ticker appears more
        than once, the first amount found is kept. Empty if nothing matched
        or *text* is empty.
    """
    if not text:
        return {}

    portfolio: Dict[str, float] = {}
    lines = [line.strip() for line in text.split('\n')]

    i = 0
    while i < len(lines):
        current_line = lines[i]

        # Blank lines carry nothing. Ticker lines are only ever consumed as
        # the lookahead of a preceding value line, never parsed for a value
        # themselves (their "$" amounts are share prices, not positions).
        if not current_line or _REVOLUT_TICKER_RE.match(current_line):
            i += 1
            continue

        amount = _revolut_position_value(current_line)
        if amount is None or amount < _REVOLUT_MIN_AMOUNT:
            i += 1
            continue

        # Look ahead 1-2 lines for the ticker belonging to this value
        step = 1
        for lookahead in (1, 2):
            if i + lookahead >= len(lines):
                break

            ticker_match = _REVOLUT_TICKER_RE.search(lines[i + lookahead])
            if ticker_match is None:
                continue

            ticker = ticker_match.group(1)
            if ticker in _REVOLUT_NON_TICKERS:
                continue

            # Keep the first amount seen for a ticker (avoid duplicates)
            portfolio.setdefault(ticker, amount)
            # Resume on the line after the ticker line
            step = lookahead + 1
            break

        i += step

    return portfolio


# Upper-case words the generic patterns pick up that are not tickers:
# an "ID" label, currency codes, time indicators and month abbreviations.
_GENERIC_NON_TICKERS = frozenset({
    'ID', 'USD', 'EUR', 'GBP', 'JPY', 'CHF',
    'AM', 'PM',
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
})


def _parse_generic_amount(raw: str) -> float:
    """Convert a captured amount ("5,000.00", "5,000", "411,50") to float.

    A comma followed by exactly one or two trailing digits, in a number with
    no dot, is read as a European decimal comma; every other comma is a
    thousands separator. Raises ValueError when no number remains.
    """
    compact = re.sub(r'[\$€£\s]', '', raw)
    if '.' not in compact and re.search(r',\d{1,2}$', compact):
        whole, _, fraction = compact.rpartition(',')
        compact = f"{whole.replace(',', '')}.{fraction}"
    else:
        compact = compact.replace(',', '')
    return float(compact)


def parse_portfolio(text: str) -> Dict[str, float]:
    """
    Parse portfolio from extracted text using multiple regex patterns.

    The Revolut-specific parser is tried first; if it finds anything its
    result is returned as is. Otherwise every pattern in TICKER_PATTERNS is
    applied (case-insensitively) to handle formats such as:
    - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00"
    - Amount followed by ticker: "$5,000 AAPL"
    - Multi-line format: ticker on one line, amount on next
    - With separators: "AAPL | $5,000.00"

    Amounts of 1 or less are ignored. When a ticker is matched several
    times, the largest amount wins. Currency codes, AM/PM, month
    abbreviations, "ID" and tickers of six or more letters are dropped as
    false positives.

    Args:
        text: Extracted text from OCR

    Returns:
        Dictionary mapping tickers to amounts: {ticker: amount}
        Returns empty dict if no valid tickers found
    """
    if not text:
        return {}

    # First, try Revolut-specific parser
    revolut_portfolio = parse_revolut_format(text)
    if revolut_portfolio:
        return revolut_portfolio

    # Fall back to generic patterns
    portfolio: Dict[str, float] = {}

    for pattern in TICKER_PATTERNS:
        for match in re.findall(pattern, text, re.MULTILINE | re.IGNORECASE):
            # Patterns with a single capture group make findall return bare
            # strings; without both a ticker and an amount there is nothing
            # to record.
            if not isinstance(match, tuple) or len(match) != 2:
                continue
            first, second = match

            # Whichever group looks numeric is the amount
            if re.match(r'^[\d,.]+$', first):
                amount_str, ticker = first, second.upper()
            else:
                ticker, amount_str = first.upper(), second

            # Validate ticker (1-10 uppercase letters)
            if not re.match(r'^[A-Z]{1,10}$', ticker):
                continue

            try:
                amount = _parse_generic_amount(amount_str)
            except ValueError:
                # Not a number after cleaning (e.g. a lone comma)
                continue

            # Only include amounts > 1 (filters out percentages, etc.)
            if amount <= 1:
                continue

            # If ticker already exists, keep the larger amount
            if ticker not in portfolio or amount > portfolio[ticker]:
                portfolio[ticker] = amount

    # Drop exact-match false positives and very long "tickers", which are
    # likely fragments of ordinary words.
    return {
        ticker: amount
        for ticker, amount in portfolio.items()
        if ticker not in _GENERIC_NON_TICKERS and len(ticker) < 6
    }


def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]:
    """
    Validate user-edited portfolio JSON.

    Expected format: {"AAPL": 5000, "GOOGL": 3000, ...}

    Rules:
    - The document must be a non-empty JSON object
    - Every key must be an uppercase ticker of 1-10 characters
    - Every value must convert to a finite float greater than zero;
      JSON booleans and NaN/Infinity are rejected
    - At most MAX_TICKERS entries are allowed

    Validation stops at the first problem found.

    Args:
        json_str: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_dict, error_message)
        - If valid: (True, portfolio_dict, "") with all amounts as float
        - If invalid: (False, None, error_message)
    """
    if not json_str or not json_str.strip():
        return False, None, "JSON is empty"

    try:
        data = json.loads(json_str)

        # Validate it's a dictionary
        if not isinstance(data, dict):
            return False, None, "JSON must be a dictionary/object, not a list or other type"

        portfolio = {}
        for ticker, amount in data.items():
            # Check ticker is string
            if not isinstance(ticker, str):
                return False, None, f"Ticker '{ticker}' must be a string"

            # Check ticker is uppercase
            if not ticker.isupper():
                return False, None, f"Ticker '{ticker}' should be uppercase (e.g., 'AAPL' not 'aapl')"

            # Check ticker length (1-10 characters)
            if not 1 <= len(ticker) <= 10:
                return False, None, f"Ticker '{ticker}' length should be between 1-10 characters"

            # bool is a subclass of int, so float(True) would quietly turn a
            # JSON `true` into an amount of 1.0; reject it up front.
            if isinstance(amount, bool):
                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

            # Check amount is numeric
            try:
                amount_float = float(amount)
            except (TypeError, ValueError):
                return False, None, f"Amount for {ticker} must be a number, got: {amount}"

            # json.loads accepts NaN/Infinity by default; NaN would also slip
            # past the `<= 0` comparison below.
            if not math.isfinite(amount_float):
                return False, None, f"Amount for {ticker} must be a finite number, got: {amount}"

            # Check amount is positive
            if amount_float <= 0:
                return False, None, f"Amount for {ticker} must be positive, got: {amount_float}"

            portfolio[ticker] = amount_float

        # Check we have at least one ticker
        if not portfolio:
            return False, None, "Portfolio must contain at least one ticker"

        # Check we don't exceed maximum tickers
        if len(portfolio) > MAX_TICKERS:
            return False, None, f"Portfolio exceeds maximum of {MAX_TICKERS} tickers"

        return True, portfolio, ""

    except json.JSONDecodeError as e:
        return False, None, f"Invalid JSON format: {str(e)}"
    except Exception as e:
        # Last-resort guard so callers always get a tuple, never an exception
        return False, None, f"Validation error: {str(e)}"


def merge_portfolios(portfolios: list[Dict[str, float]]) -> Dict[str, float]:
    """
    Combine several portfolio dictionaries into one.

    A ticker that occurs in more than one portfolio ends up with the sum of
    all its amounts; the input dictionaries are not modified.

    Args:
        portfolios: List of portfolio dictionaries

    Returns:
        New dictionary holding the per-ticker totals
    """
    totals = {}

    for holdings in portfolios:
        for symbol, value in holdings.items():
            try:
                totals[symbol] += value
            except KeyError:
                # First time this ticker is seen
                totals[symbol] = value

    return totals


def format_portfolio_json(portfolio: Dict[str, float], indent: int = 2) -> str:
    """
    Serialize a portfolio to human-readable JSON.

    Keys are emitted in sorted order so the output is stable regardless of
    the dictionary's insertion order.

    Args:
        portfolio: Dictionary of {ticker: amount}
        indent: Number of spaces for indentation

    Returns:
        Formatted JSON string
    """
    dump_options = {"indent": indent, "sort_keys": True}
    return json.dumps(portfolio, **dump_options)