Spaces:

ResearchEngineering
/

financial_analyst

Running

App Files Community

Dmitry Beresnev committed 7 days ago

Commit

c226f41

1 Parent(s): 634a20f

fix OCR

Browse files

Files changed (1) hide show

ocr_parser.py +55 -24

ocr_parser.py CHANGED Viewed

@@ -161,11 +161,15 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
     """
     Parse Revolut-specific format.
-    Revolut format (2 lines per stock):
     Line 1: @ Company Name 3420,14$
     Line 2: 8,31 MU - 411,50'$ 4123,26%
-    We need to extract the portfolio value from line 1 and ticker from line 2.
     Args:
         text: Extracted text from OCR
@@ -176,41 +180,68 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
     portfolio = {}
     lines = text.split('\n')
-    # Process lines in pairs
     i = 0
     while i < len(lines):
         current_line = lines[i].strip()
         # Look for portfolio value line (contains amount with $, €, £)
-        value_match = re.search(r'([\d,]+[.,]\d{1,2})\s*[\$€£]', current_line)
-        if value_match and i + 1 < len(lines):
             portfolio_value_str = value_match.group(1)
-            # Look ahead for ticker in next line
-            next_line = lines[i + 1].strip()
-            # Match pattern like: "8,31 MU - 411,50"
-            # Ticker is 2-5 uppercase letters after some numbers
-            ticker_match = re.search(r'[\d,]+[.,]?\d*\s+([A-Z]{2,5})\s*[-–]', next_line)
-            if ticker_match:
-                ticker = ticker_match.group(1)
-                # Clean portfolio value (replace , with . for European format)
-                clean_value = portfolio_value_str.replace(',', '.')
-                try:
-                    amount = float(clean_value)
-                    if amount > 1:  # Valid amount
-                        portfolio[ticker] = amount
-                except ValueError:
-                    pass
-                # Skip the next line since we've processed it
-                i += 2
-                continue
-        i += 1
     return portfolio

     """
     Parse Revolut-specific format.
+    Revolut format (typically 2-3 lines per stock):
     Line 1: @ Company Name 3420,14$
     Line 2: 8,31 MU - 411,50'$ 4123,26%
+    (Sometimes line 3 if company name is long)
+    Handles variations:
+    - Spaces in numbers: "3 256,40"
+    - Different separators: "-", ":", "*"
+    - Numbers without decimals: "172312"
     Args:
         text: Extracted text from OCR
     portfolio = {}
     lines = text.split('\n')
+    # Process lines
     i = 0
     while i < len(lines):
         current_line = lines[i].strip()
         # Look for portfolio value line (contains amount with $, €, £)
+        # Handle spaces in numbers: "3 256,40" or "172312" or "3420,14"
+        value_match = re.search(r'([\d\s,]+(?:[.,]\d{1,2})?)\s*[\$€£]', current_line)
+        if value_match:
             portfolio_value_str = value_match.group(1)
+            # Clean portfolio value:
+            # 1. Remove spaces: "3 256,40" -> "3256,40"
+            clean_value = portfolio_value_str.replace(' ', '')
+            # 2. Handle numbers without decimal separators
+            # If no decimal (. or ,) and more than 2 digits, assume last 2 are cents
+            # Example: "172312" -> "1723.12"
+            if not re.search(r'[.,]', clean_value) and len(clean_value) > 2:
+                # Insert decimal before last 2 digits
+                clean_value = clean_value[:-2] + '.' + clean_value[-2:]
+            else:
+                # 3. Replace comma with dot for European format: "3256,40" -> "3256.40"
+                clean_value = clean_value.replace(',', '.')
+            try:
+                amount = float(clean_value)
+                if amount < 1:  # Skip invalid amounts
+                    i += 1
+                    continue
+            except ValueError:
+                i += 1
+                continue
+            # Look ahead 1-2 lines for ticker
+            ticker_found = False
+            for lookahead in range(1, 3):  # Check next 1-2 lines
+                if i + lookahead >= len(lines):
+                    break
+                check_line = lines[i + lookahead].strip()
+                # Match ticker patterns (more flexible):
+                # "8,31 MU - 411,50" or "52,03 AMKR: 51$" or "GOOGL* 335,15"
+                # Ticker can be followed by: -, :, *, space, or «
+                ticker_match = re.search(r'[\d,]+[.,]?\d*\s+([A-Z]{2,5})[\s\-–:*«]', check_line)
+                if ticker_match:
+                    ticker = ticker_match.group(1)
+                    # Validate ticker (not a word fragment or common false positive)
+                    if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY']:
+                        portfolio[ticker] = amount
+                        ticker_found = True
+                        i += lookahead + 1  # Skip processed lines
+                        break
+            if not ticker_found:
+                i += 1
+        else:
+            i += 1
     return portfolio