Dmitry Beresnev
committed on
Commit
·
deaa7ee
1
Parent(s):
3036bb1
fix OCR
Browse files - ocr_parser.py +41 -15
ocr_parser.py
CHANGED
|
@@ -161,15 +161,19 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
|
|
| 161 |
"""
|
| 162 |
Parse Revolut-specific format.
|
| 163 |
|
| 164 |
-
Revolut format (typically 2
|
| 165 |
-
Line 1:
|
| 166 |
-
Line 2:
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
Handles variations:
|
| 170 |
- Spaces in numbers: "3 256,40"
|
| 171 |
-
- Different separators: "-", ":", "*"
|
| 172 |
- Numbers without decimals: "172312"
|
|
|
|
| 173 |
|
| 174 |
Args:
|
| 175 |
text: Extracted text from OCR
|
|
@@ -185,9 +189,26 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
|
|
| 185 |
while i < len(lines):
|
| 186 |
current_line = lines[i].strip()
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
# Look for portfolio value line (contains amount with $, €, £)
|
| 189 |
-
#
|
| 190 |
-
|
|
|
|
|
|
|
| 191 |
|
| 192 |
if value_match:
|
| 193 |
portfolio_value_str = value_match.group(1)
|
|
@@ -208,7 +229,9 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
|
|
| 208 |
|
| 209 |
try:
|
| 210 |
amount = float(clean_value)
|
| 211 |
-
|
|
|
|
|
|
|
| 212 |
i += 1
|
| 213 |
continue
|
| 214 |
except ValueError:
|
|
@@ -223,19 +246,22 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
|
|
| 223 |
|
| 224 |
check_line = lines[i + lookahead].strip()
|
| 225 |
|
| 226 |
-
# Match ticker patterns
|
| 227 |
-
# "8,31 MU -
|
| 228 |
-
#
|
| 229 |
-
|
|
|
|
| 230 |
|
| 231 |
if ticker_match:
|
| 232 |
ticker = ticker_match.group(1)
|
| 233 |
|
| 234 |
# Validate ticker (not a word fragment or common false positive)
|
| 235 |
-
if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY']:
|
| 236 |
-
|
|
|
|
|
|
|
| 237 |
ticker_found = True
|
| 238 |
-
i += lookahead + 1 # Skip
|
| 239 |
break
|
| 240 |
|
| 241 |
if not ticker_found:
|
|
|
|
| 161 |
"""
|
| 162 |
Parse Revolut-specific format.
|
| 163 |
|
| 164 |
+
Revolut format (typically 2 lines per stock):
|
| 165 |
+
Line 1: [icon] Company Name [portfolio_value]$
|
| 166 |
+
Line 2: [shares] TICKER[separator] [price_per_share]$ [change%]
|
| 167 |
+
|
| 168 |
+
Examples:
|
| 169 |
+
Line 1: "@ Micron Technology 3 212,85 $"
|
| 170 |
+
Line 2: "8,31 MU» 386,56 $ 4 109,73%"
|
| 171 |
|
| 172 |
Handles variations:
|
| 173 |
- Spaces in numbers: "3 256,40"
|
| 174 |
+
- Different separators after ticker: "-", ":", "*", "»", "«"
|
| 175 |
- Numbers without decimals: "172312"
|
| 176 |
+
- Negative values in change column
|
| 177 |
|
| 178 |
Args:
|
| 179 |
text: Extracted text from OCR
|
|
|
|
| 189 |
while i < len(lines):
|
| 190 |
current_line = lines[i].strip()
|
| 191 |
|
| 192 |
+
# Skip empty lines
|
| 193 |
+
if not current_line:
|
| 194 |
+
i += 1
|
| 195 |
+
continue
|
| 196 |
+
|
| 197 |
+
# Check if this is a TICKER line (not a value line)
|
| 198 |
+
# Ticker lines start with: [shares] [TICKER][separator]
|
| 199 |
+
# Example: "8,31 MU» 386,56 $" or "52,03 AMKR: 51$" or "0,94LLY -1080"
|
| 200 |
+
is_ticker_line = re.match(r'^[\d,]+[.,]?\d*\s*[A-Z]{2,5}[\s\-–:*«»]', current_line)
|
| 201 |
+
|
| 202 |
+
if is_ticker_line:
|
| 203 |
+
# This is a ticker line, skip it (it's already been processed as lookahead)
|
| 204 |
+
i += 1
|
| 205 |
+
continue
|
| 206 |
+
|
| 207 |
# Look for portfolio value line (contains amount with $, €, £)
|
| 208 |
+
# IMPORTANT: Match dollar amounts that are NOT preceded by a negative sign
|
| 209 |
+
# Avoid matching negative change values like "-1080,46$"
|
| 210 |
+
# Allow optional colon/apostrophe before currency: "3 120,52: $" or "240,92'$"
|
| 211 |
+
value_match = re.search(r'(?<![\-–])([\d\s,]+(?:[.,]\d{1,2})?)[:\']?\s*[\$€£]', current_line)
|
| 212 |
|
| 213 |
if value_match:
|
| 214 |
portfolio_value_str = value_match.group(1)
|
|
|
|
| 229 |
|
| 230 |
try:
|
| 231 |
amount = float(clean_value)
|
| 232 |
+
# Filter out very small amounts (likely percentages, share counts, or other data)
|
| 233 |
+
# Portfolio positions are typically > 50 (even small positions)
|
| 234 |
+
if amount < 50:
|
| 235 |
i += 1
|
| 236 |
continue
|
| 237 |
except ValueError:
|
|
|
|
| 246 |
|
| 247 |
check_line = lines[i + lookahead].strip()
|
| 248 |
|
| 249 |
+
# Match ticker patterns: [shares] [TICKER][separator]
|
| 250 |
+
# Examples: "8,31 MU -" or "52,03 AMKR:" or "5,06 GOOGL*" or "5,06 TSM «"
|
| 251 |
+
# Also handles OCR errors with missing space: "0,94LLY"
|
| 252 |
+
# Ticker can be followed by: -, :, *, », «, space, or end of significant text
|
| 253 |
+
ticker_match = re.search(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})[\s\-–:*«»]', check_line)
|
| 254 |
|
| 255 |
if ticker_match:
|
| 256 |
ticker = ticker_match.group(1)
|
| 257 |
|
| 258 |
# Validate ticker (not a word fragment or common false positive)
|
| 259 |
+
if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF']:
|
| 260 |
+
# Only add if not already present (avoid duplicates)
|
| 261 |
+
if ticker not in portfolio:
|
| 262 |
+
portfolio[ticker] = amount
|
| 263 |
ticker_found = True
|
| 264 |
+
i += lookahead + 1 # Skip to line after ticker line
|
| 265 |
break
|
| 266 |
|
| 267 |
if not ticker_found:
|