Dmitry Beresnev committed on
Commit
deaa7ee
·
1 Parent(s): 3036bb1
Files changed (1) hide show
  1. ocr_parser.py +41 -15
ocr_parser.py CHANGED
@@ -161,15 +161,19 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
161
  """
162
  Parse Revolut-specific format.
163
 
164
- Revolut format (typically 2-3 lines per stock):
165
- Line 1: @ Company Name 3420,14$
166
- Line 2: 8,31 MU - 411,50'$ 4123,26%
167
- (Sometimes line 3 if company name is long)
 
 
 
168
 
169
  Handles variations:
170
  - Spaces in numbers: "3 256,40"
171
- - Different separators: "-", ":", "*"
172
  - Numbers without decimals: "172312"
 
173
 
174
  Args:
175
  text: Extracted text from OCR
@@ -185,9 +189,26 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
185
  while i < len(lines):
186
  current_line = lines[i].strip()
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  # Look for portfolio value line (contains amount with $, €, £)
189
- # Handle spaces in numbers: "3 256,40" or "172312" or "3420,14"
190
- value_match = re.search(r'([\d\s,]+(?:[.,]\d{1,2})?)\s*[\$€£]', current_line)
 
 
191
 
192
  if value_match:
193
  portfolio_value_str = value_match.group(1)
@@ -208,7 +229,9 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
208
 
209
  try:
210
  amount = float(clean_value)
211
- if amount < 1: # Skip invalid amounts
 
 
212
  i += 1
213
  continue
214
  except ValueError:
@@ -223,19 +246,22 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
223
 
224
  check_line = lines[i + lookahead].strip()
225
 
226
- # Match ticker patterns (more flexible):
227
- # "8,31 MU - 411,50" or "52,03 AMKR: 51$" or "GOOGL* 335,15"
228
- # Ticker can be followed by: -, :, *, space, or «
229
- ticker_match = re.search(r'[\d,]+[.,]?\d*\s+([A-Z]{2,5})[\s\-–:*«]', check_line)
 
230
 
231
  if ticker_match:
232
  ticker = ticker_match.group(1)
233
 
234
  # Validate ticker (not a word fragment or common false positive)
235
- if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY']:
236
- portfolio[ticker] = amount
 
 
237
  ticker_found = True
238
- i += lookahead + 1 # Skip processed lines
239
  break
240
 
241
  if not ticker_found:
 
161
  """
162
  Parse Revolut-specific format.
163
 
164
+ Revolut format (typically 2 lines per stock):
165
+ Line 1: [icon] Company Name [portfolio_value]$
166
+ Line 2: [shares] TICKER[separator] [price_per_share]$ [change%]
167
+
168
+ Examples:
169
+ Line 1: "@ Micron Technology 3 212,85 $"
170
+ Line 2: "8,31 MU» 386,56 $ 4 109,73%"
171
 
172
  Handles variations:
173
  - Spaces in numbers: "3 256,40"
174
+ - Different separators after ticker: "-", ":", "*", "»", "«"
175
  - Numbers without decimals: "172312"
176
+ - Negative values in change column
177
 
178
  Args:
179
  text: Extracted text from OCR
 
189
  while i < len(lines):
190
  current_line = lines[i].strip()
191
 
192
+ # Skip empty lines
193
+ if not current_line:
194
+ i += 1
195
+ continue
196
+
197
+ # Check if this is a TICKER line (not a value line)
198
+ # Ticker lines start with: [shares] [TICKER][separator]
199
+ # Example: "8,31 MU» 386,56 $" or "52,03 AMKR: 51$" or "0,94LLY -1080"
200
+ is_ticker_line = re.match(r'^[\d,]+[.,]?\d*\s*[A-Z]{2,5}[\s\-–:*«»]', current_line)
201
+
202
+ if is_ticker_line:
203
+ # This is a ticker line, skip it (it's already been processed as lookahead)
204
+ i += 1
205
+ continue
206
+
207
  # Look for portfolio value line (contains amount with $, €, £)
208
+ # IMPORTANT: Match dollar amounts that are NOT preceded by a negative sign
209
+ # Avoid matching negative change values like "-1080,46$"
210
+ # Allow optional colon/apostrophe before currency: "3 120,52: $" or "240,92'$"
211
+ value_match = re.search(r'(?<![\-–])([\d\s,]+(?:[.,]\d{1,2})?)[:\']?\s*[\$€£]', current_line)
212
 
213
  if value_match:
214
  portfolio_value_str = value_match.group(1)
 
229
 
230
  try:
231
  amount = float(clean_value)
232
+ # Filter out very small amounts (likely percentages, share counts, or other data)
233
+ # Portfolio positions are typically > 50 (even small positions)
234
+ if amount < 50:
235
  i += 1
236
  continue
237
  except ValueError:
 
246
 
247
  check_line = lines[i + lookahead].strip()
248
 
249
+ # Match ticker patterns: [shares] [TICKER][separator]
250
+ # Examples: "8,31 MU -" or "52,03 AMKR:" or "5,06 GOOGL*" or "5,06 TSM «"
251
+ # Also handles OCR errors with missing space: "0,94LLY"
252
+ # Ticker can be followed by: -, :, *, », «, space, or end of significant text
253
+ ticker_match = re.search(r'[\d,]+[.,]?\d*\s*([A-Z]{2,5})[\s\-–:*«»]', check_line)
254
 
255
  if ticker_match:
256
  ticker = ticker_match.group(1)
257
 
258
  # Validate ticker (not a word fragment or common false positive)
259
+ if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY', 'CHF']:
260
+ # Only add if not already present (avoid duplicates)
261
+ if ticker not in portfolio:
262
+ portfolio[ticker] = amount
263
  ticker_found = True
264
+ i += lookahead + 1 # Skip to line after ticker line
265
  break
266
 
267
  if not ticker_found: