Dmitry Beresnev committed on
Commit
c226f41
·
1 Parent(s): 634a20f
Files changed (1) hide show
  1. ocr_parser.py +55 -24
ocr_parser.py CHANGED
@@ -161,11 +161,15 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
161
  """
162
  Parse Revolut-specific format.
163
 
164
- Revolut format (2 lines per stock):
165
  Line 1: @ Company Name 3420,14$
166
  Line 2: 8,31 MU - 411,50'$ 4123,26%
 
167
 
168
- We need to extract the portfolio value from line 1 and ticker from line 2.
 
 
 
169
 
170
  Args:
171
  text: Extracted text from OCR
@@ -176,41 +180,68 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
176
  portfolio = {}
177
  lines = text.split('\n')
178
 
179
- # Process lines in pairs
180
  i = 0
181
  while i < len(lines):
182
  current_line = lines[i].strip()
183
 
184
  # Look for portfolio value line (contains amount with $, €, £)
185
- value_match = re.search(r'([\d,]+[.,]\d{1,2})\s*[\$€£]', current_line)
 
186
 
187
- if value_match and i + 1 < len(lines):
188
  portfolio_value_str = value_match.group(1)
189
 
190
- # Look ahead for ticker in next line
191
- next_line = lines[i + 1].strip()
 
192
 
193
- # Match pattern like: "8,31 MU - 411,50"
194
- # Ticker is 2-5 uppercase letters after some numbers
195
- ticker_match = re.search(r'[\d,]+[.,]?\d*\s+([A-Z]{2,5})\s*[-–]', next_line)
 
 
 
 
 
 
196
 
197
- if ticker_match:
198
- ticker = ticker_match.group(1)
 
 
 
 
 
 
199
 
200
- # Clean portfolio value (replace , with . for European format)
201
- clean_value = portfolio_value_str.replace(',', '.')
202
- try:
203
- amount = float(clean_value)
204
- if amount > 1: # Valid amount
205
- portfolio[ticker] = amount
206
- except ValueError:
207
- pass
208
 
209
- # Skip the next line since we've processed it
210
- i += 2
211
- continue
 
 
 
212
 
213
- i += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  return portfolio
216
 
 
161
  """
162
  Parse Revolut-specific format.
163
 
164
+ Revolut format (typically 2-3 lines per stock):
165
  Line 1: @ Company Name 3420,14$
166
  Line 2: 8,31 MU - 411,50'$ 4123,26%
167
+ (Sometimes line 3 if company name is long)
168
 
169
+ Handles variations:
170
+ - Spaces in numbers: "3 256,40"
171
+ - Different separators: "-", ":", "*"
172
+ - Numbers without decimals: "172312"
173
 
174
  Args:
175
  text: Extracted text from OCR
 
180
  portfolio = {}
181
  lines = text.split('\n')
182
 
183
+ # Process lines
184
  i = 0
185
  while i < len(lines):
186
  current_line = lines[i].strip()
187
 
188
  # Look for portfolio value line (contains amount with $, €, £)
189
+ # Handle spaces in numbers: "3 256,40" or "172312" or "3420,14"
190
+ value_match = re.search(r'([\d\s,]+(?:[.,]\d{1,2})?)\s*[\$€£]', current_line)
191
 
192
+ if value_match:
193
  portfolio_value_str = value_match.group(1)
194
 
195
+ # Clean portfolio value:
196
+ # 1. Remove spaces: "3 256,40" -> "3256,40"
197
+ clean_value = portfolio_value_str.replace(' ', '')
198
 
199
+ # 2. Handle numbers without decimal separators
200
+ # If no decimal (. or ,) and more than 2 digits, assume last 2 are cents
201
+ # Example: "172312" -> "1723.12"
202
+ if not re.search(r'[.,]', clean_value) and len(clean_value) > 2:
203
+ # Insert decimal before last 2 digits
204
+ clean_value = clean_value[:-2] + '.' + clean_value[-2:]
205
+ else:
206
+ # 3. Replace comma with dot for European format: "3256,40" -> "3256.40"
207
+ clean_value = clean_value.replace(',', '.')
208
 
209
+ try:
210
+ amount = float(clean_value)
211
+ if amount < 1: # Skip invalid amounts
212
+ i += 1
213
+ continue
214
+ except ValueError:
215
+ i += 1
216
+ continue
217
 
218
+ # Look ahead 1-2 lines for ticker
219
+ ticker_found = False
220
+ for lookahead in range(1, 3): # Check next 1-2 lines
221
+ if i + lookahead >= len(lines):
222
+ break
 
 
 
223
 
224
+ check_line = lines[i + lookahead].strip()
225
+
226
+ # Match ticker patterns (more flexible):
227
+ # "8,31 MU - 411,50" or "52,03 AMKR: 51$" or "GOOGL* 335,15"
228
+ # Ticker can be followed by: -, :, *, space, or «
229
+ ticker_match = re.search(r'[\d,]+[.,]?\d*\s+([A-Z]{2,5})[\s\-–:*«]', check_line)
230
 
231
+ if ticker_match:
232
+ ticker = ticker_match.group(1)
233
+
234
+ # Validate ticker (not a word fragment or common false positive)
235
+ if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY']:
236
+ portfolio[ticker] = amount
237
+ ticker_found = True
238
+ i += lookahead + 1 # Skip processed lines
239
+ break
240
+
241
+ if not ticker_found:
242
+ i += 1
243
+ else:
244
+ i += 1
245
 
246
  return portfolio
247