Dmitry Beresnev
committed on
Commit
·
c226f41
1
Parent(s):
634a20f
fix OCR
Browse files- ocr_parser.py +55 -24
ocr_parser.py
CHANGED
|
@@ -161,11 +161,15 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
|
|
| 161 |
"""
|
| 162 |
Parse Revolut-specific format.
|
| 163 |
|
| 164 |
-
Revolut format (2 lines per stock):
|
| 165 |
Line 1: @ Company Name 3420,14$
|
| 166 |
Line 2: 8,31 MU - 411,50'$ 4123,26%
|
|
|
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
Args:
|
| 171 |
text: Extracted text from OCR
|
|
@@ -176,41 +180,68 @@ def parse_revolut_format(text: str) -> Dict[str, float]:
|
|
| 176 |
portfolio = {}
|
| 177 |
lines = text.split('\n')
|
| 178 |
|
| 179 |
-
# Process lines
|
| 180 |
i = 0
|
| 181 |
while i < len(lines):
|
| 182 |
current_line = lines[i].strip()
|
| 183 |
|
| 184 |
# Look for portfolio value line (contains amount with $, €, £)
|
| 185 |
-
|
|
|
|
| 186 |
|
| 187 |
-
if value_match
|
| 188 |
portfolio_value_str = value_match.group(1)
|
| 189 |
|
| 190 |
-
#
|
| 191 |
-
|
|
|
|
| 192 |
|
| 193 |
-
#
|
| 194 |
-
#
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
portfolio[ticker] = amount
|
| 206 |
-
except ValueError:
|
| 207 |
-
pass
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
return portfolio
|
| 216 |
|
|
|
|
| 161 |
"""
|
| 162 |
Parse Revolut-specific format.
|
| 163 |
|
| 164 |
+
Revolut format (typically 2-3 lines per stock):
|
| 165 |
Line 1: @ Company Name 3420,14$
|
| 166 |
Line 2: 8,31 MU - 411,50'$ 4123,26%
|
| 167 |
+
(Sometimes line 3 if company name is long)
|
| 168 |
|
| 169 |
+
Handles variations:
|
| 170 |
+
- Spaces in numbers: "3 256,40"
|
| 171 |
+
- Different separators: "-", ":", "*"
|
| 172 |
+
- Numbers without decimals: "172312"
|
| 173 |
|
| 174 |
Args:
|
| 175 |
text: Extracted text from OCR
|
|
|
|
| 180 |
portfolio = {}
|
| 181 |
lines = text.split('\n')
|
| 182 |
|
| 183 |
+
# Process lines
|
| 184 |
i = 0
|
| 185 |
while i < len(lines):
|
| 186 |
current_line = lines[i].strip()
|
| 187 |
|
| 188 |
# Look for portfolio value line (contains amount with $, €, £)
|
| 189 |
+
# Handle spaces in numbers: "3 256,40" or "172312" or "3420,14"
|
| 190 |
+
value_match = re.search(r'([\d\s,]+(?:[.,]\d{1,2})?)\s*[\$€£]', current_line)
|
| 191 |
|
| 192 |
+
if value_match:
|
| 193 |
portfolio_value_str = value_match.group(1)
|
| 194 |
|
| 195 |
+
# Clean portfolio value:
|
| 196 |
+
# 1. Remove spaces: "3 256,40" -> "3256,40"
|
| 197 |
+
clean_value = portfolio_value_str.replace(' ', '')
|
| 198 |
|
| 199 |
+
# 2. Handle numbers without decimal separators
|
| 200 |
+
# If no decimal (. or ,) and more than 2 digits, assume last 2 are cents
|
| 201 |
+
# Example: "172312" -> "1723.12"
|
| 202 |
+
if not re.search(r'[.,]', clean_value) and len(clean_value) > 2:
|
| 203 |
+
# Insert decimal before last 2 digits
|
| 204 |
+
clean_value = clean_value[:-2] + '.' + clean_value[-2:]
|
| 205 |
+
else:
|
| 206 |
+
# 3. Replace comma with dot for European format: "3256,40" -> "3256.40"
|
| 207 |
+
clean_value = clean_value.replace(',', '.')
|
| 208 |
|
| 209 |
+
try:
|
| 210 |
+
amount = float(clean_value)
|
| 211 |
+
if amount < 1: # Skip invalid amounts
|
| 212 |
+
i += 1
|
| 213 |
+
continue
|
| 214 |
+
except ValueError:
|
| 215 |
+
i += 1
|
| 216 |
+
continue
|
| 217 |
|
| 218 |
+
# Look ahead 1-2 lines for ticker
|
| 219 |
+
ticker_found = False
|
| 220 |
+
for lookahead in range(1, 3): # Check next 1-2 lines
|
| 221 |
+
if i + lookahead >= len(lines):
|
| 222 |
+
break
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
+
check_line = lines[i + lookahead].strip()
|
| 225 |
+
|
| 226 |
+
# Match ticker patterns (more flexible):
|
| 227 |
+
# "8,31 MU - 411,50" or "52,03 AMKR: 51$" or "GOOGL* 335,15"
|
| 228 |
+
# Ticker can be followed by: -, :, *, space, or «
|
| 229 |
+
ticker_match = re.search(r'[\d,]+[.,]?\d*\s+([A-Z]{2,5})[\s\-–:*«]', check_line)
|
| 230 |
|
| 231 |
+
if ticker_match:
|
| 232 |
+
ticker = ticker_match.group(1)
|
| 233 |
+
|
| 234 |
+
# Validate ticker (not a word fragment or common false positive)
|
| 235 |
+
if len(ticker) >= 2 and ticker not in ['AM', 'PM', 'USD', 'EUR', 'GBP', 'JPY']:
|
| 236 |
+
portfolio[ticker] = amount
|
| 237 |
+
ticker_found = True
|
| 238 |
+
i += lookahead + 1 # Skip processed lines
|
| 239 |
+
break
|
| 240 |
+
|
| 241 |
+
if not ticker_found:
|
| 242 |
+
i += 1
|
| 243 |
+
else:
|
| 244 |
+
i += 1
|
| 245 |
|
| 246 |
return portfolio
|
| 247 |
|