Dmitry Beresnev committed on
Commit
bd3f2a3
·
1 Parent(s): 71f44e2

fix OCR module

Browse files
Files changed (2) hide show
  1. app.py +27 -7
  2. ocr_parser.py +128 -30
app.py CHANGED
@@ -86,25 +86,45 @@ with col1:
86
  if error:
87
  st.error(f"❌ {error}")
88
  else:
89
- # Show extracted text
90
- with st.expander("📄 Extracted Text"):
91
- st.text(text)
92
 
93
  # Parse portfolio
94
  portfolio = ocr_parser.parse_portfolio(text)
95
 
96
  if portfolio:
97
- st.success(f"✅ Found {len(portfolio)} tickers")
 
98
  st.session_state.portfolio_data = portfolio
99
  else:
100
- st.warning("⚠️ No valid tickers found. Please edit manually below.")
 
 
 
 
 
 
 
 
 
101
  st.session_state.portfolio_data = {}
102
 
103
  with col2:
104
  st.subheader("✏️ Edit Portfolio (JSON)")
105
 
 
 
 
 
 
 
 
 
 
 
106
  # Get initial JSON value
107
- if st.session_state.portfolio_data is not None:
108
  initial_json = ocr_parser.format_portfolio_json(st.session_state.portfolio_data)
109
  else:
110
  # Default example
@@ -118,7 +138,7 @@ with col2:
118
  edited_json = st.text_area(
119
  "Portfolio (JSON format)",
120
  value=initial_json,
121
- height=300,
122
  help="Edit the portfolio in JSON format: {\"TICKER\": amount, ...}"
123
  )
124
 
 
86
  if error:
87
  st.error(f"❌ {error}")
88
  else:
89
+ # Show extracted text prominently
90
+ st.info("📄 **Extracted Text from Image:**")
91
+ st.text_area("Raw OCR Output", text, height=150, disabled=True)
92
 
93
  # Parse portfolio
94
  portfolio = ocr_parser.parse_portfolio(text)
95
 
96
  if portfolio:
97
+ st.success(f"✅ Found {len(portfolio)} tickers: {', '.join(portfolio.keys())}")
98
+ st.json(portfolio)
99
  st.session_state.portfolio_data = portfolio
100
  else:
101
+ st.warning("⚠️ **No valid tickers found in the image.**")
102
+ st.info("""
103
+ **Possible reasons:**
104
+ - Tickers are not in uppercase (e.g., 'aapl' instead of 'AAPL')
105
+ - Company names instead of ticker symbols (e.g., 'Apple Inc.' instead of 'AAPL')
106
+ - Unusual formatting or layout
107
+ - Poor image quality
108
+
109
+ **Solution:** Please manually enter your portfolio in the JSON editor below.
110
+ """)
111
  st.session_state.portfolio_data = {}
112
 
113
  with col2:
114
  st.subheader("✏️ Edit Portfolio (JSON)")
115
 
116
+ st.info("""
117
+ **Format:** `{"TICKER": amount, ...}`
118
+
119
+ **Important:**
120
+ - Use **ticker symbols** (e.g., AAPL, GOOGL, MSFT)
121
+ - NOT company names (e.g., ❌ "Apple Inc.")
122
+ - Tickers must be UPPERCASE
123
+ - Amounts in your portfolio currency
124
+ """)
125
+
126
  # Get initial JSON value
127
+ if st.session_state.portfolio_data is not None and len(st.session_state.portfolio_data) > 0:
128
  initial_json = ocr_parser.format_portfolio_json(st.session_state.portfolio_data)
129
  else:
130
  # Default example
 
138
  edited_json = st.text_area(
139
  "Portfolio (JSON format)",
140
  value=initial_json,
141
+ height=250,
142
  help="Edit the portfolio in JSON format: {\"TICKER\": amount, ...}"
143
  )
144
 
ocr_parser.py CHANGED
@@ -5,24 +5,72 @@ Handles:
5
  - Text extraction from portfolio screenshots using Tesseract OCR
6
  - Parsing tickers and amounts using regex
7
  - JSON validation for user-edited portfolio data
 
8
  """
9
 
10
  import re
11
  import json
12
  from typing import Dict, Tuple, Optional
13
- from PIL import Image
14
  import pytesseract
 
15
 
16
 
17
- # Regex pattern for ticker extraction: ([A-Z]{1,5})\s+([\d,.]+)
18
- # Matches: 1-5 uppercase letters followed by whitespace and a number (with optional commas)
19
- TICKER_PATTERN = r'([A-Z]{1,5})\s+([\d,.]+)'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
23
  """
24
  Extract text from uploaded portfolio screenshot using Tesseract OCR.
25
 
 
 
26
  Args:
27
  image: PIL Image object
28
 
@@ -35,12 +83,26 @@ def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional
35
  # Verify tesseract is available
36
  pytesseract.get_tesseract_version()
37
 
38
- # Extract text
39
- text = pytesseract.image_to_string(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  # Check if any text was detected
42
  if not text.strip():
43
- return None, "No text detected in image. Please upload a clearer screenshot."
44
 
45
  return text, None
46
 
@@ -52,10 +114,13 @@ def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional
52
 
53
  def parse_portfolio(text: str) -> Dict[str, float]:
54
  """
55
- Parse portfolio from extracted text using regex.
56
 
57
- Pattern: ([A-Z]{1,5})\\s+([\\d,.]+)
58
- Extracts ticker symbols (1-5 uppercase letters) and amounts (numbers with optional commas).
 
 
 
59
 
60
  Args:
61
  text: Extracted text from OCR
@@ -67,29 +132,62 @@ def parse_portfolio(text: str) -> Dict[str, float]:
67
  if not text:
68
  return {}
69
 
70
- # Find all matches of pattern
71
- matches = re.findall(TICKER_PATTERN, text)
72
-
73
- if not matches:
74
- return {}
75
-
76
  portfolio = {}
77
 
78
- for ticker, amount_str in matches:
79
- try:
80
- # Remove commas from numbers (e.g., "1,234.56" -> "1234.56")
81
- clean_amount = amount_str.replace(",", "")
82
- amount = float(clean_amount)
83
 
84
- # Only include positive amounts
85
- if amount > 0:
86
- portfolio[ticker] = amount
87
-
88
- except ValueError:
89
- # Skip invalid number formats
90
- continue
91
-
92
- return portfolio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
  def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]:
 
5
  - Text extraction from portfolio screenshots using Tesseract OCR
6
  - Parsing tickers and amounts using regex
7
  - JSON validation for user-edited portfolio data
8
+ - Image preprocessing for better OCR accuracy
9
  """
10
 
11
  import re
12
  import json
13
  from typing import Dict, Tuple, Optional
14
+ from PIL import Image, ImageEnhance, ImageFilter
15
  import pytesseract
16
+ import numpy as np
17
 
18
 
19
+ # Multiple regex patterns to handle different formats
20
+ TICKER_PATTERNS = [
21
+ # Pattern 1: Ticker followed by amount (AAPL 5000 or AAPL $5,000.00)
22
+ r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)',
23
+ # Pattern 2: Amount followed by ticker ($5,000 AAPL)
24
+ r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})',
25
+ # Pattern 3: Ticker on one line, amount on next (multi-line)
26
+ r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)',
27
+ # Pattern 4: With separators (AAPL | $5,000.00)
28
+ r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)',
29
+ ]
30
+
31
+
32
+ def preprocess_image(image: Image.Image) -> Image.Image:
33
+ """
34
+ Preprocess image for better OCR accuracy.
35
+
36
+ Applies:
37
+ - Grayscale conversion
38
+ - Contrast enhancement
39
+ - Sharpening
40
+ - Upscaling of small images (width or height below 800 px)
41
+
42
+ Args:
43
+ image: PIL Image object
44
+
45
+ Returns:
46
+ Preprocessed PIL Image object
47
+ """
48
+ # Convert to grayscale
49
+ image = image.convert('L')
50
+
51
+ # Increase contrast
52
+ enhancer = ImageEnhance.Contrast(image)
53
+ image = enhancer.enhance(2.0)
54
+
55
+ # Sharpen
56
+ image = image.filter(ImageFilter.SHARPEN)
57
+
58
+ # Resize if image is too small (helps with OCR)
59
+ width, height = image.size
60
+ if width < 800 or height < 800:
61
+ scale = max(800 / width, 800 / height)
62
+ new_size = (int(width * scale), int(height * scale))
63
+ image = image.resize(new_size, Image.Resampling.LANCZOS)
64
+
65
+ return image
66
 
67
 
68
  def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
69
  """
70
  Extract text from uploaded portfolio screenshot using Tesseract OCR.
71
 
72
+ Uses image preprocessing and custom Tesseract config for better accuracy.
73
+
74
  Args:
75
  image: PIL Image object
76
 
 
83
  # Verify tesseract is available
84
  pytesseract.get_tesseract_version()
85
 
86
+ # Preprocess image for better OCR
87
+ processed_image = preprocess_image(image)
88
+
89
+ # Custom Tesseract configuration for better accuracy
90
+ # --psm 6: Assume a single uniform block of text
91
+ # --oem 3: Use default OCR Engine mode
92
+ custom_config = r'--oem 3 --psm 6'
93
+
94
+ # Extract text with custom config
95
+ text = pytesseract.image_to_string(processed_image, config=custom_config)
96
+
97
+ # If the first attempt returns no text, retry with a different PSM mode
98
+ if not text.strip():
99
+ # PSM 4: Assume a single column of text of variable sizes
100
+ custom_config = r'--oem 3 --psm 4'
101
+ text = pytesseract.image_to_string(processed_image, config=custom_config)
102
 
103
  # Check if any text was detected
104
  if not text.strip():
105
+ return None, "No text detected in image. Please upload a clearer screenshot or enter data manually."
106
 
107
  return text, None
108
 
 
114
 
115
  def parse_portfolio(text: str) -> Dict[str, float]:
116
  """
117
+ Parse portfolio from extracted text using multiple regex patterns.
118
 
119
+ Tries various patterns to handle different screenshot formats:
120
+ - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00"
121
+ - Amount followed by ticker: "$5,000 AAPL"
122
+ - Multi-line format: ticker on one line, amount on next
123
+ - With separators: "AAPL | $5,000.00"
124
 
125
  Args:
126
  text: Extracted text from OCR
 
132
  if not text:
133
  return {}
134
 
 
 
 
 
 
 
135
  portfolio = {}
136
 
137
+ # Try each pattern
138
+ for pattern in TICKER_PATTERNS:
139
+ matches = re.findall(pattern, text, re.MULTILINE | re.IGNORECASE)
 
 
140
 
141
+ for match in matches:
142
+ try:
143
+ # Determine which group is ticker and which is amount
144
+ # Check which one looks like a number
145
+ group1, group2 = match
146
+
147
+ # Check if group1 is a number (amount first format)
148
+ if re.match(r'^[\d,.]+$', group1):
149
+ amount_str = group1
150
+ ticker = group2.upper()
151
+ else:
152
+ ticker = group1.upper()
153
+ amount_str = group2
154
+
155
+ # Validate ticker (1-10 uppercase letters)
156
+ if not re.match(r'^[A-Z]{1,10}$', ticker):
157
+ continue
158
+
159
+ # Clean and parse amount
160
+ # Remove currency symbols, commas, spaces
161
+ clean_amount = re.sub(r'[\$€£,\s]', '', amount_str)
162
+
163
+ # Convert to float
164
+ amount = float(clean_amount)
165
+
166
+ # Only include positive amounts > 1 (filter out percentages, etc.)
167
+ if amount > 1:
168
+ # If ticker already exists, keep the larger amount
169
+ if ticker not in portfolio or amount > portfolio[ticker]:
170
+ portfolio[ticker] = amount
171
+
172
+ except (ValueError, IndexError, AttributeError):
173
+ # Skip invalid matches
174
+ continue
175
+
176
+ # Additional heuristics: filter out common false positives
177
+ # Remove entries that look like dates, IDs, etc.
178
+ false_positive_patterns = [
179
+ r'^ID$', r'^USD$', r'^EUR$', r'^GBP$', r'^JPY$', # Currency codes
180
+ r'^AM$', r'^PM$', # Time indicators
181
+ r'^JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC$', # Months
182
+ ]
183
+
184
+ filtered_portfolio = {}
185
+ for ticker, amount in portfolio.items():
186
+ is_false_positive = any(re.match(pattern, ticker) for pattern in false_positive_patterns)
187
+ if not is_false_positive:
188
+ filtered_portfolio[ticker] = amount
189
+
190
+ return filtered_portfolio
191
 
192
 
193
  def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]: