Spaces:

ResearchEngineering
/

financial_analyst

Sleeping

App Files Files Community

Dmitry Beresnev committed 22 days ago

Commit

bd3f2a3

1 Parent(s): 71f44e2

fix OCR module

Browse files

Files changed (2) hide show

app.py +27 -7
ocr_parser.py +128 -30

app.py CHANGED Viewed

@@ -86,25 +86,45 @@ with col1:
         if error:
             st.error(f"❌ {error}")
         else:
-            # Show extracted text
-            with st.expander("📄 Extracted Text"):
-                st.text(text)
             # Parse portfolio
             portfolio = ocr_parser.parse_portfolio(text)
             if portfolio:
-                st.success(f"✅ Found {len(portfolio)} tickers")
                 st.session_state.portfolio_data = portfolio
             else:
-                st.warning("⚠️ No valid tickers found. Please edit manually below.")
                 st.session_state.portfolio_data = {}
 with col2:
     st.subheader("✏️ Edit Portfolio (JSON)")
     # Get initial JSON value
-    if st.session_state.portfolio_data is not None:
         initial_json = ocr_parser.format_portfolio_json(st.session_state.portfolio_data)
     else:
         # Default example
@@ -118,7 +138,7 @@ with col2:
     edited_json = st.text_area(
         "Portfolio (JSON format)",
         value=initial_json,
-        height=300,
         help="Edit the portfolio in JSON format: {\"TICKER\": amount, ...}"
     )

         if error:
             st.error(f"❌ {error}")
         else:
+            # Show extracted text prominently
+            st.info("📄 **Extracted Text from Image:**")
+            st.text_area("Raw OCR Output", text, height=150, disabled=True)
             # Parse portfolio
             portfolio = ocr_parser.parse_portfolio(text)
             if portfolio:
+                st.success(f"✅ Found {len(portfolio)} tickers: {', '.join(portfolio.keys())}")
+                st.json(portfolio)
                 st.session_state.portfolio_data = portfolio
             else:
+                st.warning("⚠️ **No valid tickers found in the image.**")
+                st.info("""
+                **Possible reasons:**
+                - Tickers are not in uppercase (e.g., 'aapl' instead of 'AAPL')
+                - Company names instead of ticker symbols (e.g., 'Apple Inc.' instead of 'AAPL')
+                - Unusual formatting or layout
+                - Poor image quality
+                **Solution:** Please manually enter your portfolio in the JSON editor below.
+                """)
                 st.session_state.portfolio_data = {}
 with col2:
     st.subheader("✏️ Edit Portfolio (JSON)")
+    st.info("""
+    **Format:** `{"TICKER": amount, ...}`
+    **Important:**
+    - Use **ticker symbols** (e.g., AAPL, GOOGL, MSFT)
+    - NOT company names (e.g., ❌ "Apple Inc.")
+    - Tickers must be UPPERCASE
+    - Amounts in your portfolio currency
+    """)
     # Get initial JSON value
+    if st.session_state.portfolio_data is not None and len(st.session_state.portfolio_data) > 0:
         initial_json = ocr_parser.format_portfolio_json(st.session_state.portfolio_data)
     else:
         # Default example
     edited_json = st.text_area(
         "Portfolio (JSON format)",
         value=initial_json,
+        height=250,
         help="Edit the portfolio in JSON format: {\"TICKER\": amount, ...}"
     )

ocr_parser.py CHANGED Viewed

@@ -5,24 +5,72 @@ Handles:
 - Text extraction from portfolio screenshots using Tesseract OCR
 - Parsing tickers and amounts using regex
 - JSON validation for user-edited portfolio data
 """
 import re
 import json
 from typing import Dict, Tuple, Optional
-from PIL import Image
 import pytesseract
-# Regex pattern for ticker extraction: ([A-Z]{1,5})\s+([\d,.]+)
-# Matches: 1-5 uppercase letters followed by whitespace and a number (with optional commas)
-TICKER_PATTERN = r'([A-Z]{1,5})\s+([\d,.]+)'
 def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
     """
     Extract text from uploaded portfolio screenshot using Tesseract OCR.
     Args:
         image: PIL Image object
@@ -35,12 +83,26 @@ def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional
         # Verify tesseract is available
         pytesseract.get_tesseract_version()
-        # Extract text
-        text = pytesseract.image_to_string(image)
         # Check if any text was detected
         if not text.strip():
-            return None, "No text detected in image. Please upload a clearer screenshot."
         return text, None
@@ -52,10 +114,13 @@ def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional
 def parse_portfolio(text: str) -> Dict[str, float]:
     """
-    Parse portfolio from extracted text using regex.
-    Pattern: ([A-Z]{1,5})\\s+([\\d,.]+)
-    Extracts ticker symbols (1-5 uppercase letters) and amounts (numbers with optional commas).
     Args:
         text: Extracted text from OCR
@@ -67,29 +132,62 @@ def parse_portfolio(text: str) -> Dict[str, float]:
     if not text:
         return {}
-    # Find all matches of pattern
-    matches = re.findall(TICKER_PATTERN, text)
-    if not matches:
-        return {}
     portfolio = {}
-    for ticker, amount_str in matches:
-        try:
-            # Remove commas from numbers (e.g., "1,234.56" -> "1234.56")
-            clean_amount = amount_str.replace(",", "")
-            amount = float(clean_amount)
-            # Only include positive amounts
-            if amount > 0:
-                portfolio[ticker] = amount
-        except ValueError:
-            # Skip invalid number formats
-            continue
-    return portfolio
 def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]:

 - Text extraction from portfolio screenshots using Tesseract OCR
 - Parsing tickers and amounts using regex
 - JSON validation for user-edited portfolio data
+- Image preprocessing for better OCR accuracy
 """
 import re
 import json
 from typing import Dict, Tuple, Optional
+from PIL import Image, ImageEnhance, ImageFilter
 import pytesseract
+import numpy as np
+# Multiple regex patterns to handle different formats
+TICKER_PATTERNS = [
+    # Pattern 1: Ticker followed by amount (AAPL 5000 or AAPL $5,000.00)
+    r'([A-Z]{1,5})\s*[\$€£]?\s*([\d,]+\.?\d*)',
+    # Pattern 2: Amount followed by ticker ($5,000 AAPL)
+    r'[\$€£]?\s*([\d,]+\.?\d*)\s+([A-Z]{1,5})',
+    # Pattern 3: Ticker on one line, amount on next (multi-line)
+    r'([A-Z]{1,5})\s*\n\s*[\$€£]?\s*([\d,]+\.?\d*)',
+    # Pattern 4: With separators (AAPL | $5,000.00)
+    r'([A-Z]{1,5})\s*[:|]\s*[\$€£]?\s*([\d,]+\.?\d*)',
+]
+def preprocess_image(image: Image.Image) -> Image.Image:
+    """
+    Preprocess image for better OCR accuracy.
+    Applies:
+    - Grayscale conversion
+    - Contrast enhancement
+    - Sharpening
+    - Upscaling of images smaller than 800 px in width or height
+    Args:
+        image: PIL Image object
+    Returns:
+        Preprocessed PIL Image object
+    """
+    # Convert to grayscale
+    image = image.convert('L')
+    # Increase contrast
+    enhancer = ImageEnhance.Contrast(image)
+    image = enhancer.enhance(2.0)
+    # Sharpen
+    image = image.filter(ImageFilter.SHARPEN)
+    # Resize if image is too small (helps with OCR)
+    width, height = image.size
+    if width < 800 or height < 800:
+        scale = max(800 / width, 800 / height)
+        new_size = (int(width * scale), int(height * scale))
+        image = image.resize(new_size, Image.Resampling.LANCZOS)
+    return image
 def extract_text_from_image(image: Image.Image) -> Tuple[Optional[str], Optional[str]]:
     """
     Extract text from uploaded portfolio screenshot using Tesseract OCR.
+    Uses image preprocessing and custom Tesseract config for better accuracy.
     Args:
         image: PIL Image object
         # Verify tesseract is available
         pytesseract.get_tesseract_version()
+        # Preprocess image for better OCR
+        processed_image = preprocess_image(image)
+        # Custom Tesseract configuration for better accuracy
+        # --psm 6: Assume a single uniform block of text
+        # --oem 3: Use default OCR Engine mode
+        custom_config = r'--oem 3 --psm 6'
+        # Extract text with custom config
+        text = pytesseract.image_to_string(processed_image, config=custom_config)
+        # If first attempt fails, try with different PSM mode
+        if not text.strip():
+            # PSM 4: Assume a single column of text of variable sizes
+            custom_config = r'--oem 3 --psm 4'
+            text = pytesseract.image_to_string(processed_image, config=custom_config)
         # Check if any text was detected
         if not text.strip():
+            return None, "No text detected in image. Please upload a clearer screenshot or enter data manually."
         return text, None
 def parse_portfolio(text: str) -> Dict[str, float]:
     """
+    Parse portfolio from extracted text using multiple regex patterns.
+    Tries various patterns to handle different screenshot formats:
+    - Ticker followed by amount: "AAPL 5000" or "AAPL $5,000.00"
+    - Amount followed by ticker: "$5,000 AAPL"
+    - Multi-line format: ticker on one line, amount on next
+    - With separators: "AAPL | $5,000.00"
     Args:
         text: Extracted text from OCR
     if not text:
         return {}
     portfolio = {}
+    # Try each pattern
+    for pattern in TICKER_PATTERNS:
+        matches = re.findall(pattern, text, re.MULTILINE | re.IGNORECASE)
+        for match in matches:
+            try:
+                # Determine which group is ticker and which is amount
+                # Check which one looks like a number
+                group1, group2 = match
+                # Check if group1 is a number (amount first format)
+                if re.match(r'^[\d,.]+$', group1):
+                    amount_str = group1
+                    ticker = group2.upper()
+                else:
+                    ticker = group1.upper()
+                    amount_str = group2
+                # Validate ticker (1-10 uppercase letters)
+                if not re.match(r'^[A-Z]{1,10}$', ticker):
+                    continue
+                # Clean and parse amount
+                # Remove currency symbols, commas, spaces
+                clean_amount = re.sub(r'[\$€£,\s]', '', amount_str)
+                # Convert to float
+                amount = float(clean_amount)
+                # Only include positive amounts > 1 (filter out percentages, etc.)
+                if amount > 1:
+                    # If ticker already exists, keep the larger amount
+                    if ticker not in portfolio or amount > portfolio[ticker]:
+                        portfolio[ticker] = amount
+            except (ValueError, IndexError, AttributeError):
+                # Skip invalid matches
+                continue
+    # Additional heuristics: filter out common false positives
+    # Remove entries that look like dates, IDs, etc.
+    false_positive_patterns = [
+        r'^ID$', r'^USD$', r'^EUR$', r'^GBP$', r'^JPY$',  # Currency codes
+        r'^AM$', r'^PM$',  # Time indicators
+        r'^(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)$',  # Months
+    ]
+    filtered_portfolio = {}
+    for ticker, amount in portfolio.items():
+        is_false_positive = any(re.match(pattern, ticker) for pattern in false_positive_patterns)
+        if not is_false_positive:
+            filtered_portfolio[ticker] = amount
+    return filtered_portfolio
 def validate_portfolio_json(json_str: str) -> Tuple[bool, Optional[Dict[str, float]], str]: