Spaces:

zz2232
/

UniversalOcrApp

Running

App Files Community

zz2232 committed on Feb 14

Commit

7d3fb90

verified ·

1 Parent(s): 8e1cf48

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +283 -192

src/streamlit_app.py CHANGED Viewed

@@ -39,7 +39,8 @@ DOCUMENT_TEMPLATES = {
         "name": "ID Card / Passport",
         "description": "Extract structured data from identity documents",
         "prompt": """Extract structured data from this identity document.
-Output ONLY valid JSON:
 {
   "document_type": "",
   "full_name": "",
@@ -48,8 +49,11 @@ Output ONLY valid JSON:
   "date_of_expiry": "",
   "nationality": "",
   "document_number": "",
-  "additional_info": {}
-}""",
         "icon": "🆔"
     },
@@ -57,6 +61,7 @@ Output ONLY valid JSON:
         "name": "Receipt",
         "description": "Extract items, prices, and totals from receipts",
         "prompt": """Extract information from this receipt.
 Output ONLY valid JSON:
 {
   "merchant_name": "",
@@ -77,6 +82,7 @@ Output ONLY valid JSON:
         "name": "Invoice",
         "description": "Extract invoice details and line items",
         "prompt": """Extract information from this invoice.
 Output ONLY valid JSON:
 {
   "invoice_number": "",
@@ -106,6 +112,7 @@ Output ONLY valid JSON:
         "name": "Business Card",
         "description": "Extract contact information",
         "prompt": """Extract contact information from this business card.
 Output ONLY valid JSON:
 {
   "name": "",
@@ -125,6 +132,7 @@ Output ONLY valid JSON:
         "name": "Form",
         "description": "Extract filled form data",
         "prompt": """Extract all fields and values from this form.
 Output ONLY valid JSON with field names as keys and filled values:
 {
   "field_name": "value"
@@ -133,7 +141,7 @@ Output ONLY valid JSON with field names as keys and filled values:
     },
     DocumentType.HANDWRITTEN: {
-        "name": "✍️ Handwritten Note",
         "description": "Extract text from handwritten documents",
         "prompt": "Extract all handwritten text from this image. Output plain text, preserving line breaks.",
         "icon": "✍️"
@@ -192,19 +200,21 @@ def preprocess_image(
 ) -> Image.Image:
     """
     Preprocess image with optional enhancements
     Args:
         image: PIL Image
         enhance_contrast: Apply CLAHE contrast enhancement
         denoise: Apply denoising
         sharpen: Apply sharpening
         auto_rotate: Attempt to auto-rotate text to horizontal
     Returns:
         Preprocessed PIL Image
     """
     if prevent_cropping and not auto_rotate:
         raise Exception(f"Auto-Rotate must be enabled when Prevent-Cropping is active")
-    # Convert to OpenCV format
     img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
@@ -284,10 +294,12 @@ def extract_text(
 ) -> tuple[str, int]:
     """
     Extract text from image using GLM-OCR
     Args:
         image: PIL Image
         prompt: Extraction prompt
         max_tokens: Maximum tokens to generate
     Returns:
         Tuple of (extracted_text, processing_time_ms)
     """
@@ -353,6 +365,20 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
 # Header
 st.title("🔍 Universal OCR Scanner")
 st.markdown("Extract text and structured data from **any document** - receipts, IDs, invoices, forms, and more!")
@@ -387,7 +413,7 @@ with st.sidebar:
         auto_rotate = st.checkbox("Auto-Rotate", value=False,
                                   help="Automatically straighten tilted documents")
         prevent_cropping = st.checkbox("Prevent-Cropping", value=False,
-                                  help="Prevent cropping when rotate")
     st.markdown("---")
@@ -426,11 +452,17 @@ with col1:
         )
         if uploaded_file is not None:
             image = Image.open(uploaded_file).convert("RGB")
     with camera_tab:
         camera_picture = st.camera_input("Take a photo")
         if camera_picture is not None:
             image = Image.open(BytesIO(camera_picture.getvalue())).convert("RGB")
     # Show original image
     if image is not None:
@@ -445,7 +477,8 @@ with col2:
             "Custom Extraction Prompt:",
             value=DOCUMENT_TEMPLATES[doc_type]['prompt'],
             height=200,
-            help="Customize how the OCR extracts data"
         )
     else:
         prompt = DOCUMENT_TEMPLATES[doc_type]['prompt']
@@ -453,18 +486,23 @@ with col2:
     # Process button
     if image is not None:
-        process_button = st.button(
-            "🚀 Extract Text",
-            type="primary",
-            width="content"
-        )
     else:
         st.info("👆 Upload or capture an image to begin")
-        process_button = False
-# Processing
-if image is not None and process_button:
-    with st.spinner("🔄 Processing document...", width="content"):
         try:
             # Preprocess image
             if enhance_contrast or denoise or sharpen or auto_rotate or prevent_cropping:
@@ -495,192 +533,245 @@ if image is not None and process_button:
                 max_tokens=max_tokens
             )
-            # Display results
-            st.success(f"✅ Extraction complete! ({processing_time}ms)")
-            # Try to parse as JSON for structured documents
-            is_json = False
-            parsed_data = None
-            if doc_type in [DocumentType.ID_CARD, DocumentType.RECEIPT,
-                            DocumentType.INVOICE, DocumentType.BUSINESS_CARD,
-                            DocumentType.FORM]:
-                try:
-                    # Clean JSON from markdown
-                    clean_text = output_text
-                    if "```json" in clean_text:
-                        clean_text = clean_text.split("```json")[1].split("```")[0].strip()
-                    elif "```" in clean_text:
-                        clean_text = clean_text.split("```")[1].split("```")[0].strip()
-                    parsed_data = json.loads(clean_text)
-                    is_json = True
-                except json.JSONDecodeError:
-                    is_json = False
-            # Display based on type
-            st.markdown("---")
-            st.subheader("📄 Extracted Data")
-            if is_json and parsed_data:
-                # Structured data display
-                col_display, col_download = st.columns([2, 1])
-                with col_display:
-                    # Format display based on document type
-                    if doc_type == DocumentType.RECEIPT:
-                        st.markdown("### 🧾 Receipt Details")
-                        # Merchant info
-                        if "merchant_name" in parsed_data:
-                            st.markdown(f"**Merchant:** {parsed_data['merchant_name']}")
-                        if "date" in parsed_data:
-                            st.markdown(f"**Date:** {parsed_data['date']}")
-                        if "time" in parsed_data:
-                            st.markdown(f"**Time:** {parsed_data['time']}")
-                        # Items table
-                        if "items" in parsed_data and parsed_data["items"]:
-                            st.markdown("**Items:**")
-                            items_df = pd.DataFrame(parsed_data["items"])
-                            st.dataframe(items_df, width="content", hide_index=True)
-                        # Totals
-                        st.markdown("---")
-                        if "subtotal" in parsed_data:
-                            st.markdown(f"**Subtotal:** ${parsed_data['subtotal']:.2f}")
-                        if "tax" in parsed_data:
-                            st.markdown(f"**Tax:** ${parsed_data['tax']:.2f}")
-                        if "total" in parsed_data:
-                            st.markdown(f"**Total:** ${parsed_data['total']:.2f}")
-                    elif doc_type == DocumentType.INVOICE:
-                        st.markdown("### 📋 Invoice Details")
-                        col_inv1, col_inv2 = st.columns(2)
-                        with col_inv1:
-                            st.markdown("**Invoice Info:**")
-                            if "invoice_number" in parsed_data:
-                                st.text(f"Number: {parsed_data['invoice_number']}")
-                            if "date" in parsed_data:
-                                st.text(f"Date: {parsed_data['date']}")
-                            if "due_date" in parsed_data:
-                                st.text(f"Due: {parsed_data['due_date']}")
-                        with col_inv2:
-                            if "vendor" in parsed_data:
-                                st.markdown("**Vendor:**")
-                                vendor = parsed_data["vendor"]
-                                if isinstance(vendor, dict):
-                                    for k, v in vendor.items():
-                                        if v:
-                                            st.text(f"{k.title()}: {v}")
-                        # Line items
-                        if "line_items" in parsed_data and parsed_data["line_items"]:
-                            st.markdown("**Line Items:**")
-                            items_df = pd.DataFrame(parsed_data["line_items"])
-                            st.dataframe(items_df, width="content", hide_index=True)
-                        # Total
-                        if "total" in parsed_data:
-                            st.markdown(f"### **Total: ${parsed_data['total']:.2f}**")
                     else:
-                        # Generic structured data display
-                        for key, value in parsed_data.items():
-                            if isinstance(value, dict):
-                                st.markdown(f"**{key.replace('_', ' ').title()}:**")
-                                for k, v in value.items():
-                                    st.text(f"  {k}: {v}")
-                            elif isinstance(value, list):
-                                st.markdown(f"**{key.replace('_', ' ').title()}:**")
-                                if value and isinstance(value[0], dict):
-                                    df = pd.DataFrame(value)
-                                    st.dataframe(df, width="content", hide_index=True)
-                                else:
-                                    for item in value:
-                                        st.text(f"  • {item}")
-                            else:
-                                st.markdown(f"**{key.replace('_', ' ').title()}:** {value}")
-                with col_download:
-                    st.subheader("💾 Downloads")
-                    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-                    # JSON download
-                    json_str = json.dumps(parsed_data, ensure_ascii=False, indent=2)
-                    st.download_button(
-                        label="📄 JSON",
-                        data=json_str,
-                        file_name=f"{doc_type.value}_{timestamp}.json",
-                        mime="application/json",
-                        width="content"
-                    )
-                    # CSV download (flattened)
-                    try:
-                        # Flatten nested structures
-                        flat_data = {}
-                        for k, v in parsed_data.items():
-                            if isinstance(v, (dict, list)):
-                                flat_data[k] = json.dumps(v, ensure_ascii=False)
-                            else:
-                                flat_data[k] = v
-                        df = pd.DataFrame([flat_data])
-                        csv_buffer = StringIO()
-                        df.to_csv(csv_buffer, index=False, encoding='utf-8')
-                        st.download_button(
-                            label="📊 CSV",
-                            data=csv_buffer.getvalue(),
-                            file_name=f"{doc_type.value}_{timestamp}.csv",
-                            mime="text/csv",
-                            width="content"
-                        )
-                    except:
-                        pass
-                    # Raw text download
-                    st.download_button(
-                        label="📝 TXT",
-                        data=output_text,
-                        file_name=f"{doc_type.value}_{timestamp}.txt",
-                        mime="text/plain",
-                        width="content"
-                    )
-                # Show raw JSON in expander
-                with st.expander("🔍 View Raw JSON"):
-                    st.json(parsed_data)
             else:
-                # Plain text display
-                st.text_area(
-                    "Extracted Text:",
-                    value=output_text,
-                    height=400,
-                    label_visibility="collapsed"
-                )
-                # Download
-                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                 st.download_button(
-                    label="💾 Download as TXT",
-                    data=output_text,
-                    file_name=f"extracted_text_{timestamp}.txt",
-                    mime="text/plain"
                 )
-        except Exception as e:
-            st.error(f"❌ Error during extraction: {str(e)}")
-            import traceback
-            with st.expander("Show Error Details"):
-                st.code(traceback.format_exc())
 # Footer
 st.markdown("---")

         "name": "ID Card / Passport",
         "description": "Extract structured data from identity documents",
         "prompt": """Extract structured data from this identity document.
+Output ONLY valid JSON with these exact fields, no nested objects:
 {
   "document_type": "",
   "full_name": "",
   "date_of_expiry": "",
   "nationality": "",
   "document_number": "",
+  "place_of_birth": "",
+  "personal_number": ""
+}
+IMPORTANT: Do NOT create nested or recursive structures. Keep it flat and simple.""",
         "icon": "🆔"
     },
         "name": "Receipt",
         "description": "Extract items, prices, and totals from receipts",
         "prompt": """Extract information from this receipt.
 Output ONLY valid JSON:
 {
   "merchant_name": "",
         "name": "Invoice",
         "description": "Extract invoice details and line items",
         "prompt": """Extract information from this invoice.
 Output ONLY valid JSON:
 {
   "invoice_number": "",
         "name": "Business Card",
         "description": "Extract contact information",
         "prompt": """Extract contact information from this business card.
 Output ONLY valid JSON:
 {
   "name": "",
         "name": "Form",
         "description": "Extract filled form data",
         "prompt": """Extract all fields and values from this form.
 Output ONLY valid JSON with field names as keys and filled values:
 {
   "field_name": "value"
     },
     DocumentType.HANDWRITTEN: {
+        "name": "Handwritten Note",
         "description": "Extract text from handwritten documents",
         "prompt": "Extract all handwritten text from this image. Output plain text, preserving line breaks.",
         "icon": "✍️"
 ) -> Image.Image:
     """
     Preprocess image with optional enhancements
     Args:
         image: PIL Image
         enhance_contrast: Apply CLAHE contrast enhancement
         denoise: Apply denoising
         sharpen: Apply sharpening
         auto_rotate: Attempt to auto-rotate text to horizontal
     Returns:
         Preprocessed PIL Image
     """
     if prevent_cropping and not auto_rotate:
         raise Exception(f"Auto-Rotate must be enabled when Prevent-Cropping is active")
+        # Convert to OpenCV format
     img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
 ) -> tuple[str, int]:
     """
     Extract text from image using GLM-OCR
     Args:
         image: PIL Image
         prompt: Extraction prompt
         max_tokens: Maximum tokens to generate
     Returns:
         Tuple of (extracted_text, processing_time_ms)
     """
     initial_sidebar_state="expanded"
 )
+# Initialize session state
+if 'should_process' not in st.session_state:
+    st.session_state.should_process = False
+if 'has_results' not in st.session_state:
+    st.session_state.has_results = False
+if 'output_text' not in st.session_state:
+    st.session_state.output_text = ""
+if 'processing_time' not in st.session_state:
+    st.session_state.processing_time = 0
+if 'doc_type' not in st.session_state:
+    st.session_state.doc_type = DocumentType.GENERAL
+if 'current_file' not in st.session_state:
+    st.session_state.current_file = None
 # Header
 st.title("🔍 Universal OCR Scanner")
 st.markdown("Extract text and structured data from **any document** - receipts, IDs, invoices, forms, and more!")
         auto_rotate = st.checkbox("Auto-Rotate", value=False,
                                   help="Automatically straighten tilted documents")
         prevent_cropping = st.checkbox("Prevent-Cropping", value=False,
+                                       help="Prevent cropping when rotate")
     st.markdown("---")
         )
         if uploaded_file is not None:
             image = Image.open(uploaded_file).convert("RGB")
+            # Clear previous results when new image uploaded
+            if 'current_file' not in st.session_state or st.session_state.current_file != uploaded_file.name:
+                st.session_state.current_file = uploaded_file.name
+                st.session_state.has_results = False
     with camera_tab:
         camera_picture = st.camera_input("Take a photo")
         if camera_picture is not None:
             image = Image.open(BytesIO(camera_picture.getvalue())).convert("RGB")
+            # Clear previous results when new photo taken
+            st.session_state.has_results = False
     # Show original image
     if image is not None:
             "Custom Extraction Prompt:",
             value=DOCUMENT_TEMPLATES[doc_type]['prompt'],
             height=200,
+            help="Customize how the OCR extracts data",
+            key="custom_prompt_text"
         )
     else:
         prompt = DOCUMENT_TEMPLATES[doc_type]['prompt']
     # Process button
     if image is not None:
+        if st.button(
+                "🚀 Extract Text",
+                type="primary",
+                width="content",
+                key="extract_button"
+        ):
+            # Trigger processing by setting session state
+            st.session_state.should_process = True
     else:
         st.info("👆 Upload or capture an image to begin")
+# Processing (only run when button is clicked)
+if image is not None and st.session_state.get('should_process', False):
+    # Clear the flag immediately to prevent re-processing on next rerun
+    st.session_state.should_process = False
+    with st.spinner("🔄 Processing document..."):
         try:
             # Preprocess image
             if enhance_contrast or denoise or sharpen or auto_rotate or prevent_cropping:
                 max_tokens=max_tokens
             )
+            # Store results in session state
+            st.session_state.output_text = output_text
+            st.session_state.processing_time = processing_time
+            st.session_state.doc_type = doc_type
+            st.session_state.preprocessed_image = preprocessed_image
+            st.session_state.has_results = True
+        except Exception as e:
+            st.error(f"❌ Error during extraction: {str(e)}")
+            import traceback
+            with st.expander("Show Error Details"):
+                st.code(traceback.format_exc())
+            st.session_state.has_results = False
+# Display results (separate from processing)
+if st.session_state.get('has_results', False):
+    output_text = st.session_state.output_text
+    processing_time = st.session_state.processing_time
+    doc_type = st.session_state.doc_type
+    preprocessed_image = st.session_state.get('preprocessed_image', image)
+    # Display success message
+    st.success(f"✅ Extraction complete! ({processing_time}ms)")
+    # Try to parse as JSON for structured documents
+    is_json = False
+    parsed_data = None
+    if doc_type in [DocumentType.ID_CARD, DocumentType.RECEIPT,
+                    DocumentType.INVOICE, DocumentType.BUSINESS_CARD,
+                    DocumentType.FORM]:
+        try:
+            # Clean JSON from markdown
+            clean_text = output_text
+            if "```json" in clean_text:
+                clean_text = clean_text.split("```json")[1].split("```")[0].strip()
+            elif "```" in clean_text:
+                clean_text = clean_text.split("```")[1].split("```")[0].strip()
+            # Truncate if too long (likely recursive)
+            if len(clean_text) > 50000:  # Reasonable JSON should be much smaller
+                st.warning("⚠️ Detected recursive JSON structure. Truncating...")
+                clean_text = clean_text[:50000]
+            parsed_data = json.loads(clean_text)
+            # Flatten recursive structures
+            def flatten_dict(d, max_depth=2, current_depth=0):
+                """Remove recursive nested structures"""
+                if current_depth >= max_depth:
+                    return {}
+                if not isinstance(d, dict):
+                    return d
+                flattened = {}
+                for key, value in d.items():
+                    if isinstance(value, dict):
+                        # Only keep first level of nesting
+                        if current_depth < max_depth - 1:
+                            flattened[key] = flatten_dict(value, max_depth, current_depth + 1)
+                        # Skip deeply nested structures
+                    elif isinstance(value, list):
+                        # Keep lists but limit depth
+                        flattened[key] = value
                     else:
+                        flattened[key] = value
+                return flattened
+            # Flatten the parsed data
+            parsed_data = flatten_dict(parsed_data, max_depth=2)
+            is_json = True
+        except json.JSONDecodeError:
+            is_json = False
+        except Exception as e:
+            st.warning(f"⚠️ JSON parsing issue: {str(e)}")
+            is_json = False
+    # Display based on type
+    st.markdown("---")
+    st.subheader("📄 Extracted Data")
+    if is_json and parsed_data:
+        # Structured data display
+        col_display, col_download = st.columns([2, 1])
+        with col_display:
+            # Format display based on document type
+            if doc_type == DocumentType.RECEIPT:
+                st.markdown("### 🧾 Receipt Details")
+                # Merchant info
+                if "merchant_name" in parsed_data:
+                    st.markdown(f"**Merchant:** {parsed_data['merchant_name']}")
+                if "date" in parsed_data:
+                    st.markdown(f"**Date:** {parsed_data['date']}")
+                if "time" in parsed_data:
+                    st.markdown(f"**Time:** {parsed_data['time']}")
+                # Items table
+                if "items" in parsed_data and parsed_data["items"]:
+                    st.markdown("**Items:**")
+                    items_df = pd.DataFrame(parsed_data["items"])
+                    st.dataframe(items_df, width="content", hide_index=True)
+                # Totals
+                st.markdown("---")
+                if "subtotal" in parsed_data:
+                    st.markdown(f"**Subtotal:** ${parsed_data['subtotal']:.2f}")
+                if "tax" in parsed_data:
+                    st.markdown(f"**Tax:** ${parsed_data['tax']:.2f}")
+                if "total" in parsed_data:
+                    st.markdown(f"**Total:** ${parsed_data['total']:.2f}")
+            elif doc_type == DocumentType.INVOICE:
+                st.markdown("### 📋 Invoice Details")
+                col_inv1, col_inv2 = st.columns(2)
+                with col_inv1:
+                    st.markdown("**Invoice Info:**")
+                    if "invoice_number" in parsed_data:
+                        st.text(f"Number: {parsed_data['invoice_number']}")
+                    if "date" in parsed_data:
+                        st.text(f"Date: {parsed_data['date']}")
+                    if "due_date" in parsed_data:
+                        st.text(f"Due: {parsed_data['due_date']}")
+                with col_inv2:
+                    if "vendor" in parsed_data:
+                        st.markdown("**Vendor:**")
+                        vendor = parsed_data["vendor"]
+                        if isinstance(vendor, dict):
+                            for k, v in vendor.items():
+                                if v:
+                                    st.text(f"{k.title()}: {v}")
+                # Line items
+                if "line_items" in parsed_data and parsed_data["line_items"]:
+                    st.markdown("**Line Items:**")
+                    items_df = pd.DataFrame(parsed_data["line_items"])
+                    st.dataframe(items_df, width="content", hide_index=True)
+                # Total
+                if "total" in parsed_data:
+                    st.markdown(f"### **Total: ${parsed_data['total']:.2f}**")
             else:
+                # Generic structured data display
+                for key, value in parsed_data.items():
+                    if isinstance(value, dict):
+                        st.markdown(f"**{key.replace('_', ' ').title()}:**")
+                        for k, v in value.items():
+                            st.text(f"  {k}: {v}")
+                    elif isinstance(value, list):
+                        st.markdown(f"**{key.replace('_', ' ').title()}:**")
+                        if value and isinstance(value[0], dict):
+                            df = pd.DataFrame(value)
+                            st.dataframe(df, width="content", hide_index=True)
+                        else:
+                            for item in value:
+                                st.text(f"  • {item}")
+                    else:
+                        st.markdown(f"**{key.replace('_', ' ').title()}:** {value}")
+        with col_download:
+            st.subheader("💾 Downloads")
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            # JSON download
+            json_str = json.dumps(parsed_data, ensure_ascii=False, indent=2)
+            st.download_button(
+                label="📄 JSON",
+                data=json_str,
+                file_name=f"{doc_type.value}_{timestamp}.json",
+                mime="application/json",
+                width="content"
+            )
+            # CSV download (flattened)
+            try:
+                # Flatten nested structures
+                flat_data = {}
+                for k, v in parsed_data.items():
+                    if isinstance(v, (dict, list)):
+                        flat_data[k] = json.dumps(v, ensure_ascii=False)
+                    else:
+                        flat_data[k] = v
+                df = pd.DataFrame([flat_data])
+                csv_buffer = StringIO()
+                df.to_csv(csv_buffer, index=False, encoding='utf-8')
                 st.download_button(
+                    label="📊 CSV",
+                    data=csv_buffer.getvalue(),
+                    file_name=f"{doc_type.value}_{timestamp}.csv",
+                    mime="text/csv",
+                    width="content"
                 )
+            except:
+                pass
+            # Raw text download
+            st.download_button(
+                label="📝 TXT",
+                data=output_text,
+                file_name=f"{doc_type.value}_{timestamp}.txt",
+                mime="text/plain",
+                width="content"
+            )
+        # Show raw JSON in expander
+        with st.expander("🔍 View Raw JSON"):
+            st.json(parsed_data)
+    else:
+        # Plain text display
+        st.text_area(
+            "Extracted Text:",
+            value=output_text,
+            height=400,
+            label_visibility="collapsed"
+        )
+        # Download
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        st.download_button(
+            label="💾 Download as TXT",
+            data=output_text,
+            file_name=f"extracted_text_{timestamp}.txt",
+            mime="text/plain"
+        )
 # Footer
 st.markdown("---")