EZOFISAIOCR

Sleeping

App Files Files Community

Seth0330 committed on Dec 9, 2025

Commit

1198aa3

verified ·

1 Parent(s): cd7e3ae

Update backend/app/main.py

Browse files

Files changed (1) hide show

backend/app/main.py +31 -13

backend/app/main.py CHANGED Viewed

@@ -105,25 +105,43 @@ async def extract_document(
         confidence = float(extracted.get("confidence", 90))
         fields = extracted.get("fields", {})
-        # Get full_text for text output (but keep fields empty for JSON/XML as requested)
         full_text = extracted.get("full_text", "")
         if full_text:
-            # Add full_text to fields only for text display in frontend
-            # This allows Text tab to show the extracted text
-            fields["full_text"] = full_text
             full_text_words = len(str(full_text).split())
             print(f"[INFO] Full text extracted: {full_text_words} words")
-        # Also check for pages array
-        pages_data = extracted.get("pages", [])
-        if pages_data and isinstance(pages_data, list):
-            print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
-            # Add pages to fields for frontend (needed for text display)
-            fields["pages"] = pages_data
-        # Count fields (excluding full_text and pages for JSON/XML count)
-        # For now, we're only extracting text, so fields_extracted will be minimal
-        fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages"]]) if isinstance(fields, dict) else 0
         print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")

         confidence = float(extracted.get("confidence", 90))
         fields = extracted.get("fields", {})
+        # Get full_text for text output
         full_text = extracted.get("full_text", "")
         if full_text:
             full_text_words = len(str(full_text).split())
             print(f"[INFO] Full text extracted: {full_text_words} words")
+        # Check if fields contain structured data (from table parsing)
+        # If fields is a dict with page_X keys, it's already structured
+        # If fields is empty or simple, add full_text and pages for text display
+        if not fields or (isinstance(fields, dict) and not any(k.startswith("page_") for k in fields.keys())):
+            if full_text:
+                fields["full_text"] = full_text
+            # Also check for pages array
+            pages_data = extracted.get("pages", [])
+            if pages_data and isinstance(pages_data, list):
+                print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
+                fields["pages"] = pages_data
+        # Count fields - if structured data exists, count table rows + metadata
+        if isinstance(fields, dict):
+            # Check if it's structured page data
+            if any(k.startswith("page_") for k in fields.keys()):
+                # Count structured fields (metadata keys + table rows)
+                page_data = list(fields.values())[0] if len(fields) == 1 else fields
+                if isinstance(page_data, dict):
+                    table_rows = page_data.get("table", [])
+                    metadata_keys = len(page_data.get("metadata", {}))
+                    fields_extracted = len(table_rows) + metadata_keys
+                    print(f"[INFO] Structured data: {len(table_rows)} table rows, {metadata_keys} metadata fields")
+                else:
+                    fields_extracted = len(fields)
+            else:
+                # Regular fields count (excluding full_text and pages)
+                fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages"]])
+        else:
+            fields_extracted = 0
         print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")