Spaces:
Sleeping
Sleeping
Update backend/app/main.py
Browse files
- backend/app/main.py +31 -13
backend/app/main.py
CHANGED
|
@@ -105,25 +105,43 @@ async def extract_document(
|
|
| 105 |
confidence = float(extracted.get("confidence", 90))
|
| 106 |
fields = extracted.get("fields", {})
|
| 107 |
|
| 108 |
-
# Get full_text for text output
|
| 109 |
full_text = extracted.get("full_text", "")
|
| 110 |
if full_text:
|
| 111 |
-
# Add full_text to fields only for text display in frontend
|
| 112 |
-
# This allows Text tab to show the extracted text
|
| 113 |
-
fields["full_text"] = full_text
|
| 114 |
full_text_words = len(str(full_text).split())
|
| 115 |
print(f"[INFO] Full text extracted: {full_text_words} words")
|
| 116 |
|
| 117 |
-
#
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
# Count fields
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")
|
| 129 |
|
|
|
|
| 105 |
confidence = float(extracted.get("confidence", 90))
|
| 106 |
fields = extracted.get("fields", {})
|
| 107 |
|
| 108 |
+
# Get full_text for text output
|
| 109 |
full_text = extracted.get("full_text", "")
|
| 110 |
if full_text:
|
|
|
|
|
|
|
|
|
|
| 111 |
full_text_words = len(str(full_text).split())
|
| 112 |
print(f"[INFO] Full text extracted: {full_text_words} words")
|
| 113 |
|
| 114 |
+
# Check if fields contain structured data (from table parsing)
|
| 115 |
+
# If fields is a dict with page_X keys, it's already structured
|
| 116 |
+
# If fields is empty or simple, add full_text and pages for text display
|
| 117 |
+
if not fields or (isinstance(fields, dict) and not any(k.startswith("page_") for k in fields.keys())):
|
| 118 |
+
if full_text:
|
| 119 |
+
fields["full_text"] = full_text
|
| 120 |
+
|
| 121 |
+
# Also check for pages array
|
| 122 |
+
pages_data = extracted.get("pages", [])
|
| 123 |
+
if pages_data and isinstance(pages_data, list):
|
| 124 |
+
print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
|
| 125 |
+
fields["pages"] = pages_data
|
| 126 |
|
| 127 |
+
# Count fields - if structured data exists, count table rows + metadata
|
| 128 |
+
if isinstance(fields, dict):
|
| 129 |
+
# Check if it's structured page data
|
| 130 |
+
if any(k.startswith("page_") for k in fields.keys()):
|
| 131 |
+
# Count structured fields (metadata keys + table rows)
|
| 132 |
+
page_data = list(fields.values())[0] if len(fields) == 1 else fields
|
| 133 |
+
if isinstance(page_data, dict):
|
| 134 |
+
table_rows = page_data.get("table", [])
|
| 135 |
+
metadata_keys = len(page_data.get("metadata", {}))
|
| 136 |
+
fields_extracted = len(table_rows) + metadata_keys
|
| 137 |
+
print(f"[INFO] Structured data: {len(table_rows)} table rows, {metadata_keys} metadata fields")
|
| 138 |
+
else:
|
| 139 |
+
fields_extracted = len(fields)
|
| 140 |
+
else:
|
| 141 |
+
# Regular fields count (excluding full_text and pages)
|
| 142 |
+
fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages"]])
|
| 143 |
+
else:
|
| 144 |
+
fields_extracted = 0
|
| 145 |
|
| 146 |
print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")
|
| 147 |
|