Seth0330 committed on
Commit
1198aa3
·
verified ·
1 Parent(s): cd7e3ae

Update backend/app/main.py

Browse files
Files changed (1) hide show
  1. backend/app/main.py +31 -13
backend/app/main.py CHANGED
@@ -105,25 +105,43 @@ async def extract_document(
105
  confidence = float(extracted.get("confidence", 90))
106
  fields = extracted.get("fields", {})
107
 
108
- # Get full_text for text output (but keep fields empty for JSON/XML as requested)
109
  full_text = extracted.get("full_text", "")
110
  if full_text:
111
- # Add full_text to fields only for text display in frontend
112
- # This allows Text tab to show the extracted text
113
- fields["full_text"] = full_text
114
  full_text_words = len(str(full_text).split())
115
  print(f"[INFO] Full text extracted: {full_text_words} words")
116
 
117
- # Also check for pages array
118
- pages_data = extracted.get("pages", [])
119
- if pages_data and isinstance(pages_data, list):
120
- print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
121
- # Add pages to fields for frontend (needed for text display)
122
- fields["pages"] = pages_data
 
 
 
 
 
 
123
 
124
- # Count fields (excluding full_text and pages for JSON/XML count)
125
- # For now, we're only extracting text, so fields_extracted will be minimal
126
- fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages"]]) if isinstance(fields, dict) else 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
  print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")
129
 
 
105
  confidence = float(extracted.get("confidence", 90))
106
  fields = extracted.get("fields", {})
107
 
108
+ # Get full_text for text output
109
  full_text = extracted.get("full_text", "")
110
  if full_text:
 
 
 
111
  full_text_words = len(str(full_text).split())
112
  print(f"[INFO] Full text extracted: {full_text_words} words")
113
 
114
+ # Check if fields contain structured data (from table parsing)
115
+ # If fields is a dict with page_X keys, it's already structured
116
+ # If fields is empty or simple, add full_text and pages for text display
117
+ if not fields or (isinstance(fields, dict) and not any(k.startswith("page_") for k in fields.keys())):
118
+ if full_text:
119
+ fields["full_text"] = full_text
120
+
121
+ # Also check for pages array
122
+ pages_data = extracted.get("pages", [])
123
+ if pages_data and isinstance(pages_data, list):
124
+ print(f"[INFO] Extracted text from {len(pages_data)} page(s)")
125
+ fields["pages"] = pages_data
126
 
127
+ # Count fields - if structured data exists, count table rows + metadata
128
+ if isinstance(fields, dict):
129
+ # Check if it's structured page data
130
+ if any(k.startswith("page_") for k in fields.keys()):
131
+ # Count structured fields (metadata keys + table rows)
132
+ page_data = list(fields.values())[0] if len(fields) == 1 else fields
133
+ if isinstance(page_data, dict):
134
+ table_rows = page_data.get("table", [])
135
+ metadata_keys = len(page_data.get("metadata", {}))
136
+ fields_extracted = len(table_rows) + metadata_keys
137
+ print(f"[INFO] Structured data: {len(table_rows)} table rows, {metadata_keys} metadata fields")
138
+ else:
139
+ fields_extracted = len(fields)
140
+ else:
141
+ # Regular fields count (excluding full_text and pages)
142
+ fields_extracted = len([k for k in fields.keys() if k not in ["full_text", "pages"]])
143
+ else:
144
+ fields_extracted = 0
145
 
146
  print(f"[INFO] Final stats - confidence: {confidence}, fields_count: {fields_extracted}")
147