Spaces:
Running
Running
Update frontend/src/components/ocr/ExtractionOutput.jsx
Browse files
frontend/src/components/ocr/ExtractionOutput.jsx
CHANGED
|
@@ -119,21 +119,55 @@ function prepareFieldsForOutput(fields, format = "json") {
|
|
| 119 |
|
| 120 |
const output = { ...fields };
|
| 121 |
|
| 122 |
-
// Remove full_text if pages array exists (to avoid duplication)
|
| 123 |
if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) {
|
| 124 |
delete output.full_text;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
}
|
| 126 |
|
| 127 |
-
// For JSON: restructure pages into separate top-level fields (page_1, page_2, etc.)
|
| 128 |
-
if (format === "json" && output.pages && Array.isArray(output.pages)) {
|
|
|
|
|
|
|
|
|
|
| 129 |
output.pages.forEach((page, idx) => {
|
| 130 |
const pageNum = page.page_number || idx + 1;
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
text: page.text || "",
|
| 133 |
-
fields: page.fields || {},
|
| 134 |
confidence: page.confidence || 0,
|
| 135 |
doc_type: page.doc_type || "other"
|
| 136 |
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
});
|
| 138 |
// Remove pages array - we now have page_1, page_2, etc. as separate fields
|
| 139 |
delete output.pages;
|
|
@@ -488,7 +522,7 @@ export default function ExtractionOutput({ hasFile, isProcessing, isComplete, ex
|
|
| 488 |
>
|
| 489 |
<Sparkles className="h-8 w-8 text-indigo-500" />
|
| 490 |
</motion.div>
|
| 491 |
-
<p className="text-slate-700 font-medium mb-1">
|
| 492 |
<p className="text-slate-400 text-sm">Analyzing document structure</p>
|
| 493 |
|
| 494 |
<div className="mt-6 flex items-center justify-center gap-1">
|
|
|
|
| 119 |
|
| 120 |
const output = { ...fields };
|
| 121 |
|
| 122 |
+
// Remove full_text from top-level if pages array exists (to avoid duplication)
|
| 123 |
if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) {
|
| 124 |
delete output.full_text;
|
| 125 |
+
|
| 126 |
+
// Clean up each page: remove full_text from page.fields (it duplicates page.text)
|
| 127 |
+
output.pages = output.pages.map(page => {
|
| 128 |
+
const cleanedPage = { ...page };
|
| 129 |
+
if (cleanedPage.fields && typeof cleanedPage.fields === "object") {
|
| 130 |
+
const cleanedFields = { ...cleanedPage.fields };
|
| 131 |
+
// Remove full_text from page fields (duplicates page.text)
|
| 132 |
+
delete cleanedFields.full_text;
|
| 133 |
+
cleanedPage.fields = cleanedFields;
|
| 134 |
+
}
|
| 135 |
+
return cleanedPage;
|
| 136 |
+
});
|
| 137 |
}
|
| 138 |
|
| 139 |
+
// For JSON and XML: restructure pages into separate top-level fields (page_1, page_2, etc.)
|
| 140 |
+
if ((format === "json" || format === "xml") && output.pages && Array.isArray(output.pages)) {
|
| 141 |
+
// Get top-level field keys (these are merged from all pages - avoid duplicating in page fields)
|
| 142 |
+
const topLevelKeys = new Set(Object.keys(output).filter(k => k !== "pages" && k !== "full_text"));
|
| 143 |
+
|
| 144 |
output.pages.forEach((page, idx) => {
|
| 145 |
const pageNum = page.page_number || idx + 1;
|
| 146 |
+
const pageFields = page.fields || {};
|
| 147 |
+
|
| 148 |
+
// Remove duplicate fields from page.fields:
|
| 149 |
+
// 1. Remove full_text (duplicates page.text)
|
| 150 |
+
// 2. Remove fields that match top-level fields (already shown at root)
|
| 151 |
+
const cleanedPageFields = {};
|
| 152 |
+
for (const [key, value] of Object.entries(pageFields)) {
|
| 153 |
+
// Skip full_text and fields that match top-level exactly
|
| 154 |
+
if (key !== "full_text" && (!topLevelKeys.has(key) || (value !== output[key]))) {
|
| 155 |
+
cleanedPageFields[key] = value;
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
const pageObj = {
|
| 160 |
text: page.text || "",
|
|
|
|
| 161 |
confidence: page.confidence || 0,
|
| 162 |
doc_type: page.doc_type || "other"
|
| 163 |
};
|
| 164 |
+
|
| 165 |
+
// Only add fields if there are unique page-specific fields
|
| 166 |
+
if (Object.keys(cleanedPageFields).length > 0) {
|
| 167 |
+
pageObj.fields = cleanedPageFields;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
output[`page_${pageNum}`] = pageObj;
|
| 171 |
});
|
| 172 |
// Remove pages array - we now have page_1, page_2, etc. as separate fields
|
| 173 |
delete output.pages;
|
|
|
|
| 522 |
>
|
| 523 |
<Sparkles className="h-8 w-8 text-indigo-500" />
|
| 524 |
</motion.div>
|
| 525 |
+
<p className="text-slate-700 font-medium mb-1">Extracting data...</p>
|
| 526 |
<p className="text-slate-400 text-sm">Analyzing document structure</p>
|
| 527 |
|
| 528 |
<div className="mt-6 flex items-center justify-center gap-1">
|