// EZOFISOCR/frontend/src/components/ocr/ExtractionOutput.jsx
// Seth
// update
// 8aefeb2
import React, { useState, useEffect, useRef } from "react";
import { motion, AnimatePresence } from "framer-motion";
import {
Code2,
Copy,
Check,
Braces,
FileCode2,
FileText,
Sparkles,
ChevronDown,
Upload,
} from "lucide-react";
import { Button } from "@/components/ui/button";
import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs";
import { cn } from "@/lib/utils";
// Helper function to convert pipe-separated tables to HTML tables
/**
 * Replaces runs of pipe-delimited lines with HTML `<table>` markup.
 *
 * A table starts on a line with at least two pipes that is not a pure
 * separator line and that yields at least two header cells. An optional
 * separator line (only `|`, `-`, `:` and whitespace) directly after the
 * header is skipped. Following lines containing a pipe become body rows
 * until a blank line or a line without a pipe is reached; separator lines
 * inside the table are dropped. Every other line is passed through as-is.
 *
 * @param {string} text - Raw text that may contain pipe tables.
 * @returns {string} The text with tables converted; falsy input is returned unchanged.
 */
function convertPipeTablesToHTML(text) {
  if (!text) return text;

  const DIVIDER = /^[\s|\-:]+$/;
  const isDivider = (candidate) => DIVIDER.test(candidate.trim());

  // Split a row on pipes, trim each cell, and drop one empty cell at each
  // edge (the artefact of a leading/trailing pipe).
  const toCells = (rawLine) => {
    const cells = rawLine.split('|').map((piece) => piece.trim());
    if (cells.length > 0 && !cells[0]) cells.shift();
    if (cells.length > 0 && !cells[cells.length - 1]) cells.pop();
    return cells;
  };

  // First row is the header, the rest are body rows.
  const toTableMarkup = ([headings, ...bodyRows]) => {
    const headMarkup = headings
      .map(
        (heading) =>
          `<th class="border border-gray-300 px-4 py-2 bg-gray-100 font-semibold text-left">${escapeHtml(heading)}</th>`
      )
      .join('');
    const bodyMarkup = bodyRows
      .map((row) => {
        const cellsMarkup = row
          .map((cell) => `<td class="border border-gray-300 px-4 py-2">${escapeHtml(cell || '')}</td>`)
          .join('');
        return `<tr>${cellsMarkup}</tr>\n`;
      })
      .join('');
    return (
      '<table class="border-collapse border border-gray-300 w-full my-4">\n<thead>\n<tr>' +
      headMarkup +
      '</tr>\n</thead>\n<tbody>\n' +
      bodyMarkup +
      '</tbody>\n</table>'
    );
  };

  const sourceLines = text.split('\n');
  const output = [];
  let cursor = 0;

  while (cursor < sourceLines.length) {
    const current = sourceLines[cursor];
    const looksLikeHeader =
      current.includes('|') && current.split('|').length >= 3 && !isDivider(current);
    const headings = looksLikeHeader ? toCells(current) : [];

    // Not a usable header row: emit the line untouched.
    if (headings.length < 2) {
      output.push(current);
      cursor += 1;
      continue;
    }

    const rows = [headings];
    let next = cursor + 1;

    // Optional separator line right under the header.
    if (next < sourceLines.length && isDivider(sourceLines[next])) {
      next += 1;
    }

    for (; next < sourceLines.length; next += 1) {
      const candidate = sourceLines[next];
      // A blank line or a line without any pipe ends the table.
      if (!candidate.trim() || !candidate.includes('|')) break;
      if (!isDivider(candidate)) rows.push(toCells(candidate));
    }

    output.push(toTableMarkup(rows));
    cursor = next;
  }

  return output.join('\n');
}
// Helper function to escape HTML
/**
 * Escapes text for use as HTML content by letting the DOM serialise it:
 * the value is assigned as `textContent` of a scratch element and read
 * back through `innerHTML`. Requires a `document` global.
 *
 * @param {string} text - Text to escape.
 * @returns {string} Escaped markup, or '' for falsy input.
 */
function escapeHtml(text) {
  return text
    ? Object.assign(document.createElement('div'), { textContent: text }).innerHTML
    : '';
}
// Helper function to convert markdown/HTML text to HTML
/**
 * Converts a small markdown subset (pipe tables, `#`–`###` headers, bold,
 * italic, links, LaTeX-style `$^x$` / `$_x$` super/subscripts, paragraphs
 * and line breaks) to an HTML string.
 *
 * NOTE(security): HTML already present in `text` is passed through
 * unchanged, so the result is only as trustworthy as the input. Link
 * targets are restricted to http(s)/mailto/tel or scheme-less URLs and have
 * double quotes escaped, but callers inserting the result via `innerHTML`
 * should still sanitise it if the text is untrusted.
 *
 * @param {string} text - Markdown/HTML source text.
 * @returns {string} HTML markup, or "" for falsy input.
 */
function renderMarkdownToHTML(text) {
  if (!text) return "";

  // FIRST: convert pipe-separated tables to HTML tables.
  let html = convertPipeTablesToHTML(text);

  // Convert LaTeX-style super/subscripts before tables are protected so the
  // conversion also applies inside table cells.
  html = html.replace(/\$\s*\^\s*\{([^}]+)\}\s*\$/g, '<sup>$1</sup>');
  html = html.replace(/\$\s*\^\s*([^\s$<>]+)\s*\$/g, '<sup>$1</sup>');
  html = html.replace(/\$\s*_\s*\{([^}]+)\}\s*\$/g, '<sub>$1</sub>');
  html = html.replace(/\$\s*_\s*([^\s$<>]+)\s*\$/g, '<sub>$1</sub>');

  // Swap every <table>…</table> block for a placeholder so the markdown
  // passes below cannot alter table markup.
  const htmlBlocks = [];
  html = html.replace(/<table[\s\S]*?<\/table>/gi, (match) => {
    htmlBlocks.push(match);
    return `__HTML_BLOCK_${htmlBlocks.length - 1}__`;
  });

  // Markdown headers (most specific first).
  html = html.replace(/^### (.*$)/gim, '<h3>$1</h3>');
  html = html.replace(/^## (.*$)/gim, '<h2>$1</h2>');
  html = html.replace(/^# (.*$)/gim, '<h1>$1</h1>');

  // Markdown bold, then italic.
  html = html.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>');
  html = html.replace(/\*(.*?)\*/g, '<em>$1</em>');

  // Markdown links. A target is accepted when it starts with an allowed
  // scheme, or when nothing before the first "/", "?" or "#" could form a
  // scheme (no ":" and no "&" that might be an encoded colon). Rejected
  // links are left as literal text.
  const allowedScheme = /^(?:https?:\/\/|mailto:|tel:)/i;
  html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, label, url) => {
    const schemeZone = url.split(/[\/?#]/, 1)[0];
    const isSafe = allowedScheme.test(url.trim()) || !/[:&]/.test(schemeZone);
    if (!isSafe) return match;
    const href = url.replace(/"/g, '&quot;');
    return `<a href="${href}" target="_blank" rel="noopener noreferrer">${label}</a>`;
  });

  // Paragraph / line-break handling for everything outside protected tables.
  const parts = html.split(/(__HTML_BLOCK_\d+__)/);
  const processedParts = parts.map((part) => {
    const placeholder = /^__HTML_BLOCK_(\d+)__$/.exec(part);
    if (placeholder) {
      // Restore the protected table block.
      return htmlBlocks[Number.parseInt(placeholder[1], 10)];
    }

    let processed = part;
    // Blank lines separate paragraphs.
    processed = processed.replace(/\n\n+/g, '</p><p>');
    // Single newlines become <br> unless they touch a tag boundary. The
    // following character is matched with a lookahead so it stays available
    // as the "preceding character" of the next newline; otherwise every
    // second break between one-character lines would be missed.
    processed = processed.replace(/([^\n>])\n(?=[^\n<])/g, '$1<br>');
    // Wrap in a paragraph when the fragment does not already start with a tag.
    if (processed.trim() && !processed.trim().startsWith('<')) {
      processed = '<p>' + processed + '</p>';
    }
    return processed;
  });
  html = processedParts.join('');

  // Second LaTeX pass restricted to table cells, applied after the tables
  // have been restored (same order as the first pass: sup/sub, braced/bare).
  const cellLatexRules = [
    [/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*\^\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi, 'sup'],
    [/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*\^\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi, 'sup'],
    [/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*_\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi, 'sub'],
    [/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*_\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi, 'sub'],
  ];
  for (const [pattern, tag] of cellLatexRules) {
    html = html.replace(
      pattern,
      (match, openTag, before, inner, after, closeTag) =>
        openTag + before + '<' + tag + '>' + inner + '</' + tag + '>' + after + closeTag
    );
  }

  // Clean up empty paragraphs.
  html = html.replace(/<p><\/p>/g, '');
  html = html.replace(/<p>\s*<br>\s*<\/p>/g, '');
  html = html.replace(/<p>\s*<\/p>/g, '');

  // Normalise the seams between tables and neighbouring blocks.
  html = html.replace(/(<\/table>)\s*(<h[1-3])/g, '$1</p><p>$2');
  html = html.replace(/(<\/h[1-3]>)\s*(<table)/g, '$1<p>$2');
  html = html.replace(/(<\/table>)\s*(<p>)/g, '$1$2');

  return html;
}
// Mock extracted data
// Sample invoice extraction result (document, vendor, invoice, line items,
// totals). No code visible in this file references it.
const mockData = {
document: {
type: "Invoice",
confidence: 0.98,
},
vendor: {
name: "Acme Corporation",
address: "123 Business Ave, Suite 400",
city: "San Francisco",
state: "CA",
zip: "94102",
phone: "+1 (555) 123-4567",
},
invoice: {
number: "INV-2024-0847",
date: "2024-01-15",
due_date: "2024-02-14",
po_number: "PO-9823",
},
items: [
{ description: "Professional Services", quantity: 40, unit_price: 150.0, total: 6000.0 },
{ description: "Software License", quantity: 5, unit_price: 299.99, total: 1499.95 },
{ description: "Support Package", quantity: 1, unit_price: 500.0, total: 500.0 },
],
totals: {
subtotal: 7999.95,
tax_rate: 0.0875,
tax_amount: 699.99,
total: 8699.94,
},
};
// Sample XML rendering of an invoice extraction (abbreviated: a single line
// item and no vendor phone / PO number). No code visible in this file
// references it.
const mockXML = `<?xml version="1.0" encoding="UTF-8"?>
<extraction>
<document type="Invoice" confidence="0.98"/>
<vendor>
<name>Acme Corporation</name>
<address>123 Business Ave, Suite 400</address>
<city>San Francisco</city>
<state>CA</state>
<zip>94102</zip>
</vendor>
<invoice>
<number>INV-2024-0847</number>
<date>2024-01-15</date>
<due_date>2024-02-14</due_date>
</invoice>
<items>
<item>
<description>Professional Services</description>
<quantity>40</quantity>
<total>6000.00</total>
</item>
</items>
<totals>
<subtotal>7999.95</subtotal>
<tax>699.99</tax>
<total>8699.94</total>
</totals>
</extraction>`;
// Sample plain-text rendering of an invoice. No code visible in this file
// references it.
const mockText = `INVOICE
ACME CORPORATION
123 Business Ave, Suite 400
San Francisco, CA 94102
Phone: +1 (555) 123-4567
Invoice Number: INV-2024-0847
Invoice Date: January 15, 2024
Due Date: February 14, 2024
PO Number: PO-9823
BILL TO:
Customer Name
456 Client Street
New York, NY 10001
ITEMS:
─────────────────────────────────────────────────────────
Description Qty Unit Price Total
─────────────────────────────────────────────────────────
Professional Services 40 $150.00 $6,000.00
Software License 5 $299.99 $1,499.95
Support Package 1 $500.00 $500.00
─────────────────────────────────────────────────────────
Subtotal: $7,999.95
Tax (8.75%): $699.99
─────────────────────────
TOTAL: $8,699.94
Payment Terms: Net 30
Thank you for your business!`;
// Prepare fields for JSON/XML output - remove duplicates and restructure
/**
 * Builds a display-ready copy of an extraction `fields` object.
 *
 * - A root-level `Fields` entry is moved to the front of the result (and
 *   omitted when it is missing, not an object, or empty).
 * - When a non-empty `pages` array exists, the root `full_text` and each
 *   page's `fields.full_text` are dropped as duplicates of the page text.
 * - For "json" and "xml", `pages` is replaced by `page_<n>` entries holding
 *   `text`, `confidence`, `doc_type`, optional non-empty `table` /
 *   `footer_notes`, and only those page fields that differ from the
 *   same-named root-level value.
 * - `Fields` and `metadata` are stripped from every `page_*` object.
 *
 * The input object and its nested page objects are never modified.
 *
 * @param {object} fields - Extraction fields.
 * @param {string} [format="json"] - Target format; "json" and "xml" trigger page flattening.
 * @returns {object} The restructured copy; non-object input is returned unchanged.
 */
function prepareFieldsForOutput(fields, format = "json") {
  if (!fields || typeof fields !== "object") {
    return fields;
  }

  const output = { ...fields };

  // Pull the root-level Fields out; it is re-inserted first at the end.
  const rootFields = output.Fields;
  delete output.Fields;

  // With per-page data present, full_text only duplicates the page texts.
  if (Array.isArray(output.pages) && output.pages.length > 0) {
    delete output.full_text;
    output.pages = output.pages.map((page) => {
      const cleanedPage = { ...page };
      if (cleanedPage.fields && typeof cleanedPage.fields === "object") {
        const cleanedFields = { ...cleanedPage.fields };
        delete cleanedFields.full_text;
        cleanedPage.fields = cleanedFields;
      }
      return cleanedPage;
    });
  }

  // For JSON and XML: flatten pages into top-level page_1, page_2, ... entries.
  if ((format === "json" || format === "xml") && Array.isArray(output.pages)) {
    // Root-level keys hold values merged from all pages; identical page
    // values are not repeated inside the page entry.
    const topLevelKeys = new Set(
      Object.keys(output).filter((k) => k !== "pages" && k !== "full_text" && k !== "Fields")
    );

    output.pages.forEach((page, idx) => {
      const pageNum = page.page_number || idx + 1;
      const pageFields = page.fields || {};

      const cleanedPageFields = {};
      for (const [key, value] of Object.entries(pageFields)) {
        const duplicatesRoot = topLevelKeys.has(key) && value === output[key];
        if (key !== "full_text" && !duplicatesRoot) {
          cleanedPageFields[key] = value;
        }
      }

      const pageObj = {
        text: page.text || "",
        confidence: page.confidence || 0,
        doc_type: page.doc_type || "other",
      };
      if (Array.isArray(page.table) && page.table.length > 0) {
        pageObj.table = page.table;
      }
      if (Array.isArray(page.footer_notes) && page.footer_notes.length > 0) {
        pageObj.footer_notes = page.footer_notes;
      }
      // Only add fields if there are page-specific ones left.
      if (Object.keys(cleanedPageFields).length > 0) {
        pageObj.fields = cleanedPageFields;
      }

      output[`page_${pageNum}`] = pageObj;
    });

    delete output.pages;
  }

  // Strip Fields/metadata from page_* objects. `output` is only a shallow
  // copy, so a page object may still be the caller's own object: replace it
  // with a trimmed copy instead of deleting properties in place.
  for (const pageKey of Object.keys(output)) {
    if (!pageKey.startsWith("page_")) continue;
    const pageData = output[pageKey];
    if (!pageData || typeof pageData !== "object") continue;
    if ("Fields" in pageData || "metadata" in pageData) {
      const { Fields: _fields, metadata: _metadata, ...trimmed } = pageData;
      output[pageKey] = trimmed;
    }
  }

  // Rebuild with Fields first (only if it exists and is not empty).
  const finalOutput = {};
  if (rootFields && typeof rootFields === "object" && Object.keys(rootFields).length > 0) {
    finalOutput.Fields = rootFields;
  }
  for (const key of Object.keys(output)) {
    finalOutput[key] = output[key];
  }
  return finalOutput;
}
/**
 * Serialises an extraction object to an XML string.
 *
 * The object is first normalised with `prepareFieldsForOutput(obj, "xml")`.
 * Object keys become element names verbatim (they are not validated as XML
 * names), arrays repeat the element once per item, and primitive values are
 * escaped with `escapeXML`. `null`/`undefined` values and array items are
 * skipped.
 *
 * @param {object} obj - Extraction fields to serialise.
 * @param {string} [rootName="extraction"] - Name of the root element.
 * @returns {string} XML document text.
 */
function objectToXML(obj, rootName = "extraction") {
  const preparedObj = prepareFieldsForOutput(obj, "xml");
  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<${rootName}>\n`;

  const convert = (node, indent = " ") => {
    for (const [key, value] of Object.entries(node)) {
      if (value === null || value === undefined) continue;
      // Skip full_text when a sibling pages array already carries the text.
      if (key === "full_text" && Array.isArray(node.pages) && node.pages.length > 0) {
        continue;
      }
      if (Array.isArray(value)) {
        for (const item of value) {
          // typeof null === "object": without this guard a null item would
          // reach Object.entries(null) and throw.
          if (item === null || item === undefined) continue;
          xml += `${indent}<${key}>\n`;
          if (typeof item === "object") {
            convert(item, indent + " ");
          } else {
            xml += `${indent} ${escapeXML(String(item))}\n`;
          }
          xml += `${indent}</${key}>\n`;
        }
      } else if (typeof value === "object") {
        xml += `${indent}<${key}>\n`;
        convert(value, indent + " ");
        xml += `${indent}</${key}>\n`;
      } else {
        xml += `${indent}<${key}>${escapeXML(String(value))}</${key}>\n`;
      }
    }
  };

  // A missing or non-object root yields an empty root element.
  if (preparedObj && typeof preparedObj === "object") {
    convert(preparedObj);
  }
  xml += `</${rootName}>`;
  return xml;
}
/**
 * Escapes the five XML special characters (& < > " ') in a string.
 *
 * @param {string} str - Text to escape.
 * @returns {string} Text safe for XML element content and attribute values.
 */
function escapeXML(str) {
  const entityFor = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };
  return str.replace(/[&<>"']/g, (symbol) => entityFor[symbol]);
}
// Helper function to extract text from page structure
/**
 * Returns the raw text carried by an extraction `fields` object.
 *
 * The `text` of every `page_*` entry is collected in key order and joined
 * with a blank line. When no page entry has text, `full_text` is used.
 *
 * @param {object} fields - Extraction fields.
 * @returns {string} The combined text, or "" when none is available.
 */
function extractTextFromFields(fields) {
  if (!fields || typeof fields !== "object") {
    return "";
  }

  // Preferred source: per-page text under page_1, page_2, ...
  const collected = [];
  for (const key of Object.keys(fields)) {
    if (!key.startsWith("page_")) continue;
    const entry = fields[key];
    const body = entry && entry.text ? entry.text : "";
    if (body) collected.push(body);
  }
  if (collected.length > 0) {
    return collected.join("\n\n");
  }

  // Fallback: the document-level full_text.
  return fields.full_text ? fields.full_text : "";
}
// Helper function to format fields as readable text
/**
 * Produces the plain-text view of an extraction `fields` object.
 *
 * When `extractTextFromFields` finds page text or `full_text`, that text is
 * returned as-is. Otherwise every field is listed as `key: value`, with
 * nested objects and arrays placed on following lines using one extra
 * space of indent per level.
 *
 * @param {object} fields - Extraction fields.
 * @returns {string} Readable text; "No data extracted." when there is nothing to show.
 */
function fieldsToText(fields) {
  if (!fields || typeof fields !== "object") {
    return "No data extracted.";
  }

  // Prefer the document text itself when it is available.
  const extractedText = extractTextFromFields(fields);
  if (extractedText) {
    return extractedText;
  }

  // Fallback: format all fields as an indented key/value listing.
  let text = "";
  const formatValue = (key, value, indent = "") => {
    if (Array.isArray(value)) {
      text += `${indent}${key}:\n`;
      value.forEach((item, idx) => {
        // typeof null === "object": null items are printed as plain values
        // rather than passed to Object.entries, which would throw.
        if (typeof item === "object" && item !== null) {
          text += `${indent} Item ${idx + 1}:\n`;
          Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " "));
        } else {
          text += `${indent} - ${item}\n`;
        }
      });
    } else if (typeof value === "object" && value !== null) {
      text += `${indent}${key}:\n`;
      Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " "));
    } else {
      text += `${indent}${key}: ${value}\n`;
    }
  };

  Object.entries(fields).forEach(([key, value]) => {
    formatValue(key, value);
    text += "\n";
  });

  return text.trim() || "No data extracted.";
}
export default function ExtractionOutput({ hasFile, isProcessing, isComplete, extractionResult, onNewUpload }) {
const [activeTab, setActiveTab] = useState("json");
const [copied, setCopied] = useState(false);
const [statusMessage, setStatusMessage] = useState("Preparing document...");
// Get fields from extraction result, default to empty object
const fields = extractionResult?.fields || {};
const confidence = extractionResult?.confidence || 0;
const fieldsExtracted = extractionResult?.fieldsExtracted || 0;
const totalTime = extractionResult?.totalTime || 0;
// Dynamic status messages that rotate during processing
const statusMessages = [
"Preparing document...",
"Converting pages to images...",
"Visual Reasoning...",
"Reading text from document...",
"Identifying document structure...",
"Extracting tables and data...",
"Analyzing content...",
"Processing pages...",
"Organizing extracted information...",
"Finalizing results...",
];
// Rotate status messages during processing
const messageIndexRef = useRef(0);
useEffect(() => {
if (!isProcessing) {
setStatusMessage("Analyzing document structure");
messageIndexRef.current = 0;
return;
}
setStatusMessage(statusMessages[0]);
messageIndexRef.current = 0;
const interval = setInterval(() => {
messageIndexRef.current = (messageIndexRef.current + 1) % statusMessages.length;
setStatusMessage(statusMessages[messageIndexRef.current]);
}, 2500); // Change message every 2.5 seconds
return () => clearInterval(interval);
}, [isProcessing]);
// Initialize expanded sections based on available fields
const [expandedSections, setExpandedSections] = useState(() =>
Object.keys(fields).slice(0, 5) // Expand first 5 sections by default
);
// Helper function to convert HTML to formatted plain text with layout preserved
const htmlToFormattedText = (html) => {
if (!html) return "";
// Create a temporary div to parse HTML
const tempDiv = document.createElement("div");
tempDiv.innerHTML = html;
let text = "";
// Process each element
const processNode = (node) => {
if (node.nodeType === Node.TEXT_NODE) {
return node.textContent;
}
if (node.nodeType !== Node.ELEMENT_NODE) {
return "";
}
const tagName = node.tagName?.toLowerCase();
const children = Array.from(node.childNodes);
switch (tagName) {
case "h1":
return "\n\n" + processChildren(children).trim() + "\n\n";
case "h2":
return "\n\n" + processChildren(children).trim() + "\n\n";
case "h3":
return "\n" + processChildren(children).trim() + "\n";
case "p":
return processChildren(children) + "\n\n";
case "br":
return "\n";
case "strong":
case "b":
return processChildren(children);
case "em":
case "i":
return processChildren(children);
case "sup":
return processChildren(children);
case "sub":
return processChildren(children);
case "table":
return "\n" + processTable(node) + "\n\n";
case "ul":
case "ol":
return "\n" + processList(node) + "\n\n";
case "li":
return " • " + processChildren(children).trim() + "\n";
default:
return processChildren(children);
}
};
const processChildren = (children) => {
return children.map(processNode).join("");
};
const processTable = (table) => {
let tableText = "";
const rows = table.querySelectorAll("tr");
if (rows.length === 0) return "";
// First pass: calculate column widths
const allRows = Array.from(rows);
const columnCount = Math.max(...allRows.map(row => row.querySelectorAll("td, th").length));
const columnWidths = new Array(columnCount).fill(0);
allRows.forEach(row => {
const cells = row.querySelectorAll("td, th");
cells.forEach((cell, colIndex) => {
const cellText = processChildren(Array.from(cell.childNodes)).trim().replace(/\s+/g, " ");
columnWidths[colIndex] = Math.max(columnWidths[colIndex] || 0, cellText.length, 10);
});
});
// Second pass: format rows
allRows.forEach((row, rowIndex) => {
const cells = row.querySelectorAll("td, th");
const cellTexts = Array.from(cells).map(cell => {
let cellContent = processChildren(Array.from(cell.childNodes)).trim();
cellContent = cellContent.replace(/\s+/g, " ");
return cellContent;
});
// Pad cells to column widths
const paddedCells = cellTexts.map((text, i) => {
const width = columnWidths[i] || 10;
return text.padEnd(width);
});
tableText += paddedCells.join(" | ") + "\n";
// Add separator after header row
if (rowIndex === 0 && row.querySelector("th")) {
tableText += columnWidths.map(w => "-".repeat(w)).join("-|-") + "\n";
}
});
return tableText;
};
const processList = (list) => {
const items = list.querySelectorAll("li");
return Array.from(items).map(item => {
return " • " + processChildren(Array.from(item.childNodes)).trim();
}).join("\n");
};
text = processChildren(Array.from(tempDiv.childNodes));
// Clean up extra newlines
text = text.replace(/\n{3,}/g, "\n\n");
text = text.trim();
return text;
};
const handleCopy = () => {
let content = "";
if (activeTab === "json") {
const preparedFields = prepareFieldsForOutput(fields, "json");
content = JSON.stringify(preparedFields, null, 2);
} else if (activeTab === "xml") {
content = objectToXML(fields);
} else {
// For text tab, get the formatted HTML and convert to plain text with layout
const textContent = extractTextFromFields(fields);
const htmlContent = renderMarkdownToHTML(textContent);
content = htmlToFormattedText(htmlContent);
}
navigator.clipboard.writeText(content);
setCopied(true);
setTimeout(() => setCopied(false), 2000);
};
// Get prepared fields for display
const preparedFields = React.useMemo(() => {
return prepareFieldsForOutput(fields, "json");
}, [fields]);
// Update expanded sections when fields change
React.useEffect(() => {
if (extractionResult?.fields) {
setExpandedSections(Object.keys(extractionResult.fields).slice(0, 5));
}
}, [extractionResult]);
const toggleSection = (section) => {
setExpandedSections((prev) =>
prev.includes(section) ? prev.filter((s) => s !== section) : [...prev, section]
);
};
const renderValue = (value) => {
if (typeof value === "number") {
return <span className="text-amber-600">{value}</span>;
}
if (typeof value === "string") {
return <span className="text-emerald-600">"{value}"</span>;
}
return String(value);
};
const renderSection = (key, value, level = 0) => {
const isExpanded = expandedSections.includes(key);
const isObject = typeof value === "object" && value !== null;
const isArray = Array.isArray(value);
if (!isObject) {
return (
<div
key={key}
className="flex items-start gap-2 py-1"
style={{ paddingLeft: level * 16 }}
>
<span className="text-violet-500">"{key}"</span>
<span className="text-slate-400">:</span>
{renderValue(value)}
</div>
);
}
return (
<div key={key}>
<button
onClick={() => toggleSection(key)}
className="flex items-center gap-2 py-1 hover:bg-slate-50 w-full text-left rounded"
style={{ paddingLeft: level * 16 }}
>
<ChevronDown
className={cn(
"h-3 w-3 text-slate-400 transition-transform",
!isExpanded && "-rotate-90"
)}
/>
<span className="text-violet-500">"{key}"</span>
<span className="text-slate-400">:</span>
<span className="text-slate-400">{isArray ? "[" : "{"}</span>
{!isExpanded && (
<span className="text-slate-300 text-xs">
{isArray ? `${value.length} items` : `${Object.keys(value).length} fields`}
</span>
)}
</button>
<AnimatePresence>
{isExpanded && (
<motion.div
initial={{ height: 0, opacity: 0 }}
animate={{ height: "auto", opacity: 1 }}
exit={{ height: 0, opacity: 0 }}
transition={{ duration: 0.2 }}
className="overflow-hidden"
>
{isArray ? (
value.map((item, idx) => (
<div key={idx} className="border-l border-slate-100 ml-4">
{Object.entries(item).map(([k, v]) => renderSection(k, v, level + 2))}
{idx < value.length - 1 && <div className="h-2" />}
</div>
))
) : (
Object.entries(value).map(([k, v]) => renderSection(k, v, level + 1))
)}
<div style={{ paddingLeft: level * 16 }} className="text-slate-400">
{isArray ? "]" : "}"}
</div>
</motion.div>
)}
</AnimatePresence>
</div>
);
};
return (
<div className="h-full flex flex-col bg-white rounded-2xl border border-slate-200 overflow-hidden">
{/* Header */}
<div className="flex items-center justify-between px-5 py-4 border-b border-slate-100">
<div className="flex items-center gap-3">
<div className="h-8 w-8 rounded-lg bg-emerald-50 flex items-center justify-center">
<Code2 className="h-4 w-4 text-emerald-600" />
</div>
<div>
<h3 className="font-semibold text-slate-800 text-sm">Extracted Data</h3>
<p className="text-xs text-slate-400">
{isComplete
? `${fieldsExtracted} field${fieldsExtracted !== 1 ? 's' : ''} extracted`
: "Waiting for extraction"}
</p>
</div>
{isComplete && onNewUpload && (
<Button
variant="ghost"
size="sm"
onClick={onNewUpload}
className="h-8 ml-auto text-xs gap-1.5 text-indigo-600 hover:text-indigo-700 hover:bg-indigo-50"
title="Upload new document"
>
<Upload className="h-3.5 w-3.5" />
New
</Button>
)}
</div>
{isComplete && (
<div className="flex items-center gap-2">
<Tabs value={activeTab} onValueChange={setActiveTab}>
<TabsList className="h-8 bg-slate-100 p-0.5">
<TabsTrigger value="text" className="h-7 text-xs gap-1.5">
<FileText className="h-3 w-3" />
Text
</TabsTrigger>
<TabsTrigger value="json" className="h-7 text-xs gap-1.5">
<Braces className="h-3 w-3" />
JSON
</TabsTrigger>
<TabsTrigger value="xml" className="h-7 text-xs gap-1.5">
<FileCode2 className="h-3 w-3" />
XML
</TabsTrigger>
</TabsList>
</Tabs>
<Button
variant="ghost"
size="sm"
onClick={handleCopy}
className="h-8 text-xs gap-1.5"
>
{copied ? (
<>
<Check className="h-3 w-3 text-emerald-500" />
Copied
</>
) : (
<>
<Copy className="h-3 w-3" />
Copy
</>
)}
</Button>
</div>
)}
</div>
{/* Output Area */}
<div className="flex-1 overflow-auto">
{!hasFile ? (
<div className="h-full flex items-center justify-center p-6">
<div className="text-center">
<div className="h-20 w-20 mx-auto rounded-2xl bg-slate-100 flex items-center justify-center mb-4">
<Code2 className="h-10 w-10 text-slate-300" />
</div>
<p className="text-slate-400 text-sm">Extracted data will appear here</p>
</div>
</div>
) : isProcessing ? (
<div className="h-full flex items-center justify-center p-6">
<div className="text-center">
<motion.div
animate={{ rotate: 360 }}
transition={{ duration: 2, repeat: Infinity, ease: "linear" }}
className="h-16 w-16 mx-auto rounded-2xl bg-gradient-to-br from-indigo-100 to-violet-100 flex items-center justify-center mb-4"
>
<Sparkles className="h-8 w-8 text-indigo-500" />
</motion.div>
<p className="text-slate-700 font-medium mb-1">Extracting data...</p>
<p className="text-slate-400 text-sm">{statusMessage}</p>
<div className="mt-6 flex items-center justify-center gap-1">
{[0, 1, 2].map((i) => (
<motion.div
key={i}
animate={{ scale: [1, 1.2, 1] }}
transition={{
duration: 0.6,
repeat: Infinity,
delay: i * 0.2,
}}
className="h-2 w-2 rounded-full bg-indigo-400"
/>
))}
</div>
</div>
</div>
) : isComplete && Object.keys(fields).length === 0 ? (
<div className="h-full flex items-center justify-center p-6">
<div className="text-center">
<div className="h-20 w-20 mx-auto rounded-2xl bg-amber-100 flex items-center justify-center mb-4">
<Code2 className="h-10 w-10 text-amber-600" />
</div>
<p className="text-slate-600 font-medium mb-1">No data extracted</p>
<p className="text-slate-400 text-sm">The document may not contain extractable fields</p>
</div>
</div>
) : (
<div className="p-4 font-mono text-sm">
{activeTab === "text" ? (
<div
className="text-sm text-slate-700 leading-relaxed"
style={{
fontFamily: 'system-ui, -apple-system, sans-serif'
}}
>
<div
className="markdown-content"
dangerouslySetInnerHTML={{ __html: renderMarkdownToHTML(fieldsToText(fields)) }}
style={{
lineHeight: '1.6'
}}
/>
<style>{`
.markdown-content h1 {
font-size: 1.5rem;
font-weight: 700;
color: #0f172a;
margin-top: 1.5rem;
margin-bottom: 1rem;
line-height: 1.3;
}
.markdown-content h2 {
font-size: 1.25rem;
font-weight: 600;
color: #0f172a;
margin-top: 1.25rem;
margin-bottom: 0.75rem;
line-height: 1.3;
}
.markdown-content h3 {
font-size: 1.125rem;
font-weight: 600;
color: #1e293b;
margin-top: 1rem;
margin-bottom: 0.5rem;
line-height: 1.3;
}
.markdown-content p {
margin-top: 0.75rem;
margin-bottom: 0.75rem;
color: #334155;
}
.markdown-content table {
width: 100%;
border-collapse: collapse;
margin: 1.5rem 0;
font-size: 0.875rem;
box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1);
}
.markdown-content table caption {
font-weight: 600;
margin-bottom: 0.5rem;
text-align: left;
}
.markdown-content table th {
background-color: #f8fafc;
border: 1px solid #cbd5e1;
padding: 0.75rem;
text-align: left;
font-weight: 600;
color: #0f172a;
}
.markdown-content table td {
border: 1px solid #cbd5e1;
padding: 0.75rem;
color: #334155;
}
.markdown-content table tr:nth-child(even) {
background-color: #f8fafc;
}
.markdown-content table tr:hover {
background-color: #f1f5f9;
}
.markdown-content strong {
font-weight: 600;
color: #0f172a;
}
.markdown-content em {
font-style: italic;
}
.markdown-content a {
color: #4f46e5;
text-decoration: underline;
}
.markdown-content a:hover {
color: #4338ca;
}
.markdown-content sup {
font-size: 0.75em;
vertical-align: super;
line-height: 0;
position: relative;
top: -0.5em;
}
.markdown-content sub {
font-size: 0.75em;
vertical-align: sub;
line-height: 0;
position: relative;
bottom: -0.25em;
}
.markdown-content ul, .markdown-content ol {
margin: 0.75rem 0;
padding-left: 1.5rem;
}
.markdown-content li {
margin: 0.25rem 0;
}
`}</style>
</div>
) : activeTab === "json" ? (
<div className="space-y-1">
<span className="text-slate-400">{"{"}</span>
{Object.keys(preparedFields).length > 0 ? (
Object.entries(preparedFields).map(([key, value]) =>
renderSection(key, value, 1)
)
) : (
<div className="pl-4 text-slate-400 italic">No fields extracted</div>
)}
<span className="text-slate-400">{"}"}</span>
</div>
) : (
<pre className="text-sm text-slate-600 whitespace-pre-wrap">
{objectToXML(fields).split("\n").map((line, i) => (
<div key={i} className="hover:bg-slate-50 px-2 -mx-2 rounded">
{line.includes("<") ? (
<>
{line.split(/(<\/?[\w\s=".-]+>)/g).map((part, j) => {
if (part.startsWith("</")) {
return (
<span key={j} className="text-rose-500">
{part}
</span>
);
}
if (part.startsWith("<")) {
return (
<span key={j} className="text-indigo-500">
{part}
</span>
);
}
return (
<span key={j} className="text-slate-700">
{part}
</span>
);
})}
</>
) : (
line
)}
</div>
))}
</pre>
)}
</div>
)}
</div>
{/* Confidence Footer */}
{isComplete && extractionResult && (
<div className="px-5 py-3 border-t border-slate-100 bg-slate-50/50">
<div className="flex items-center justify-between text-xs">
<div className="flex items-center gap-4">
<div className="flex items-center gap-1.5">
<div className={cn(
"h-2 w-2 rounded-full",
confidence >= 90 ? "bg-emerald-500" : confidence >= 70 ? "bg-amber-500" : "bg-red-500"
)} />
<span className="text-slate-500">Confidence:</span>
<span className="font-semibold text-slate-700">
{confidence > 0 ? `${confidence.toFixed(1)}%` : "N/A"}
</span>
</div>
<div className="flex items-center gap-1.5">
<span className="text-slate-500">Fields:</span>
<span className="font-semibold text-slate-700">{fieldsExtracted}</span>
</div>
</div>
<span className="text-slate-400">
Processed in {totalTime >= 1000 ? `${(totalTime / 1000).toFixed(1)}s` : `${totalTime}ms`}
</span>
</div>
</div>
)}
</div>
);
}