';
result.push(htmlTable);
i = j;
continue;
}
}
}
}
// Not a table row, add as-is
result.push(line);
i++;
}
return result.join('\n');
}
/**
 * Escape a string for inclusion in HTML by letting the DOM serialize it.
 *
 * The value is assigned as the text content of a scratch <div> and read back
 * through innerHTML, so the browser performs the escaping.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} The escaped text, or '' when `text` is falsy.
 */
function escapeHtml(text) {
  if (text) {
    const scratch = document.createElement('div');
    scratch.textContent = text;
    return scratch.innerHTML;
  }
  return '';
}
// Helper function to convert markdown/HTML text to HTML.
//
// Processing order (each phase is marked by a comment in the body):
//   1. Pipe-separated tables are converted first via convertPipeTablesToHTML.
//   2. LaTeX-style superscript/subscript notation ($^{..}$, $^x$, $_{..}$, $_x$) is rewritten.
//   3. Existing HTML table blocks are swapped for __HTML_BLOCK_n__ placeholders
//      (stored in htmlBlocks) so the markdown passes do not touch them.
//   4. Markdown headers (#, ##, ###), bold/italic and [text](url) links are rewritten.
//   5. The text is split on the placeholders: placeholder parts are restored from
//      htmlBlocks, every other part gets paragraph / line-break handling.
//   6. A second LaTeX pass targets td/th cells, then empty paragraphs and the
//      spacing after </table> and </h1>-</h3> are cleaned up.
//
// Returns "" for falsy input, otherwise the transformed string.
// NOTE(review): this function does not call escapeHtml on `text` itself, so whether the
// result is safe to inject depends on convertPipeTablesToHTML and the caller — confirm
// before rendering untrusted input.
function renderMarkdownToHTML(text) {
if (!text) return "";
let html = text;
// FIRST: Convert pipe-separated tables to HTML tables
html = convertPipeTablesToHTML(html);
// Convert LaTeX-style superscripts/subscripts FIRST (before protecting tables)
// This ensures they're converted everywhere, including inside tables
// Convert LaTeX-style superscripts: $^{text}$ or $^text$ to text
html = html.replace(/\$\s*\^\s*\{([^}]+)\}\s*\$/g, '$1');
html = html.replace(/\$\s*\^\s*([^\s$<>]+)\s*\$/g, '$1');
// Convert LaTeX-style subscripts: $_{text}$ or $_text$ to text
html = html.replace(/\$\s*_\s*\{([^}]+)\}\s*\$/g, '$1');
html = html.replace(/\$\s*_\s*([^\s$<>]+)\s*\$/g, '$1');
// Split by HTML tags to preserve existing HTML (like tables)
// Process markdown only in non-HTML sections
// First, protect existing HTML blocks (tables, etc.)
const htmlBlocks = [];
let htmlBlockIndex = 0;
// Extract and protect HTML table blocks
html = html.replace(/
/gi, (match) => {
const placeholder = `__HTML_BLOCK_${htmlBlockIndex}__`;
htmlBlocks[htmlBlockIndex] = match;
htmlBlockIndex++;
return placeholder;
});
// Convert markdown headers (only if not inside HTML)
html = html.replace(/^### (.*$)/gim, '
$1
');
html = html.replace(/^## (.*$)/gim, '
$1
');
html = html.replace(/^# (.*$)/gim, '
$1
');
// Convert markdown bold/italic (but not inside HTML tags)
html = html.replace(/\*\*(.*?)\*\*/g, '$1');
html = html.replace(/\*(.*?)\*/g, '$1');
// Convert markdown links
html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '$1');
// Convert line breaks to paragraphs (but preserve structure around HTML blocks)
const parts = html.split(/(__HTML_BLOCK_\d+__)/);
const processedParts = parts.map((part, index) => {
if (part.match(/^__HTML_BLOCK_\d+__$/)) {
// Restore HTML block
const blockIndex = parseInt(part.match(/\d+/)[0]);
return htmlBlocks[blockIndex];
} else {
// Process markdown in this part
let processed = part;
// Convert double line breaks to paragraph breaks
processed = processed.replace(/\n\n+/g, '
');
// Convert single line breaks to (but not if already in a tag)
processed = processed.replace(/([^\n>])\n([^\n<])/g, '$1 $2');
// Wrap in paragraph if there's content
if (processed.trim() && !processed.trim().startsWith('<')) {
processed = '
' + processed + '
';
}
return processed;
}
});
html = processedParts.join('');
// Process LaTeX notation in restored HTML blocks (tables) as well
// This handles any LaTeX that might be in table cells
html = html.replace(/(
]*>|
]*>)([^<]*)\$\s*\^\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi,
(match, openTag, before, supText, after, closeTag) => {
return openTag + before + '' + supText + '' + after + closeTag;
});
html = html.replace(/(
]*>|
]*>)([^<]*)\$\s*\^\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi,
(match, openTag, before, supText, after, closeTag) => {
return openTag + before + '' + supText + '' + after + closeTag;
});
html = html.replace(/(
]*>|
]*>)([^<]*)\$\s*_\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi,
(match, openTag, before, subText, after, closeTag) => {
return openTag + before + '' + subText + '' + after + closeTag;
});
html = html.replace(/(
]*>|
]*>)([^<]*)\$\s*_\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi,
(match, openTag, before, subText, after, closeTag) => {
return openTag + before + '' + subText + '' + after + closeTag;
});
// Clean up empty paragraphs and fix paragraph structure
html = html.replace(/
<\/p>/g, '');
html = html.replace(/
\s* \s*<\/p>/g, '');
html = html.replace(/
\s*<\/p>/g, '');
// Ensure proper spacing around HTML blocks
html = html.replace(/(<\/table>)\s*(
$2');
html = html.replace(/(<\/h[1-3]>)\s*(
$2');
html = html.replace(/(<\/table>)\s*(
)/g, '$1$2');
return html;
}
// Mock extracted data: a sample invoice in the structured (object) form.
// The values line up with the mockText sample below (same vendor, invoice number,
// line items and totals).
const mockData = {
document: {
type: "Invoice",
// presumably a 0-1 score — TODO confirm scale against the backend
confidence: 0.98,
},
vendor: {
name: "Acme Corporation",
address: "123 Business Ave, Suite 400",
city: "San Francisco",
state: "CA",
zip: "94102",
phone: "+1 (555) 123-4567",
},
invoice: {
number: "INV-2024-0847",
date: "2024-01-15",
due_date: "2024-02-14",
po_number: "PO-9823",
},
items: [
{ description: "Professional Services", quantity: 40, unit_price: 150.0, total: 6000.0 },
{ description: "Software License", quantity: 5, unit_price: 299.99, total: 1499.95 },
{ description: "Support Package", quantity: 1, unit_price: 500.0, total: 500.0 },
],
totals: {
subtotal: 7999.95,
// Stored as a fraction: 0.0875 corresponds to the "8.75%" shown in mockText
tax_rate: 0.0875,
tax_amount: 699.99,
total: 8699.94,
},
};
// Mock XML-format string for the same sample invoice (template literal, so line
// breaks inside it are part of the value).
const mockXML = `
Acme Corporation
123 Business Ave, Suite 400
San FranciscoCA94102INV-2024-08472024-01-152024-02-14Professional Services406000.007999.95699.998699.94`;
// Mock plain-text rendering of the same sample invoice (template literal, so the
// line breaks and rule lines inside it are part of the value).
const mockText = `INVOICE
ACME CORPORATION
123 Business Ave, Suite 400
San Francisco, CA 94102
Phone: +1 (555) 123-4567
Invoice Number: INV-2024-0847
Invoice Date: January 15, 2024
Due Date: February 14, 2024
PO Number: PO-9823
BILL TO:
Customer Name
456 Client Street
New York, NY 10001
ITEMS:
─────────────────────────────────────────────────────────
Description Qty Unit Price Total
─────────────────────────────────────────────────────────
Professional Services 40 $150.00 $6,000.00
Software License 5 $299.99 $1,499.95
Support Package 1 $500.00 $500.00
─────────────────────────────────────────────────────────
Subtotal: $7,999.95
Tax (8.75%): $699.99
─────────────────────────
TOTAL: $8,699.94
Payment Terms: Net 30
Thank you for your business!`;
/**
 * Prepare extracted fields for JSON/XML output: remove duplicated data and
 * restructure per-page results.
 *
 * What it does, in order:
 *  1. Pulls the root-level `Fields` entry aside so it can be emitted first.
 *  2. When a non-empty `pages` array exists, drops the top-level `full_text`
 *     and each page's `fields.full_text` (both duplicate the page text).
 *  3. For the "json" and "xml" formats, replaces the `pages` array with
 *     `page_<n>` entries holding `text`, `confidence`, `doc_type`, and, only
 *     when non-empty, `table`, `footer_notes` and the page-specific `fields`.
 *  4. Strips `Fields` and `metadata` from every `page_*` entry.
 *  5. Returns a new object with `Fields` (if non-empty) as the first key.
 *
 * Neither `fields` nor any object nested in it is mutated; every object that
 * needs a key removed is shallow-copied first.
 *
 * @param {object} fields - Extraction result fields. Non-object values are
 *   returned unchanged.
 * @param {string} [format="json"] - Only "json" and "xml" trigger the
 *   pages -> page_<n> restructuring.
 * @returns {object} A new, cleaned object (or `fields` itself when it is not
 *   an object).
 */
function prepareFieldsForOutput(fields, format = "json") {
  if (!fields || typeof fields !== "object") {
    return fields;
  }

  // Shallow copy without `Fields`; it is re-added as the first key at the end.
  const { Fields: rootFields, ...output } = fields;

  // With per-page data available, full_text is redundant at both levels.
  if (Array.isArray(output.pages) && output.pages.length > 0) {
    delete output.full_text;
    output.pages = output.pages.map((page) => {
      const cleanedPage = { ...page };
      if (cleanedPage.fields && typeof cleanedPage.fields === "object") {
        const cleanedFields = { ...cleanedPage.fields };
        delete cleanedFields.full_text;
        cleanedPage.fields = cleanedFields;
      }
      return cleanedPage;
    });
  }

  // For JSON and XML: expose each page as its own top-level page_<n> entry.
  const restructure = format === "json" || format === "xml";
  if (restructure && Array.isArray(output.pages)) {
    // Top-level keys are merged from all pages; a page field that repeats one
    // of them with the identical value is not shown again under the page.
    const topLevelKeys = new Set(
      Object.keys(output).filter((k) => k !== "pages" && k !== "full_text" && k !== "Fields")
    );

    output.pages.forEach((page, idx) => {
      const pageNum = page.page_number || idx + 1;
      const pageFields = page.fields || {};

      const cleanedPageFields = {};
      for (const [key, value] of Object.entries(pageFields)) {
        const duplicatesTopLevel = topLevelKeys.has(key) && value === output[key];
        if (key !== "full_text" && !duplicatesTopLevel) {
          cleanedPageFields[key] = value;
        }
      }

      const pageObj = {
        text: page.text || "",
        confidence: page.confidence || 0,
        doc_type: page.doc_type || "other",
      };
      if (Array.isArray(page.table) && page.table.length > 0) {
        pageObj.table = page.table;
      }
      if (Array.isArray(page.footer_notes) && page.footer_notes.length > 0) {
        pageObj.footer_notes = page.footer_notes;
      }
      // Only add fields if there are unique page-specific fields.
      if (Object.keys(cleanedPageFields).length > 0) {
        pageObj.fields = cleanedPageFields;
      }
      output[`page_${pageNum}`] = pageObj;
    });

    // The page_<n> entries replace the pages array.
    delete output.pages;
  }

  // page_X entries (built above or delivered by the backend) must not carry
  // their own Fields/metadata. Copy before deleting: these objects may still
  // be referenced by the caller's data, which must stay untouched.
  for (const pageKey of Object.keys(output)) {
    if (!pageKey.startsWith("page_")) continue;
    const pageData = output[pageKey];
    if (!pageData || typeof pageData !== "object" || Array.isArray(pageData)) continue;
    const cleanedPage = { ...pageData };
    delete cleanedPage.Fields;
    delete cleanedPage.metadata;
    output[pageKey] = cleanedPage;
  }

  // Rebuild with Fields first (only if it exists and is not empty).
  const hasRootFields =
    rootFields && typeof rootFields === "object" && Object.keys(rootFields).length > 0;
  return hasRootFields ? { Fields: rootFields, ...output } : { ...output };
}
/**
 * Serialize extraction fields to an XML document string.
 *
 * The input is first cleaned with prepareFieldsForOutput(obj, "xml"). Then:
 *  - null / undefined values (and null / undefined array items) are skipped;
 *  - each array item is emitted as its own element named after the array key;
 *  - nested objects are emitted recursively with one extra space of indent;
 *  - primitive values are escaped with escapeXML;
 *  - `full_text` is skipped inside any object that also has a non-empty
 *    `pages` array.
 *
 * Element names are derived from object keys. Characters that are not valid
 * in an XML name (e.g. spaces) are replaced with "_", and a leading "_" is
 * added when the key does not start with a letter or underscore, so arbitrary
 * field names still yield well-formed XML.
 *
 * @param {object} obj - Extraction fields.
 * @param {string} [rootName="extraction"] - Name of the root element.
 * @returns {string} XML text starting with an XML declaration.
 */
function objectToXML(obj, rootName = "extraction") {
  const preparedObj = prepareFieldsForOutput(obj, "xml");

  // Turn an arbitrary key into a usable XML element name.
  const toXMLName = (key) => {
    const cleaned = String(key).replace(/[^\p{L}\p{N}_.-]/gu, "_");
    return /^[\p{L}_]/u.test(cleaned) ? cleaned : `_${cleaned}`;
  };

  const rootTag = toXMLName(rootName);
  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<${rootTag}>\n`;

  const convert = (node, indent = " ") => {
    const hasPages = Array.isArray(node.pages) && node.pages.length > 0;

    for (const [key, value] of Object.entries(node)) {
      if (value === null || value === undefined) continue;
      // full_text duplicates the per-page text when pages are present.
      if (key === "full_text" && hasPages) continue;

      const tag = toXMLName(key);

      if (Array.isArray(value)) {
        for (const item of value) {
          // typeof null === "object", so null items must be filtered out
          // before recursing into Object.entries.
          if (item === null || item === undefined) continue;
          xml += `${indent}<${tag}>\n`;
          if (typeof item === "object") {
            convert(item, indent + " ");
          } else {
            xml += `${indent} ${escapeXML(String(item))}\n`;
          }
          xml += `${indent}</${tag}>\n`;
        }
      } else if (typeof value === "object") {
        xml += `${indent}<${tag}>\n`;
        convert(value, indent + " ");
        xml += `${indent}</${tag}>\n`;
      } else {
        xml += `${indent}<${tag}>${escapeXML(String(value))}</${tag}>\n`;
      }
    }
  };

  // A non-object input yields an empty root element instead of throwing.
  if (preparedObj && typeof preparedObj === "object") {
    convert(preparedObj);
  }

  xml += `</${rootTag}>`;
  return xml;
}
/**
 * Escape the five characters that have predefined entities in XML so the
 * value can be used as element text or inside an attribute.
 *
 * `&` is replaced first; otherwise the ampersands introduced by the other
 * replacements would be escaped a second time.
 *
 * @param {string} str - Raw text (non-strings are converted with String()).
 * @returns {string} The escaped text.
 */
function escapeXML(str) {
  return String(str)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
/**
 * Pull the plain text out of an extraction result.
 *
 * Text is taken from every `page_*` entry (in key order) that has a truthy
 * `text`, joined with a blank line. If no page yields text, `full_text` is
 * used instead.
 *
 * @param {object} fields - Extraction fields.
 * @returns {string} The combined text, or "" when nothing is available or
 *   `fields` is not an object.
 */
function extractTextFromFields(fields) {
  const isUsable = fields && typeof fields === "object";
  if (!isUsable) return "";

  // Preferred source: the page_X structure.
  const collected = [];
  for (const name of Object.keys(fields)) {
    if (!name.startsWith("page_")) continue;
    const entry = fields[name];
    const pageText = entry && entry.text ? entry.text : "";
    if (pageText) collected.push(pageText);
  }
  if (collected.length > 0) return collected.join("\n\n");

  // Fallback: the flat full_text value.
  return fields.full_text ? fields.full_text : "";
}
/**
 * Format extraction fields as readable plain text.
 *
 * When extractTextFromFields finds document text (page_X text or full_text),
 * that text is returned as-is. Otherwise every field is rendered as
 * "key: value" lines: nested objects and arrays are indented by one space per
 * level, object array items are introduced by "Item <n>:", primitive items by
 * "- <value>", and top-level fields are separated by a blank line.
 *
 * @param {object} fields - Extraction fields.
 * @returns {string} The formatted text, or "No data extracted." when there is
 *   nothing to show.
 */
function fieldsToText(fields) {
  if (!fields || typeof fields !== "object") {
    return "No data extracted.";
  }

  // Prefer the document text itself when it is available.
  const extractedText = extractTextFromFields(fields);
  if (extractedText) {
    return extractedText;
  }

  // Fallback: format all fields as an indented key/value listing.
  let text = "";
  const formatValue = (key, value, indent = "") => {
    if (Array.isArray(value)) {
      text += `${indent}${key}:\n`;
      value.forEach((item, idx) => {
        // typeof null === "object": null items are printed as primitives
        // rather than passed to Object.entries, which would throw.
        if (typeof item === "object" && item !== null) {
          text += `${indent} Item ${idx + 1}:\n`;
          for (const [k, v] of Object.entries(item)) {
            formatValue(k, v, indent + " ");
          }
        } else {
          text += `${indent} - ${item}\n`;
        }
      });
    } else if (typeof value === "object" && value !== null) {
      text += `${indent}${key}:\n`;
      for (const [k, v] of Object.entries(value)) {
        formatValue(k, v, indent + " ");
      }
    } else {
      text += `${indent}${key}: ${value}\n`;
    }
  };

  for (const [key, value] of Object.entries(fields)) {
    formatValue(key, value);
    text += "\n";
  }

  return text.trim() || "No data extracted.";
}
export default function ExtractionOutput({ hasFile, isProcessing, isComplete, extractionResult, onNewUpload }) {
// UI state: the selected output tab ("json" initially), whether the "copied"
// feedback is currently shown, and the progress message displayed while processing.
const [activeTab, setActiveTab] = useState("json");
const [copied, setCopied] = useState(false);
const [statusMessage, setStatusMessage] = useState("Preparing document...");
// Get fields from extraction result, default to empty object
// (the numeric values below likewise fall back to 0 when missing or falsy)
const fields = extractionResult?.fields || {};
const confidence = extractionResult?.confidence || 0;
const fieldsExtracted = extractionResult?.fieldsExtracted || 0;
const totalTime = extractionResult?.totalTime || 0;
// Dynamic status messages that rotate during processing
const statusMessages = [
"Preparing document...",
"Converting pages to images...",
"Visual Reasoning...",
"Reading text from document...",
"Identifying document structure...",
"Extracting tables and data...",
"Analyzing content...",
"Processing pages...",
"Organizing extracted information...",
"Finalizing results...",
];
// Rotate status messages during processing
// The ref holds the current index so the interval callback can advance it
// without triggering a re-render by itself.
const messageIndexRef = useRef(0);
useEffect(() => {
// Not processing: show a fixed message, reset the index, and start no timer.
if (!isProcessing) {
setStatusMessage("Analyzing document structure");
messageIndexRef.current = 0;
return;
}
// Processing started: begin at the first message and cycle through the list,
// wrapping around at the end.
setStatusMessage(statusMessages[0]);
messageIndexRef.current = 0;
const interval = setInterval(() => {
messageIndexRef.current = (messageIndexRef.current + 1) % statusMessages.length;
setStatusMessage(statusMessages[messageIndexRef.current]);
}, 2500); // Change message every 2.5 seconds
// Cleanup stops the timer when isProcessing changes or the component unmounts.
return () => clearInterval(interval);
// Only isProcessing is listed, so the rotation restarts only when that flag changes.
}, [isProcessing]);
// Initialize expanded sections based on available fields
// (lazy initializer: evaluated for the initial state only)
const [expandedSections, setExpandedSections] = useState(() =>
Object.keys(fields).slice(0, 5) // Expand first 5 sections by default
);
// Helper function to convert HTML to formatted plain text with layout preserved.
//
// Block elements become line breaks (h1/h2/p are followed by a blank line),
// list items are prefixed with a bullet, and tables are rendered as
// " | "-separated columns padded to a common width (minimum 10 characters),
// with a dashed rule after a first row that contains header cells. Runs of
// three or more newlines are collapsed to two and the result is trimmed.
//
// Returns "" for falsy input.
const htmlToFormattedText = (html) => {
  if (!html) return "";

  // Parse with DOMParser instead of assigning to an element's innerHTML: the
  // HTML is derived from extracted document text, and markup such as
  // <img onerror=...> can run its handler when assigned via innerHTML, even on
  // a detached element. A DOMParser document is inert (no script execution,
  // no resource loading).
  const parsedBody = new DOMParser().parseFromString(html, "text/html").body;

  const processChildren = (children) => children.map(processNode).join("");

  const childText = (node) => processChildren(Array.from(node.childNodes));

  // Cell content on a single line with whitespace runs collapsed.
  const cellText = (cell) => childText(cell).trim().replace(/\s+/g, " ");

  const processTable = (table) => {
    const rows = Array.from(table.querySelectorAll("tr"));
    if (rows.length === 0) return "";

    // Extract every cell's text once; it is used for both widths and output.
    const grid = rows.map((row) => Array.from(row.querySelectorAll("td, th")).map(cellText));

    // Column width = longest cell in that column, but never below 10.
    const columnCount = Math.max(...grid.map((cells) => cells.length));
    const columnWidths = new Array(columnCount).fill(10);
    grid.forEach((cells) => {
      cells.forEach((content, colIndex) => {
        columnWidths[colIndex] = Math.max(columnWidths[colIndex], content.length);
      });
    });

    let tableText = "";
    grid.forEach((cells, rowIndex) => {
      const paddedCells = cells.map((content, colIndex) => content.padEnd(columnWidths[colIndex]));
      tableText += paddedCells.join(" | ") + "\n";
      // Add separator after header row
      if (rowIndex === 0 && rows[0].querySelector("th")) {
        tableText += columnWidths.map((w) => "-".repeat(w)).join("-|-") + "\n";
      }
    });
    return tableText;
  };

  // Only direct <li> children are listed here. Items of a nested list are
  // rendered while processing their parent item, so selecting all descendant
  // <li> elements would output them twice.
  const processList = (list) =>
    Array.from(list.children)
      .filter((child) => child.tagName?.toLowerCase() === "li")
      .map((item) => " • " + childText(item).trim())
      .join("\n");

  const processNode = (node) => {
    if (node.nodeType === Node.TEXT_NODE) {
      return node.textContent;
    }
    if (node.nodeType !== Node.ELEMENT_NODE) {
      return "";
    }

    switch (node.tagName?.toLowerCase()) {
      case "h1":
      case "h2":
        return "\n\n" + childText(node).trim() + "\n\n";
      case "h3":
        return "\n" + childText(node).trim() + "\n";
      case "p":
        return childText(node) + "\n\n";
      case "br":
        return "\n";
      case "table":
        return "\n" + processTable(node) + "\n\n";
      case "ul":
      case "ol":
        return "\n" + processList(node) + "\n\n";
      case "li":
        return " • " + childText(node).trim() + "\n";
      default:
        // Inline elements (strong, em, sup, sub, ...) contribute only their text.
        return childText(node);
    }
  };

  // Clean up extra newlines
  return childText(parsedBody).replace(/\n{3,}/g, "\n\n").trim();
};
// Copy the content of the active tab to the clipboard.
//  - "json": the prepared fields, pretty-printed with 2-space indentation
//  - "xml":  the XML serialization of the fields
//  - otherwise (text tab): the rendered HTML converted back to plain text with
//    its layout preserved
// The "copied" indicator is shown for 2 seconds, and only after the clipboard
// write has actually succeeded. Failures (permission denied, Clipboard API
// unavailable in an insecure context, serialization errors) are logged
// instead of surfacing as an unhandled promise rejection.
const handleCopy = async () => {
  try {
    let content = "";
    if (activeTab === "json") {
      content = JSON.stringify(prepareFieldsForOutput(fields, "json"), null, 2);
    } else if (activeTab === "xml") {
      content = objectToXML(fields);
    } else {
      // For text tab, get the formatted HTML and convert to plain text with layout
      const textContent = extractTextFromFields(fields);
      const htmlContent = renderMarkdownToHTML(textContent);
      content = htmlToFormattedText(htmlContent);
    }
    // writeText is still invoked synchronously within the click handler (no
    // await precedes it), so the user-gesture requirement is met.
    await navigator.clipboard.writeText(content);
  } catch (err) {
    console.error("Failed to copy extraction output:", err);
    return;
  }
  setCopied(true);
  setTimeout(() => setCopied(false), 2000);
};
// Get prepared fields for display
// Recomputed only when the `fields` reference changes.
const preparedFields = React.useMemo(() => {
return prepareFieldsForOutput(fields, "json");
}, [fields]);
// Update expanded sections when fields change
// Resets the expanded list to the first 5 keys of the new result; nothing happens
// while extractionResult has no fields.
React.useEffect(() => {
if (extractionResult?.fields) {
setExpandedSections(Object.keys(extractionResult.fields).slice(0, 5));
}
}, [extractionResult]);
// Expand a collapsed section or collapse an expanded one.
// Uses the functional form of the state setter so the update is always
// computed from the latest list of expanded sections.
const toggleSection = (section) => {
  setExpandedSections((current) => {
    if (!current.includes(section)) {
      return [...current, section];
    }
    return current.filter((name) => name !== section);
  });
};
// Render a leaf value for display. Numbers and strings each take their own
// branch; any other type is converted with String().
// NOTE(review): the number/string branches presumably wrap the value in styled
// markup — confirm against the rendered output.
const renderValue = (value) => {
if (typeof value === "number") {
return {value};
}
if (typeof value === "string") {
return "{value}";
}
return String(value);
};
const renderSection = (key, value, level = 0) => {
const isExpanded = expandedSections.includes(key);
const isObject = typeof value === "object" && value !== null;
const isArray = Array.isArray(value);
if (!isObject) {
return (