import React, { useState, useEffect, useRef } from "react"; import { motion, AnimatePresence } from "framer-motion"; import { Code2, Copy, Check, Braces, FileCode2, FileText, Sparkles, ChevronDown, Upload, } from "lucide-react"; import { Button } from "@/components/ui/button"; import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs"; import { cn } from "@/lib/utils"; // Helper function to convert pipe-separated tables to HTML tables function convertPipeTablesToHTML(text) { if (!text) return text; const lines = text.split('\n'); const result = []; let i = 0; while (i < lines.length) { const line = lines[i]; // Check if this line looks like a table row (has multiple pipes) if (line.includes('|') && line.split('|').length >= 3) { // Check if it's a separator line (only |, -, :, spaces) const isSeparator = /^[\s|\-:]+$/.test(line.trim()); if (!isSeparator) { // Start of a table - collect all table rows const tableRows = []; let j = i; // Collect header row const headerLine = lines[j]; const headerCells = headerLine.split('|').map(cell => cell.trim()).filter(cell => cell || cell === ''); // Remove empty cells at start/end if (headerCells.length > 0 && !headerCells[0]) headerCells.shift(); if (headerCells.length > 0 && !headerCells[headerCells.length - 1]) headerCells.pop(); if (headerCells.length >= 2) { tableRows.push(headerCells); j++; // Skip separator line if present if (j < lines.length && /^[\s|\-:]+$/.test(lines[j].trim())) { j++; } // Collect data rows while (j < lines.length) { const rowLine = lines[j]; if (!rowLine.trim()) break; // Empty line ends table // Check if it's still a table row if (rowLine.includes('|') && rowLine.split('|').length >= 2) { const isRowSeparator = /^[\s|\-:]+$/.test(rowLine.trim()); if (!isRowSeparator) { const rowCells = rowLine.split('|').map(cell => cell.trim()); // Remove empty cells at start/end if (rowCells.length > 0 && !rowCells[0]) rowCells.shift(); if (rowCells.length > 0 && !rowCells[rowCells.length - 1]) rowCells.pop(); tableRows.push(rowCells); j++; } else { j++; } } else { break; // Not a table row anymore } } // Convert to HTML table if (tableRows.length > 0) { let htmlTable = '\n\n'; // Header row tableRows[0].forEach(cell => { htmlTable += ``; }); htmlTable += '\n\n\n'; // Data rows for (let rowIdx = 1; rowIdx < tableRows.length; rowIdx++) { htmlTable += ''; tableRows[rowIdx].forEach((cell, colIdx) => { // Use header cell count to ensure alignment const cellContent = cell || ''; htmlTable += ``; }); htmlTable += '\n'; } htmlTable += '\n
${escapeHtml(cell)}
${escapeHtml(cellContent)}
'; result.push(htmlTable); i = j; continue; } } } } // Not a table row, add as-is result.push(line); i++; } return result.join('\n'); } // Helper function to escape HTML function escapeHtml(text) { if (!text) return ''; const div = document.createElement('div'); div.textContent = text; return div.innerHTML; } // Helper function to convert markdown/HTML text to safe HTML function renderMarkdownToHTML(text) { if (!text) return ""; let html = text; // FIRST: Convert pipe-separated tables to HTML tables html = convertPipeTablesToHTML(html); // Convert LaTeX-style superscripts/subscripts FIRST (before protecting tables) // This ensures they're converted everywhere, including inside tables // Convert LaTeX-style superscripts: $^{text}$ or $^text$ to text html = html.replace(/\$\s*\^\s*\{([^}]+)\}\s*\$/g, '$1'); html = html.replace(/\$\s*\^\s*([^\s$<>]+)\s*\$/g, '$1'); // Convert LaTeX-style subscripts: $_{text}$ or $_text$ to text html = html.replace(/\$\s*_\s*\{([^}]+)\}\s*\$/g, '$1'); html = html.replace(/\$\s*_\s*([^\s$<>]+)\s*\$/g, '$1'); // Split by HTML tags to preserve existing HTML (like tables) // Process markdown only in non-HTML sections // First, protect existing HTML blocks (tables, etc.) const htmlBlocks = []; let htmlBlockIndex = 0; // Extract and protect HTML table blocks html = html.replace(//gi, (match) => { const placeholder = `__HTML_BLOCK_${htmlBlockIndex}__`; htmlBlocks[htmlBlockIndex] = match; htmlBlockIndex++; return placeholder; }); // Convert markdown headers (only if not inside HTML) html = html.replace(/^### (.*$)/gim, '

$1

'); html = html.replace(/^## (.*$)/gim, '

$1

'); html = html.replace(/^# (.*$)/gim, '

$1

'); // Convert markdown bold/italic (but not inside HTML tags) html = html.replace(/\*\*(.*?)\*\*/g, '$1'); html = html.replace(/\*(.*?)\*/g, '$1'); // Convert markdown links html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '$1'); // Convert line breaks to paragraphs (but preserve structure around HTML blocks) const parts = html.split(/(__HTML_BLOCK_\d+__)/); const processedParts = parts.map((part, index) => { if (part.match(/^__HTML_BLOCK_\d+__$/)) { // Restore HTML block const blockIndex = parseInt(part.match(/\d+/)[0]); return htmlBlocks[blockIndex]; } else { // Process markdown in this part let processed = part; // Convert double line breaks to paragraph breaks processed = processed.replace(/\n\n+/g, '

'); // Convert single line breaks to
(but not if already in a tag) processed = processed.replace(/([^\n>])\n([^\n<])/g, '$1
$2'); // Wrap in paragraph if there's content if (processed.trim() && !processed.trim().startsWith('<')) { processed = '

' + processed + '

'; } return processed; } }); html = processedParts.join(''); // Process LaTeX notation in restored HTML blocks (tables) as well // This handles any LaTeX that might be in table cells html = html.replace(/(]*>|]*>)([^<]*)\$\s*\^\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi, (match, openTag, before, supText, after, closeTag) => { return openTag + before + '' + supText + '' + after + closeTag; }); html = html.replace(/(]*>|]*>)([^<]*)\$\s*\^\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi, (match, openTag, before, supText, after, closeTag) => { return openTag + before + '' + supText + '' + after + closeTag; }); html = html.replace(/(]*>|]*>)([^<]*)\$\s*_\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi, (match, openTag, before, subText, after, closeTag) => { return openTag + before + '' + subText + '' + after + closeTag; }); html = html.replace(/(]*>|]*>)([^<]*)\$\s*_\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi, (match, openTag, before, subText, after, closeTag) => { return openTag + before + '' + subText + '' + after + closeTag; }); // Clean up empty paragraphs and fix paragraph structure html = html.replace(/

<\/p>/g, ''); html = html.replace(/

\s*
\s*<\/p>/g, ''); html = html.replace(/

\s*<\/p>/g, ''); // Ensure proper spacing around HTML blocks html = html.replace(/(<\/table>)\s*(

$2'); html = html.replace(/(<\/h[1-3]>)\s*($2'); html = html.replace(/(<\/table>)\s*(

)/g, '$1$2'); return html; } // Mock extracted data const mockData = { document: { type: "Invoice", confidence: 0.98, }, vendor: { name: "Acme Corporation", address: "123 Business Ave, Suite 400", city: "San Francisco", state: "CA", zip: "94102", phone: "+1 (555) 123-4567", }, invoice: { number: "INV-2024-0847", date: "2024-01-15", due_date: "2024-02-14", po_number: "PO-9823", }, items: [ { description: "Professional Services", quantity: 40, unit_price: 150.0, total: 6000.0 }, { description: "Software License", quantity: 5, unit_price: 299.99, total: 1499.95 }, { description: "Support Package", quantity: 1, unit_price: 500.0, total: 500.0 }, ], totals: { subtotal: 7999.95, tax_rate: 0.0875, tax_amount: 699.99, total: 8699.94, }, }; const mockXML = ` Acme Corporation

123 Business Ave, Suite 400
San Francisco CA 94102 INV-2024-0847 2024-01-15 2024-02-14 Professional Services 40 6000.00 7999.95 699.99 8699.94 `; const mockText = `INVOICE ACME CORPORATION 123 Business Ave, Suite 400 San Francisco, CA 94102 Phone: +1 (555) 123-4567 Invoice Number: INV-2024-0847 Invoice Date: January 15, 2024 Due Date: February 14, 2024 PO Number: PO-9823 BILL TO: Customer Name 456 Client Street New York, NY 10001 ITEMS: ───────────────────────────────────────────────────────── Description Qty Unit Price Total ───────────────────────────────────────────────────────── Professional Services 40 $150.00 $6,000.00 Software License 5 $299.99 $1,499.95 Support Package 1 $500.00 $500.00 ───────────────────────────────────────────────────────── Subtotal: $7,999.95 Tax (8.75%): $699.99 ───────────────────────── TOTAL: $8,699.94 Payment Terms: Net 30 Thank you for your business!`; // Helper function to convert object to XML // Prepare fields for JSON/XML output - remove duplicates and restructure function prepareFieldsForOutput(fields, format = "json") { if (!fields || typeof fields !== "object") { return fields; } const output = { ...fields }; // Extract Fields from root level if it exists const rootFields = output.Fields; // Remove Fields from output temporarily (will be added back at top) delete output.Fields; // Remove full_text from top-level if pages array exists (to avoid duplication) if (output.pages && Array.isArray(output.pages) && output.pages.length > 0) { delete output.full_text; // Clean up each page: remove full_text from page.fields (it duplicates page.text) output.pages = output.pages.map(page => { const cleanedPage = { ...page }; if (cleanedPage.fields && typeof cleanedPage.fields === "object") { const cleanedFields = { ...cleanedPage.fields }; // Remove full_text from page fields (duplicates page.text) delete cleanedFields.full_text; cleanedPage.fields = cleanedFields; } return cleanedPage; }); } // For JSON and XML: restructure pages into separate top-level fields (page_1, page_2, etc.) if ((format === "json" || format === "xml") && output.pages && Array.isArray(output.pages)) { // Get top-level field keys (these are merged from all pages - avoid duplicating in page fields) const topLevelKeys = new Set(Object.keys(output).filter(k => k !== "pages" && k !== "full_text" && k !== "Fields")); output.pages.forEach((page, idx) => { const pageNum = page.page_number || idx + 1; const pageFields = page.fields || {}; // Remove duplicate fields from page.fields: // 1. Remove full_text (duplicates page.text) // 2. Remove fields that match top-level fields (already shown at root) const cleanedPageFields = {}; for (const [key, value] of Object.entries(pageFields)) { // Skip full_text and fields that match top-level exactly if (key !== "full_text" && (!topLevelKeys.has(key) || (value !== output[key]))) { cleanedPageFields[key] = value; } } const pageObj = { text: page.text || "", confidence: page.confidence || 0, doc_type: page.doc_type || "other" }; // Add table and footer_notes if they exist if (page.table && Array.isArray(page.table) && page.table.length > 0) { pageObj.table = page.table; } if (page.footer_notes && Array.isArray(page.footer_notes) && page.footer_notes.length > 0) { pageObj.footer_notes = page.footer_notes; } // Only add fields if there are unique page-specific fields if (Object.keys(cleanedPageFields).length > 0) { pageObj.fields = cleanedPageFields; } output[`page_${pageNum}`] = pageObj; }); // Remove pages array - we now have page_1, page_2, etc. as separate fields delete output.pages; } // Handle page_X structure (from backend) - remove Fields from page objects if they exist if (output && typeof output === "object") { const pageKeys = Object.keys(output).filter(k => k.startsWith("page_")); for (const pageKey of pageKeys) { const pageData = output[pageKey]; if (pageData && typeof pageData === "object") { // Remove Fields from page objects (it's now at root level) delete pageData.Fields; delete pageData.metadata; } } } // Rebuild output with Fields at the top (only if it exists and is not empty) const finalOutput = {}; if (rootFields && typeof rootFields === "object" && Object.keys(rootFields).length > 0) { finalOutput.Fields = rootFields; } // Add all other keys Object.keys(output).forEach(key => { finalOutput[key] = output[key]; }); return finalOutput; } function objectToXML(obj, rootName = "extraction") { // Prepare fields - remove full_text if pages exist const preparedObj = prepareFieldsForOutput(obj, "xml"); let xml = `\n<${rootName}>\n`; const convert = (obj, indent = " ") => { for (const [key, value] of Object.entries(obj)) { if (value === null || value === undefined) continue; // Skip full_text if pages exist (already handled in prepareFieldsForOutput) if (key === "full_text" && obj.pages && Array.isArray(obj.pages) && obj.pages.length > 0) { continue; } if (Array.isArray(value)) { value.forEach((item) => { xml += `${indent}<${key}>\n`; if (typeof item === "object") { convert(item, indent + " "); } else { xml += `${indent} ${escapeXML(String(item))}\n`; } xml += `${indent}\n`; }); } else if (typeof value === "object") { xml += `${indent}<${key}>\n`; convert(value, indent + " "); xml += `${indent}\n`; } else { xml += `${indent}<${key}>${escapeXML(String(value))}\n`; } } }; convert(preparedObj); xml += ``; return xml; } function escapeXML(str) { return str .replace(/&/g, "&") .replace(//g, ">") .replace(/"/g, """) .replace(/'/g, "'"); } // Helper function to extract text from page structure function extractTextFromFields(fields) { if (!fields || typeof fields !== "object") { return ""; } // Check for page_X structure first (preferred format) const pageKeys = Object.keys(fields).filter(key => key.startsWith("page_")); if (pageKeys.length > 0) { // Get text from first page (or combine all pages) const pageTexts = pageKeys.map(key => { const page = fields[key]; if (page && page.text) { return page.text; } return ""; }).filter(text => text); if (pageTexts.length > 0) { return pageTexts.join("\n\n"); } } // Fallback to full_text if (fields.full_text) { return fields.full_text; } return ""; } // Helper function to format fields as readable text function fieldsToText(fields) { if (!fields || typeof fields !== "object") { return "No data extracted."; } // Extract text from page structure or full_text const extractedText = extractTextFromFields(fields); if (extractedText) { return extractedText; // Don't show pages array separately if full_text already contains page markers // (full_text from backend already includes "=== PAGE 1 ===" etc.) const hasPageMarkers = fields.full_text.includes("=== PAGE") || fields.full_text.includes("--- Page"); // Only show pages array if full_text doesn't already have page breakdown if (!hasPageMarkers && fields.pages && Array.isArray(fields.pages)) { text += "\n\n=== TEXT BY PAGE ===\n\n"; fields.pages.forEach((page, idx) => { text += `--- Page ${page.page_number || idx + 1} ---\n`; text += page.text || ""; text += "\n\n"; }); } // Then show other structured fields const otherFields = { ...fields }; delete otherFields.full_text; delete otherFields.pages; if (Object.keys(otherFields).length > 0) { text += "\n\n=== STRUCTURED FIELDS ===\n\n"; const formatValue = (key, value, indent = "") => { if (Array.isArray(value)) { text += `${indent}${key}:\n`; value.forEach((item, idx) => { if (typeof item === "object") { text += `${indent} Item ${idx + 1}:\n`; Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " ")); } else { text += `${indent} - ${item}\n`; } }); } else if (typeof value === "object" && value !== null) { text += `${indent}${key}:\n`; Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " ")); } else { text += `${indent}${key}: ${value}\n`; } }; Object.entries(otherFields).forEach(([key, value]) => { formatValue(key, value); text += "\n"; }); } return text.trim(); } // Fallback: format all fields normally let text = ""; const formatValue = (key, value, indent = "") => { if (Array.isArray(value)) { text += `${indent}${key}:\n`; value.forEach((item, idx) => { if (typeof item === "object") { text += `${indent} Item ${idx + 1}:\n`; Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " ")); } else { text += `${indent} - ${item}\n`; } }); } else if (typeof value === "object" && value !== null) { text += `${indent}${key}:\n`; Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " ")); } else { text += `${indent}${key}: ${value}\n`; } }; Object.entries(fields).forEach(([key, value]) => { formatValue(key, value); text += "\n"; }); return text.trim() || "No data extracted."; } export default function ExtractionOutput({ hasFile, isProcessing, isComplete, extractionResult, onNewUpload }) { const [activeTab, setActiveTab] = useState("json"); const [copied, setCopied] = useState(false); const [statusMessage, setStatusMessage] = useState("Preparing document..."); // Get fields from extraction result, default to empty object const fields = extractionResult?.fields || {}; const confidence = extractionResult?.confidence || 0; const fieldsExtracted = extractionResult?.fieldsExtracted || 0; const totalTime = extractionResult?.totalTime || 0; // Dynamic status messages that rotate during processing const statusMessages = [ "Preparing document...", "Converting pages to images...", "Visual Reasoning...", "Reading text from document...", "Identifying document structure...", "Extracting tables and data...", "Analyzing content...", "Processing pages...", "Organizing extracted information...", "Finalizing results...", ]; // Rotate status messages during processing const messageIndexRef = useRef(0); useEffect(() => { if (!isProcessing) { setStatusMessage("Analyzing document structure"); messageIndexRef.current = 0; return; } setStatusMessage(statusMessages[0]); messageIndexRef.current = 0; const interval = setInterval(() => { messageIndexRef.current = (messageIndexRef.current + 1) % statusMessages.length; setStatusMessage(statusMessages[messageIndexRef.current]); }, 2500); // Change message every 2.5 seconds return () => clearInterval(interval); }, [isProcessing]); // Initialize expanded sections based on available fields const [expandedSections, setExpandedSections] = useState(() => Object.keys(fields).slice(0, 5) // Expand first 5 sections by default ); // Helper function to convert HTML to formatted plain text with layout preserved const htmlToFormattedText = (html) => { if (!html) return ""; // Create a temporary div to parse HTML const tempDiv = document.createElement("div"); tempDiv.innerHTML = html; let text = ""; // Process each element const processNode = (node) => { if (node.nodeType === Node.TEXT_NODE) { return node.textContent; } if (node.nodeType !== Node.ELEMENT_NODE) { return ""; } const tagName = node.tagName?.toLowerCase(); const children = Array.from(node.childNodes); switch (tagName) { case "h1": return "\n\n" + processChildren(children).trim() + "\n\n"; case "h2": return "\n\n" + processChildren(children).trim() + "\n\n"; case "h3": return "\n" + processChildren(children).trim() + "\n"; case "p": return processChildren(children) + "\n\n"; case "br": return "\n"; case "strong": case "b": return processChildren(children); case "em": case "i": return processChildren(children); case "sup": return processChildren(children); case "sub": return processChildren(children); case "table": return "\n" + processTable(node) + "\n\n"; case "ul": case "ol": return "\n" + processList(node) + "\n\n"; case "li": return " • " + processChildren(children).trim() + "\n"; default: return processChildren(children); } }; const processChildren = (children) => { return children.map(processNode).join(""); }; const processTable = (table) => { let tableText = ""; const rows = table.querySelectorAll("tr"); if (rows.length === 0) return ""; // First pass: calculate column widths const allRows = Array.from(rows); const columnCount = Math.max(...allRows.map(row => row.querySelectorAll("td, th").length)); const columnWidths = new Array(columnCount).fill(0); allRows.forEach(row => { const cells = row.querySelectorAll("td, th"); cells.forEach((cell, colIndex) => { const cellText = processChildren(Array.from(cell.childNodes)).trim().replace(/\s+/g, " "); columnWidths[colIndex] = Math.max(columnWidths[colIndex] || 0, cellText.length, 10); }); }); // Second pass: format rows allRows.forEach((row, rowIndex) => { const cells = row.querySelectorAll("td, th"); const cellTexts = Array.from(cells).map(cell => { let cellContent = processChildren(Array.from(cell.childNodes)).trim(); cellContent = cellContent.replace(/\s+/g, " "); return cellContent; }); // Pad cells to column widths const paddedCells = cellTexts.map((text, i) => { const width = columnWidths[i] || 10; return text.padEnd(width); }); tableText += paddedCells.join(" | ") + "\n"; // Add separator after header row if (rowIndex === 0 && row.querySelector("th")) { tableText += columnWidths.map(w => "-".repeat(w)).join("-|-") + "\n"; } }); return tableText; }; const processList = (list) => { const items = list.querySelectorAll("li"); return Array.from(items).map(item => { return " • " + processChildren(Array.from(item.childNodes)).trim(); }).join("\n"); }; text = processChildren(Array.from(tempDiv.childNodes)); // Clean up extra newlines text = text.replace(/\n{3,}/g, "\n\n"); text = text.trim(); return text; }; const handleCopy = () => { let content = ""; if (activeTab === "json") { const preparedFields = prepareFieldsForOutput(fields, "json"); content = JSON.stringify(preparedFields, null, 2); } else if (activeTab === "xml") { content = objectToXML(fields); } else { // For text tab, get the formatted HTML and convert to plain text with layout const textContent = extractTextFromFields(fields); const htmlContent = renderMarkdownToHTML(textContent); content = htmlToFormattedText(htmlContent); } navigator.clipboard.writeText(content); setCopied(true); setTimeout(() => setCopied(false), 2000); }; // Get prepared fields for display const preparedFields = React.useMemo(() => { return prepareFieldsForOutput(fields, "json"); }, [fields]); // Update expanded sections when fields change React.useEffect(() => { if (extractionResult?.fields) { setExpandedSections(Object.keys(extractionResult.fields).slice(0, 5)); } }, [extractionResult]); const toggleSection = (section) => { setExpandedSections((prev) => prev.includes(section) ? prev.filter((s) => s !== section) : [...prev, section] ); }; const renderValue = (value) => { if (typeof value === "number") { return {value}; } if (typeof value === "string") { return "{value}"; } return String(value); }; const renderSection = (key, value, level = 0) => { const isExpanded = expandedSections.includes(key); const isObject = typeof value === "object" && value !== null; const isArray = Array.isArray(value); if (!isObject) { return (
"{key}" : {renderValue(value)}
); } return (
{isExpanded && ( {isArray ? ( value.map((item, idx) => (
{Object.entries(item).map(([k, v]) => renderSection(k, v, level + 2))} {idx < value.length - 1 &&
}
)) ) : ( Object.entries(value).map(([k, v]) => renderSection(k, v, level + 1)) )}
{isArray ? "]" : "}"}
)}
); }; return (
{/* Header */}

Extracted Data

{isComplete ? `${fieldsExtracted} field${fieldsExtracted !== 1 ? 's' : ''} extracted` : "Waiting for extraction"}

{isComplete && onNewUpload && ( )}
{isComplete && (
Text JSON XML
)}
{/* Output Area */}
{!hasFile ? (

Extracted data will appear here

) : isProcessing ? (

Extracting data...

{statusMessage}

{[0, 1, 2].map((i) => ( ))}
) : isComplete && Object.keys(fields).length === 0 ? (

No data extracted

The document may not contain extractable fields

) : (
{activeTab === "text" ? (
) : activeTab === "json" ? (
{"{"} {Object.keys(preparedFields).length > 0 ? ( Object.entries(preparedFields).map(([key, value]) => renderSection(key, value, 1) ) ) : (
No fields extracted
)} {"}"}
) : (
                {objectToXML(fields).split("\n").map((line, i) => (
                  
{line.includes("<") ? ( <> {line.split(/(<\/?[\w\s=".-]+>)/g).map((part, j) => { if (part.startsWith(" {part} ); } if (part.startsWith("<")) { return ( {part} ); } return ( {part} ); })} ) : ( line )}
))}
)}
)}
{/* Confidence Footer */} {isComplete && extractionResult && (
= 90 ? "bg-emerald-500" : confidence >= 70 ? "bg-amber-500" : "bg-red-500" )} /> Confidence: {confidence > 0 ? `${confidence.toFixed(1)}%` : "N/A"}
Fields: {fieldsExtracted}
Processed in {totalTime >= 1000 ? `${(totalTime / 1000).toFixed(1)}s` : `${totalTime}ms`}
)}
); }