Spaces:
Sleeping
Sleeping
| import React, { useState, useEffect, useRef } from "react"; | |
| import { motion, AnimatePresence } from "framer-motion"; | |
| import { | |
| Code2, | |
| Copy, | |
| Check, | |
| Braces, | |
| FileCode2, | |
| FileText, | |
| Sparkles, | |
| ChevronDown, | |
| Upload, | |
| } from "lucide-react"; | |
| import { Button } from "@/components/ui/button"; | |
| import { Tabs, TabsList, TabsTrigger } from "@/components/ui/tabs"; | |
| import { cn } from "@/lib/utils"; | |
/**
 * Convert Markdown-style pipe tables embedded in `text` into HTML tables.
 *
 * A table starts at a line that splits into at least three `|`-separated
 * pieces, is not a separator row, and yields at least two header cells. An
 * optional separator row (only `|`, `-`, `:` and whitespace) directly after
 * the header is skipped. Following lines that contain `|` become body rows
 * until a blank line or a line without `|` is reached. All other lines are
 * passed through unchanged.
 *
 * Body rows with fewer cells than the header are padded with empty cells so
 * every row spans the full table width.
 *
 * @param {string} text - Source text, possibly containing pipe tables.
 * @returns {string} The text with each detected table replaced by `<table>` markup;
 *   falsy input is returned as-is.
 */
function convertPipeTablesToHTML(text) {
  if (!text) return text;

  // A separator row consists solely of pipes, dashes, colons and whitespace.
  const separatorPattern = /^[\s|\-:]+$/;
  const isSeparatorRow = (line) => separatorPattern.test(line.trim());

  // Split a row into trimmed cells, dropping the empty cell produced by a
  // leading and/or trailing pipe.
  const splitCells = (line) => {
    const cells = line.split('|').map((cell) => cell.trim());
    if (cells.length > 0 && !cells[0]) cells.shift();
    if (cells.length > 0 && !cells[cells.length - 1]) cells.pop();
    return cells;
  };

  const renderTable = ([headerCells, ...bodyRows]) => {
    let htmlTable = '<table class="border-collapse border border-gray-300 w-full my-4">\n<thead>\n<tr>';
    for (const cell of headerCells) {
      htmlTable += `<th class="border border-gray-300 px-4 py-2 bg-gray-100 font-semibold text-left">${escapeHtml(cell)}</th>`;
    }
    htmlTable += '</tr>\n</thead>\n<tbody>\n';
    for (const row of bodyRows) {
      htmlTable += '<tr>';
      for (const cell of row) {
        htmlTable += `<td class="border border-gray-300 px-4 py-2">${escapeHtml(cell || '')}</td>`;
      }
      htmlTable += '</tr>\n';
    }
    htmlTable += '</tbody>\n</table>';
    return htmlTable;
  };

  const lines = text.split('\n');
  const result = [];
  let i = 0;

  while (i < lines.length) {
    const line = lines[i];
    const mayBeHeader = line.split('|').length >= 3 && !isSeparatorRow(line);
    const headerCells = mayBeHeader ? splitCells(line) : [];

    if (headerCells.length < 2) {
      // Not the start of a table: keep the line untouched.
      result.push(line);
      i++;
      continue;
    }

    const tableRows = [headerCells];
    let j = i + 1;

    // Skip the header/body separator when present.
    if (j < lines.length && isSeparatorRow(lines[j])) j++;

    while (j < lines.length) {
      const rowLine = lines[j];
      // A blank line or a line without any pipe ends the table.
      if (!rowLine.trim() || !rowLine.includes('|')) break;

      // Separator rows inside the body are dropped silently.
      if (!isSeparatorRow(rowLine)) {
        const rowCells = splitCells(rowLine);
        // Pad short rows so they line up with the header columns.
        while (rowCells.length < headerCells.length) rowCells.push('');
        tableRows.push(rowCells);
      }
      j++;
    }

    result.push(renderTable(tableRows));
    i = j;
  }

  return result.join('\n');
}
/**
 * Escape text so it can be placed inside HTML element content.
 *
 * Implemented with plain string replacement instead of a temporary DOM node,
 * so it also works where `document` is unavailable (server-side rendering,
 * unit tests). The escaped characters are the ones an element's `innerHTML`
 * serialisation escapes in text content: `&`, `<`, `>` and the no-break space.
 *
 * @param {string} text - Raw text.
 * @returns {string} Escaped text; an empty string for any falsy input.
 */
function escapeHtml(text) {
  if (!text) return '';
  return String(text)
    .replace(/&/g, '&amp;') // must run first so later entities are not double-escaped
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/\u00a0/g, '&nbsp;');
}
/**
 * Convert lightly formatted Markdown (optionally mixed with HTML tables) to an HTML string.
 *
 * Processing order matters and is kept as follows:
 *   1. pipe tables -> `<table>` (via `convertPipeTablesToHTML`)
 *   2. LaTeX-style `$^{x}$` / `$_{x}$` -> `<sup>` / `<sub>`
 *   3. `<table>` blocks are swapped for placeholders so Markdown rules do not touch them
 *   4. headers, bold, italic and links
 *   5. blank lines -> paragraph breaks, single newlines -> `<br>`
 *   6. tables are restored and the markup is tidied up
 *
 * Link targets are validated: only `http`, `https`, `mailto`, `tel` and
 * scheme-less (relative) URLs become anchors, and the `href` value is
 * attribute-escaped. A link with any other scheme (e.g. `javascript:`) is
 * reduced to its label text.
 *
 * NOTE(review): apart from table cells and link targets, the input text is
 * NOT HTML-escaped, so raw markup in `text` reaches the output. Callers that
 * inject the result into the DOM should only pass trusted text or sanitise
 * the result.
 *
 * @param {string} text - Markdown/HTML source text.
 * @returns {string} HTML string; an empty string for falsy input.
 */
function renderMarkdownToHTML(text) {
  if (!text) return "";

  const safeLinkSchemes = new Set(["http", "https", "mailto", "tel"]);
  let html = text;

  // FIRST: convert pipe-separated tables to HTML tables.
  html = convertPipeTablesToHTML(html);

  // LaTeX-style superscripts: $^{text}$ or $^text$ -> <sup>text</sup>.
  // Done before tables are protected so it also applies inside table cells.
  html = html.replace(/\$\s*\^\s*\{([^}]+)\}\s*\$/g, '<sup>$1</sup>');
  html = html.replace(/\$\s*\^\s*([^\s$<>]+)\s*\$/g, '<sup>$1</sup>');
  // LaTeX-style subscripts: $_{text}$ or $_text$ -> <sub>text</sub>.
  html = html.replace(/\$\s*_\s*\{([^}]+)\}\s*\$/g, '<sub>$1</sub>');
  html = html.replace(/\$\s*_\s*([^\s$<>]+)\s*\$/g, '<sub>$1</sub>');

  // Protect existing <table> blocks behind placeholders while Markdown is processed.
  const htmlBlocks = [];
  let htmlBlockIndex = 0;
  html = html.replace(/<table[\s\S]*?<\/table>/gi, (match) => {
    const placeholder = `__HTML_BLOCK_${htmlBlockIndex}__`;
    htmlBlocks[htmlBlockIndex] = match;
    htmlBlockIndex++;
    return placeholder;
  });

  // Markdown headers (most specific first).
  html = html.replace(/^### (.*$)/gim, '<h3>$1</h3>');
  html = html.replace(/^## (.*$)/gim, '<h2>$1</h2>');
  html = html.replace(/^# (.*$)/gim, '<h1>$1</h1>');

  // Markdown bold / italic.
  html = html.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>');
  html = html.replace(/\*(.*?)\*/g, '<em>$1</em>');

  // Markdown links, with scheme validation and attribute escaping of the target.
  html = html.replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, label, url) => {
    // Browsers ignore whitespace/control characters inside a scheme
    // ("java\tscript:"), so strip them before inspecting it.
    const compactUrl = url.replace(/[\u0000-\u0020\u007f]+/g, '');
    const scheme = /^([a-z][a-z0-9+.\-]*):/i.exec(compactUrl);
    if (scheme && !safeLinkSchemes.has(scheme[1].toLowerCase())) {
      return label;
    }
    const safeHref = url
      .replace(/&/g, '&amp;')
      .replace(/"/g, '&quot;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
    return `<a href="${safeHref}" target="_blank" rel="noopener noreferrer">${label}</a>`;
  });

  // Paragraph handling for the text between protected HTML blocks.
  const parts = html.split(/(__HTML_BLOCK_\d+__)/);
  const processedParts = parts.map((part) => {
    if (part.match(/^__HTML_BLOCK_\d+__$/)) {
      // Restore the protected HTML block.
      const blockIndex = Number.parseInt(part.match(/\d+/)[0], 10);
      return htmlBlocks[blockIndex];
    }

    let processed = part;
    // Blank lines become paragraph breaks.
    processed = processed.replace(/\n\n+/g, '</p><p>');
    // Single newlines become <br>, unless they sit directly next to a tag.
    processed = processed.replace(/([^\n>])\n([^\n<])/g, '$1<br>$2');
    // Wrap in a paragraph when the part does not already start with markup.
    if (processed.trim() && !processed.trim().startsWith('<')) {
      processed = '<p>' + processed + '</p>';
    }
    return processed;
  });
  html = processedParts.join('');

  // LaTeX notation that is still present inside restored table cells.
  html = html.replace(/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*\^\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi,
    (match, openTag, before, supText, after, closeTag) => {
      return openTag + before + '<sup>' + supText + '</sup>' + after + closeTag;
    });
  html = html.replace(/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*\^\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi,
    (match, openTag, before, supText, after, closeTag) => {
      return openTag + before + '<sup>' + supText + '</sup>' + after + closeTag;
    });
  html = html.replace(/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*_\s*\{([^}]+)\}\s*\$([^<]*)(<\/td>|<\/th>)/gi,
    (match, openTag, before, subText, after, closeTag) => {
      return openTag + before + '<sub>' + subText + '</sub>' + after + closeTag;
    });
  html = html.replace(/(<td[^>]*>|<th[^>]*>)([^<]*)\$\s*_\s*([^\s$<>]+)\s*\$([^<]*)(<\/td>|<\/th>)/gi,
    (match, openTag, before, subText, after, closeTag) => {
      return openTag + before + '<sub>' + subText + '</sub>' + after + closeTag;
    });

  // Remove empty paragraphs.
  html = html.replace(/<p><\/p>/g, '');
  html = html.replace(/<p>\s*<br>\s*<\/p>/g, '');
  html = html.replace(/<p>\s*<\/p>/g, '');

  // Spacing around tables and headings.
  html = html.replace(/(<\/table>)\s*(<h[1-3])/g, '$1</p><p>$2');
  html = html.replace(/(<\/h[1-3]>)\s*(<table)/g, '$1<p>$2');
  html = html.replace(/(<\/table>)\s*(<p>)/g, '$1$2');

  return html;
}
// Sample extraction result for a fictional invoice (structured form).
// Line items are listed as [description, quantity, unit_price, total] tuples
// and expanded into objects below.
const mockData = {
  document: { type: "Invoice", confidence: 0.98 },
  vendor: {
    name: "Acme Corporation",
    address: "123 Business Ave, Suite 400",
    city: "San Francisco",
    state: "CA",
    zip: "94102",
    phone: "+1 (555) 123-4567",
  },
  invoice: {
    number: "INV-2024-0847",
    date: "2024-01-15",
    due_date: "2024-02-14",
    po_number: "PO-9823",
  },
  items: [
    ["Professional Services", 40, 150, 6000],
    ["Software License", 5, 299.99, 1499.95],
    ["Support Package", 1, 500, 500],
  ].map(([description, quantity, unit_price, total]) => ({
    description,
    quantity,
    unit_price,
    total,
  })),
  totals: {
    subtotal: 7999.95,
    tax_rate: 0.0875,
    tax_amount: 699.99,
    total: 8699.94,
  },
};
// Sample XML rendering of the mock invoice. It lists a single line item and
// omits the vendor phone and PO number, while its totals match the
// three-item mockData object. Not referenced by any code in the visible part
// of this file.
| const mockXML = `<?xml version="1.0" encoding="UTF-8"?> | |
| <extraction> | |
| <document type="Invoice" confidence="0.98"/> | |
| <vendor> | |
| <name>Acme Corporation</name> | |
| <address>123 Business Ave, Suite 400</address> | |
| <city>San Francisco</city> | |
| <state>CA</state> | |
| <zip>94102</zip> | |
| </vendor> | |
| <invoice> | |
| <number>INV-2024-0847</number> | |
| <date>2024-01-15</date> | |
| <due_date>2024-02-14</due_date> | |
| </invoice> | |
| <items> | |
| <item> | |
| <description>Professional Services</description> | |
| <quantity>40</quantity> | |
| <total>6000.00</total> | |
| </item> | |
| </items> | |
| <totals> | |
| <subtotal>7999.95</subtotal> | |
| <tax>699.99</tax> | |
| <total>8699.94</total> | |
| </totals> | |
| </extraction>`; | |
// Sample plain-text rendering of the same mock invoice, including a "BILL TO"
// block and payment terms that have no counterpart in mockData. Not referenced
// by any code in the visible part of this file.
| const mockText = `INVOICE | |
| ACME CORPORATION | |
| 123 Business Ave, Suite 400 | |
| San Francisco, CA 94102 | |
| Phone: +1 (555) 123-4567 | |
| Invoice Number: INV-2024-0847 | |
| Invoice Date: January 15, 2024 | |
| Due Date: February 14, 2024 | |
| PO Number: PO-9823 | |
| BILL TO: | |
| Customer Name | |
| 456 Client Street | |
| New York, NY 10001 | |
| ITEMS: | |
| ───────────────────────────────────────────────────────── | |
| Description Qty Unit Price Total | |
| ───────────────────────────────────────────────────────── | |
| Professional Services 40 $150.00 $6,000.00 | |
| Software License 5 $299.99 $1,499.95 | |
| Support Package 1 $500.00 $500.00 | |
| ───────────────────────────────────────────────────────── | |
| Subtotal: $7,999.95 | |
| Tax (8.75%): $699.99 | |
| ───────────────────────── | |
| TOTAL: $8,699.94 | |
| Payment Terms: Net 30 | |
| Thank you for your business!`; | |
/**
 * Prepare extracted fields for JSON/XML output by removing duplicated data
 * and flattening the `pages` array.
 *
 * Steps:
 *   - `Fields` is lifted out and re-inserted as the first key (only when it is
 *     a non-empty object).
 *   - When a non-empty `pages` array exists, the top-level `full_text` and each
 *     page's `fields.full_text` are dropped (they duplicate the page text).
 *   - For the "json" and "xml" formats, `pages` is replaced by `page_<n>` keys
 *     holding `text`, `confidence`, `doc_type` and, when non-empty, `table`,
 *     `footer_notes` and the page-specific `fields` (fields identical to a
 *     top-level field are omitted).
 *   - `Fields` and `metadata` are stripped from every `page_*` object.
 *
 * The input is never mutated: `page_*` objects are copied before keys are
 * removed, so objects owned by the caller (e.g. React props/state) stay intact.
 *
 * @param {object} fields - Extraction result fields.
 * @param {string} [format="json"] - Output format; "json" and "xml" trigger page flattening.
 * @returns {object} A new, restructured object; non-object input is returned unchanged.
 */
function prepareFieldsForOutput(fields, format = "json") {
  if (!fields || typeof fields !== "object") {
    return fields;
  }

  const output = { ...fields };

  // Lift Fields out; it is re-added as the first key at the end.
  const rootFields = output.Fields;
  delete output.Fields;

  if (Array.isArray(output.pages) && output.pages.length > 0) {
    // The per-page text already covers the full text.
    delete output.full_text;
    output.pages = output.pages.map((page) => {
      const cleanedPage = { ...page };
      if (cleanedPage.fields && typeof cleanedPage.fields === "object") {
        const cleanedFields = { ...cleanedPage.fields };
        delete cleanedFields.full_text;
        cleanedPage.fields = cleanedFields;
      }
      return cleanedPage;
    });
  }

  if ((format === "json" || format === "xml") && Array.isArray(output.pages)) {
    // Keys already present at the root; identical page-level copies are skipped.
    const topLevelKeys = new Set(
      Object.keys(output).filter((k) => k !== "pages" && k !== "full_text" && k !== "Fields")
    );

    output.pages.forEach((page, idx) => {
      const pageNum = page.page_number || idx + 1;
      const pageFields = page.fields || {};

      const cleanedPageFields = {};
      for (const [key, value] of Object.entries(pageFields)) {
        const duplicatesRoot = topLevelKeys.has(key) && value === output[key];
        if (key !== "full_text" && !duplicatesRoot) {
          cleanedPageFields[key] = value;
        }
      }

      const pageObj = {
        text: page.text || "",
        confidence: page.confidence || 0,
        doc_type: page.doc_type || "other",
      };
      if (Array.isArray(page.table) && page.table.length > 0) {
        pageObj.table = page.table;
      }
      if (Array.isArray(page.footer_notes) && page.footer_notes.length > 0) {
        pageObj.footer_notes = page.footer_notes;
      }
      // Only include fields when the page has something not shown at the root.
      if (Object.keys(cleanedPageFields).length > 0) {
        pageObj.fields = cleanedPageFields;
      }
      output[`page_${pageNum}`] = pageObj;
    });

    // The pages are now available as page_1, page_2, ...
    delete output.pages;
  }

  // page_X objects (possibly supplied by the backend): drop Fields/metadata.
  // Work on a copy, because these objects may still be shared with `fields`.
  for (const pageKey of Object.keys(output)) {
    if (!pageKey.startsWith("page_")) continue;
    const pageData = output[pageKey];
    if (pageData && typeof pageData === "object" && !Array.isArray(pageData)) {
      const pageCopy = { ...pageData };
      delete pageCopy.Fields;
      delete pageCopy.metadata;
      output[pageKey] = pageCopy;
    }
  }

  // Rebuild with Fields first (only when it is a non-empty object).
  const finalOutput = {};
  if (rootFields && typeof rootFields === "object" && Object.keys(rootFields).length > 0) {
    finalOutput.Fields = rootFields;
  }
  for (const key of Object.keys(output)) {
    finalOutput[key] = output[key];
  }
  return finalOutput;
}
/**
 * Serialise an extraction result to an XML document string.
 *
 * The object is first passed through `prepareFieldsForOutput(obj, "xml")`.
 * Nested objects become nested elements, every array item becomes its own
 * element named after the array's key, and scalar values are XML-escaped.
 * `null`/`undefined` values and array items are skipped.
 *
 * Object keys are turned into valid element names: characters that are not
 * letters, digits, `_`, `.` or `-` are replaced by `_`, and a `_` is prefixed
 * when the name would not start with a letter or underscore
 * (e.g. "Invoice Number" -> `Invoice_Number`, "1st" -> `_1st`).
 *
 * @param {object} obj - Data to serialise.
 * @param {string} [rootName="extraction"] - Name of the root element (used verbatim).
 * @returns {string} XML document text.
 */
function objectToXML(obj, rootName = "extraction") {
  const preparedObj = prepareFieldsForOutput(obj, "xml");

  const toElementName = (key) => {
    const cleaned = String(key).replace(/[^\p{L}\p{N}_.\-]/gu, "_");
    return /^[\p{L}_]/u.test(cleaned) ? cleaned : `_${cleaned}`;
  };

  let xml = `<?xml version="1.0" encoding="UTF-8"?>\n<${rootName}>\n`;

  const convert = (node, indent = " ") => {
    for (const [key, value] of Object.entries(node)) {
      if (value === null || value === undefined) continue;
      // full_text duplicates the page texts when a pages array is present.
      if (key === "full_text" && Array.isArray(node.pages) && node.pages.length > 0) {
        continue;
      }

      const tag = toElementName(key);

      if (Array.isArray(value)) {
        value.forEach((item) => {
          // typeof null === "object": skip nullish items instead of recursing into them.
          if (item === null || item === undefined) return;
          xml += `${indent}<${tag}>\n`;
          if (typeof item === "object") {
            convert(item, indent + " ");
          } else {
            xml += `${indent} ${escapeXML(String(item))}\n`;
          }
          xml += `${indent}</${tag}>\n`;
        });
      } else if (typeof value === "object") {
        xml += `${indent}<${tag}>\n`;
        convert(value, indent + " ");
        xml += `${indent}</${tag}>\n`;
      } else {
        xml += `${indent}<${tag}>${escapeXML(String(value))}</${tag}>\n`;
      }
    }
  };

  // Non-object input yields an empty root element instead of throwing.
  if (preparedObj && typeof preparedObj === "object") {
    convert(preparedObj);
  }
  xml += `</${rootName}>`;
  return xml;
}
/**
 * Escape the five characters that have predefined entities in XML so the
 * value can be used safely as element content or inside an attribute.
 *
 * @param {string} str - Raw value (coerced to a string).
 * @returns {string} The value with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeXML(str) {
  return String(str)
    .replace(/&/g, "&amp;") // must run first so the entities below are not double-escaped
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
/**
 * Extract the document text from an extraction result.
 *
 * Preferred source is the `page_<n>` structure: the non-empty `text` of every
 * `page_*` entry is joined with a blank line, in ascending page-number order
 * (so `page_10` follows `page_2` regardless of key order). `page_*` keys
 * without a numeric suffix come last, in their original order. When no page
 * provides text, `full_text` is used.
 *
 * @param {object} fields - Extraction result fields.
 * @returns {string} Combined page text, `full_text`, or an empty string.
 */
function extractTextFromFields(fields) {
  if (!fields || typeof fields !== "object") {
    return "";
  }

  const PAGE_PREFIX = "page_";
  const pageNumberOf = (key) => {
    const parsed = Number.parseInt(key.slice(PAGE_PREFIX.length), 10);
    return Number.isNaN(parsed) ? Number.POSITIVE_INFINITY : parsed;
  };

  const pageTexts = Object.keys(fields)
    .filter((key) => key.startsWith(PAGE_PREFIX))
    // Array.prototype.sort is stable, so equal page numbers keep their key order.
    .sort((a, b) => {
      const left = pageNumberOf(a);
      const right = pageNumberOf(b);
      if (left === right) return 0;
      return left < right ? -1 : 1;
    })
    .map((key) => fields[key]?.text)
    .filter((text) => text);

  if (pageTexts.length > 0) {
    return pageTexts.join("\n\n");
  }

  // Fallback to full_text.
  if (fields.full_text) {
    return fields.full_text;
  }
  return "";
}
/**
 * Produce the plain-text representation of an extraction result.
 *
 * When the result carries document text (page texts or `full_text`, as found
 * by `extractTextFromFields`), that text is returned unchanged. Otherwise all
 * fields are rendered as an indented `key: value` outline: nested objects add
 * one level of indentation and array items are listed as `- value` or
 * `Item <n>:` followed by the item's own fields.
 *
 * @param {object} fields - Extraction result fields.
 * @returns {string} Readable text; "No data extracted." when there is nothing to show.
 */
function fieldsToText(fields) {
  if (!fields || typeof fields !== "object") {
    return "No data extracted.";
  }

  const extractedText = extractTextFromFields(fields);
  if (extractedText) {
    return extractedText;
  }

  // Fallback: no document text available, so outline the structured fields.
  let text = "";
  const formatValue = (key, value, indent = "") => {
    if (Array.isArray(value)) {
      text += `${indent}${key}:\n`;
      value.forEach((item, idx) => {
        // typeof null === "object": null items are printed as scalars.
        if (typeof item === "object" && item !== null) {
          text += `${indent} Item ${idx + 1}:\n`;
          Object.entries(item).forEach(([k, v]) => formatValue(k, v, indent + " "));
        } else {
          text += `${indent} - ${item}\n`;
        }
      });
    } else if (typeof value === "object" && value !== null) {
      text += `${indent}${key}:\n`;
      Object.entries(value).forEach(([k, v]) => formatValue(k, v, indent + " "));
    } else {
      text += `${indent}${key}: ${value}\n`;
    }
  };

  Object.entries(fields).forEach(([key, value]) => {
    formatValue(key, value);
    text += "\n";
  });

  return text.trim() || "No data extracted.";
}
| export default function ExtractionOutput({ hasFile, isProcessing, isComplete, extractionResult, onNewUpload }) { | |
| const [activeTab, setActiveTab] = useState("json"); | |
| const [copied, setCopied] = useState(false); | |
| const [statusMessage, setStatusMessage] = useState("Preparing document..."); | |
| // Get fields from extraction result, default to empty object | |
| const fields = extractionResult?.fields || {}; | |
| const confidence = extractionResult?.confidence || 0; | |
| const fieldsExtracted = extractionResult?.fieldsExtracted || 0; | |
| const totalTime = extractionResult?.totalTime || 0; | |
| // Dynamic status messages that rotate during processing | |
| const statusMessages = [ | |
| "Preparing document...", | |
| "Converting pages to images...", | |
| "Visual Reasoning...", | |
| "Reading text from document...", | |
| "Identifying document structure...", | |
| "Extracting tables and data...", | |
| "Analyzing content...", | |
| "Processing pages...", | |
| "Organizing extracted information...", | |
| "Finalizing results...", | |
| ]; | |
| // Rotate status messages during processing | |
| const messageIndexRef = useRef(0); | |
| useEffect(() => { | |
| if (!isProcessing) { | |
| setStatusMessage("Analyzing document structure"); | |
| messageIndexRef.current = 0; | |
| return; | |
| } | |
| setStatusMessage(statusMessages[0]); | |
| messageIndexRef.current = 0; | |
| const interval = setInterval(() => { | |
| messageIndexRef.current = (messageIndexRef.current + 1) % statusMessages.length; | |
| setStatusMessage(statusMessages[messageIndexRef.current]); | |
| }, 2500); // Change message every 2.5 seconds | |
| return () => clearInterval(interval); | |
| }, [isProcessing]); | |
| // Initialize expanded sections based on available fields | |
| const [expandedSections, setExpandedSections] = useState(() => | |
| Object.keys(fields).slice(0, 5) // Expand first 5 sections by default | |
| ); | |
/**
 * Convert an HTML string to plain text while keeping a readable layout.
 *
 * Headings and paragraphs are separated by blank lines, `<br>` becomes a
 * newline, list items are prefixed with a bullet, and tables are rendered as
 * padded columns joined by " | " with a dashed separator under a header row.
 * Runs of three or more newlines are collapsed to two and the result is trimmed.
 *
 * The markup is parsed with `DOMParser` rather than by assigning `innerHTML`
 * on an element owned by the page's document, so parsing potentially
 * untrusted HTML cannot trigger resource loads or inline event handlers.
 *
 * @param {string} html - HTML source.
 * @returns {string} Formatted plain text; an empty string for falsy input.
 */
const htmlToFormattedText = (html) => {
  if (!html) return "";

  const parsed = new DOMParser().parseFromString(html, "text/html");

  const processNode = (node) => {
    if (node.nodeType === Node.TEXT_NODE) {
      return node.textContent;
    }
    if (node.nodeType !== Node.ELEMENT_NODE) {
      return "";
    }

    const tagName = node.tagName?.toLowerCase();
    const children = Array.from(node.childNodes);

    switch (tagName) {
      case "h1":
      case "h2":
        return "\n\n" + processChildren(children).trim() + "\n\n";
      case "h3":
        return "\n" + processChildren(children).trim() + "\n";
      case "p":
        return processChildren(children) + "\n\n";
      case "br":
        return "\n";
      case "table":
        return "\n" + processTable(node) + "\n\n";
      case "ul":
      case "ol":
        return "\n" + processList(node) + "\n\n";
      case "li":
        return " • " + processChildren(children).trim() + "\n";
      default:
        // Inline formatting (strong, em, sup, sub, ...) contributes only its text.
        return processChildren(children);
    }
  };

  const processChildren = (children) => children.map(processNode).join("");

  // Text of a table cell with all whitespace runs collapsed to single spaces.
  const cellText = (cell) =>
    processChildren(Array.from(cell.childNodes)).trim().replace(/\s+/g, " ");

  const processTable = (table) => {
    const allRows = Array.from(table.querySelectorAll("tr"));
    if (allRows.length === 0) return "";

    // Render every cell once; widths and output are both derived from this grid.
    const grid = allRows.map((row) => Array.from(row.querySelectorAll("td, th")).map(cellText));

    const columnCount = Math.max(...grid.map((cells) => cells.length));
    const columnWidths = new Array(columnCount).fill(0);
    grid.forEach((cells) => {
      cells.forEach((content, colIndex) => {
        // Every column is at least 10 characters wide.
        columnWidths[colIndex] = Math.max(columnWidths[colIndex] || 0, content.length, 10);
      });
    });

    let tableText = "";
    grid.forEach((cells, rowIndex) => {
      const paddedCells = cells.map((content, i) => content.padEnd(columnWidths[i] || 10));
      tableText += paddedCells.join(" | ") + "\n";
      // Dashed separator after a header row.
      if (rowIndex === 0 && allRows[rowIndex].querySelector("th")) {
        tableText += columnWidths.map((w) => "-".repeat(w)).join("-|-") + "\n";
      }
    });
    return tableText;
  };

  const processList = (list) => {
    const items = list.querySelectorAll("li");
    return Array.from(items)
      .map((item) => " • " + processChildren(Array.from(item.childNodes)).trim())
      .join("\n");
  };

  let text = processChildren(Array.from(parsed.body.childNodes));
  // Collapse excessive blank lines.
  text = text.replace(/\n{3,}/g, "\n\n");
  return text.trim();
};
| const handleCopy = () => { | |
| let content = ""; | |
| if (activeTab === "json") { | |
| const preparedFields = prepareFieldsForOutput(fields, "json"); | |
| content = JSON.stringify(preparedFields, null, 2); | |
| } else if (activeTab === "xml") { | |
| content = objectToXML(fields); | |
| } else { | |
| // For text tab, get the formatted HTML and convert to plain text with layout | |
| const textContent = extractTextFromFields(fields); | |
| const htmlContent = renderMarkdownToHTML(textContent); | |
| content = htmlToFormattedText(htmlContent); | |
| } | |
| navigator.clipboard.writeText(content); | |
| setCopied(true); | |
| setTimeout(() => setCopied(false), 2000); | |
| }; | |
| // Get prepared fields for display | |
| const preparedFields = React.useMemo(() => { | |
| return prepareFieldsForOutput(fields, "json"); | |
| }, [fields]); | |
| // Update expanded sections when fields change | |
| React.useEffect(() => { | |
| if (extractionResult?.fields) { | |
| setExpandedSections(Object.keys(extractionResult.fields).slice(0, 5)); | |
| } | |
| }, [extractionResult]); | |
| const toggleSection = (section) => { | |
| setExpandedSections((prev) => | |
| prev.includes(section) ? prev.filter((s) => s !== section) : [...prev, section] | |
| ); | |
| }; | |
| const renderValue = (value) => { | |
| if (typeof value === "number") { | |
| return <span className="text-amber-600">{value}</span>; | |
| } | |
| if (typeof value === "string") { | |
| return <span className="text-emerald-600">"{value}"</span>; | |
| } | |
| return String(value); | |
| }; | |
| const renderSection = (key, value, level = 0) => { | |
| const isExpanded = expandedSections.includes(key); | |
| const isObject = typeof value === "object" && value !== null; | |
| const isArray = Array.isArray(value); | |
| if (!isObject) { | |
| return ( | |
| <div | |
| key={key} | |
| className="flex items-start gap-2 py-1" | |
| style={{ paddingLeft: level * 16 }} | |
| > | |
| <span className="text-violet-500">"{key}"</span> | |
| <span className="text-slate-400">:</span> | |
| {renderValue(value)} | |
| </div> | |
| ); | |
| } | |
| return ( | |
| <div key={key}> | |
| <button | |
| onClick={() => toggleSection(key)} | |
| className="flex items-center gap-2 py-1 hover:bg-slate-50 w-full text-left rounded" | |
| style={{ paddingLeft: level * 16 }} | |
| > | |
| <ChevronDown | |
| className={cn( | |
| "h-3 w-3 text-slate-400 transition-transform", | |
| !isExpanded && "-rotate-90" | |
| )} | |
| /> | |
| <span className="text-violet-500">"{key}"</span> | |
| <span className="text-slate-400">:</span> | |
| <span className="text-slate-400">{isArray ? "[" : "{"}</span> | |
| {!isExpanded && ( | |
| <span className="text-slate-300 text-xs"> | |
| {isArray ? `${value.length} items` : `${Object.keys(value).length} fields`} | |
| </span> | |
| )} | |
| </button> | |
| <AnimatePresence> | |
| {isExpanded && ( | |
| <motion.div | |
| initial={{ height: 0, opacity: 0 }} | |
| animate={{ height: "auto", opacity: 1 }} | |
| exit={{ height: 0, opacity: 0 }} | |
| transition={{ duration: 0.2 }} | |
| className="overflow-hidden" | |
| > | |
| {isArray ? ( | |
| value.map((item, idx) => ( | |
| <div key={idx} className="border-l border-slate-100 ml-4"> | |
| {Object.entries(item).map(([k, v]) => renderSection(k, v, level + 2))} | |
| {idx < value.length - 1 && <div className="h-2" />} | |
| </div> | |
| )) | |
| ) : ( | |
| Object.entries(value).map(([k, v]) => renderSection(k, v, level + 1)) | |
| )} | |
| <div style={{ paddingLeft: level * 16 }} className="text-slate-400"> | |
| {isArray ? "]" : "}"} | |
| </div> | |
| </motion.div> | |
| )} | |
| </AnimatePresence> | |
| </div> | |
| ); | |
| }; | |
| return ( | |
| <div className="h-full flex flex-col bg-white rounded-2xl border border-slate-200 overflow-hidden"> | |
| {/* Header */} | |
| <div className="flex items-center justify-between px-5 py-4 border-b border-slate-100"> | |
| <div className="flex items-center gap-3"> | |
| <div className="h-8 w-8 rounded-lg bg-emerald-50 flex items-center justify-center"> | |
| <Code2 className="h-4 w-4 text-emerald-600" /> | |
| </div> | |
| <div> | |
| <h3 className="font-semibold text-slate-800 text-sm">Extracted Data</h3> | |
| <p className="text-xs text-slate-400"> | |
| {isComplete | |
| ? `${fieldsExtracted} field${fieldsExtracted !== 1 ? 's' : ''} extracted` | |
| : "Waiting for extraction"} | |
| </p> | |
| </div> | |
| {isComplete && onNewUpload && ( | |
| <Button | |
| variant="ghost" | |
| size="sm" | |
| onClick={onNewUpload} | |
| className="h-8 ml-auto text-xs gap-1.5 text-indigo-600 hover:text-indigo-700 hover:bg-indigo-50" | |
| title="Upload new document" | |
| > | |
| <Upload className="h-3.5 w-3.5" /> | |
| New | |
| </Button> | |
| )} | |
| </div> | |
| {isComplete && ( | |
| <div className="flex items-center gap-2"> | |
| <Tabs value={activeTab} onValueChange={setActiveTab}> | |
| <TabsList className="h-8 bg-slate-100 p-0.5"> | |
| <TabsTrigger value="text" className="h-7 text-xs gap-1.5"> | |
| <FileText className="h-3 w-3" /> | |
| Text | |
| </TabsTrigger> | |
| <TabsTrigger value="json" className="h-7 text-xs gap-1.5"> | |
| <Braces className="h-3 w-3" /> | |
| JSON | |
| </TabsTrigger> | |
| <TabsTrigger value="xml" className="h-7 text-xs gap-1.5"> | |
| <FileCode2 className="h-3 w-3" /> | |
| XML | |
| </TabsTrigger> | |
| </TabsList> | |
| </Tabs> | |
| <Button | |
| variant="ghost" | |
| size="sm" | |
| onClick={handleCopy} | |
| className="h-8 text-xs gap-1.5" | |
| > | |
| {copied ? ( | |
| <> | |
| <Check className="h-3 w-3 text-emerald-500" /> | |
| Copied | |
| </> | |
| ) : ( | |
| <> | |
| <Copy className="h-3 w-3" /> | |
| Copy | |
| </> | |
| )} | |
| </Button> | |
| </div> | |
| )} | |
| </div> | |
| {/* Output Area */} | |
| <div className="flex-1 overflow-auto"> | |
| {!hasFile ? ( | |
| <div className="h-full flex items-center justify-center p-6"> | |
| <div className="text-center"> | |
| <div className="h-20 w-20 mx-auto rounded-2xl bg-slate-100 flex items-center justify-center mb-4"> | |
| <Code2 className="h-10 w-10 text-slate-300" /> | |
| </div> | |
| <p className="text-slate-400 text-sm">Extracted data will appear here</p> | |
| </div> | |
| </div> | |
| ) : isProcessing ? ( | |
| <div className="h-full flex items-center justify-center p-6"> | |
| <div className="text-center"> | |
| <motion.div | |
| animate={{ rotate: 360 }} | |
| transition={{ duration: 2, repeat: Infinity, ease: "linear" }} | |
| className="h-16 w-16 mx-auto rounded-2xl bg-gradient-to-br from-indigo-100 to-violet-100 flex items-center justify-center mb-4" | |
| > | |
| <Sparkles className="h-8 w-8 text-indigo-500" /> | |
| </motion.div> | |
| <p className="text-slate-700 font-medium mb-1">Extracting data...</p> | |
| <p className="text-slate-400 text-sm">{statusMessage}</p> | |
| <div className="mt-6 flex items-center justify-center gap-1"> | |
| {[0, 1, 2].map((i) => ( | |
| <motion.div | |
| key={i} | |
| animate={{ scale: [1, 1.2, 1] }} | |
| transition={{ | |
| duration: 0.6, | |
| repeat: Infinity, | |
| delay: i * 0.2, | |
| }} | |
| className="h-2 w-2 rounded-full bg-indigo-400" | |
| /> | |
| ))} | |
| </div> | |
| </div> | |
| </div> | |
| ) : isComplete && Object.keys(fields).length === 0 ? ( | |
| <div className="h-full flex items-center justify-center p-6"> | |
| <div className="text-center"> | |
| <div className="h-20 w-20 mx-auto rounded-2xl bg-amber-100 flex items-center justify-center mb-4"> | |
| <Code2 className="h-10 w-10 text-amber-600" /> | |
| </div> | |
| <p className="text-slate-600 font-medium mb-1">No data extracted</p> | |
| <p className="text-slate-400 text-sm">The document may not contain extractable fields</p> | |
| </div> | |
| </div> | |
| ) : ( | |
| <div className="p-4 font-mono text-sm"> | |
| {activeTab === "text" ? ( | |
| <div | |
| className="text-sm text-slate-700 leading-relaxed" | |
| style={{ | |
| fontFamily: 'system-ui, -apple-system, sans-serif' | |
| }} | |
| > | |
| <div | |
| className="markdown-content" | |
| dangerouslySetInnerHTML={{ __html: renderMarkdownToHTML(fieldsToText(fields)) }} | |
| style={{ | |
| lineHeight: '1.6' | |
| }} | |
| /> | |
| <style>{` | |
| .markdown-content h1 { | |
| font-size: 1.5rem; | |
| font-weight: 700; | |
| color: #0f172a; | |
| margin-top: 1.5rem; | |
| margin-bottom: 1rem; | |
| line-height: 1.3; | |
| } | |
| .markdown-content h2 { | |
| font-size: 1.25rem; | |
| font-weight: 600; | |
| color: #0f172a; | |
| margin-top: 1.25rem; | |
| margin-bottom: 0.75rem; | |
| line-height: 1.3; | |
| } | |
| .markdown-content h3 { | |
| font-size: 1.125rem; | |
| font-weight: 600; | |
| color: #1e293b; | |
| margin-top: 1rem; | |
| margin-bottom: 0.5rem; | |
| line-height: 1.3; | |
| } | |
| .markdown-content p { | |
| margin-top: 0.75rem; | |
| margin-bottom: 0.75rem; | |
| color: #334155; | |
| } | |
| .markdown-content table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| margin: 1.5rem 0; | |
| font-size: 0.875rem; | |
| box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.1); | |
| } | |
| .markdown-content table caption { | |
| font-weight: 600; | |
| margin-bottom: 0.5rem; | |
| text-align: left; | |
| } | |
| .markdown-content table th { | |
| background-color: #f8fafc; | |
| border: 1px solid #cbd5e1; | |
| padding: 0.75rem; | |
| text-align: left; | |
| font-weight: 600; | |
| color: #0f172a; | |
| } | |
| .markdown-content table td { | |
| border: 1px solid #cbd5e1; | |
| padding: 0.75rem; | |
| color: #334155; | |
| } | |
| .markdown-content table tr:nth-child(even) { | |
| background-color: #f8fafc; | |
| } | |
| .markdown-content table tr:hover { | |
| background-color: #f1f5f9; | |
| } | |
| .markdown-content strong { | |
| font-weight: 600; | |
| color: #0f172a; | |
| } | |
| .markdown-content em { | |
| font-style: italic; | |
| } | |
| .markdown-content a { | |
| color: #4f46e5; | |
| text-decoration: underline; | |
| } | |
| .markdown-content a:hover { | |
| color: #4338ca; | |
| } | |
| .markdown-content sup { | |
| font-size: 0.75em; | |
| vertical-align: super; | |
| line-height: 0; | |
| position: relative; | |
| top: -0.5em; | |
| } | |
| .markdown-content sub { | |
| font-size: 0.75em; | |
| vertical-align: sub; | |
| line-height: 0; | |
| position: relative; | |
| bottom: -0.25em; | |
| } | |
| .markdown-content ul, .markdown-content ol { | |
| margin: 0.75rem 0; | |
| padding-left: 1.5rem; | |
| } | |
| .markdown-content li { | |
| margin: 0.25rem 0; | |
| } | |
| `}</style> | |
| </div> | |
| ) : activeTab === "json" ? ( | |
| <div className="space-y-1"> | |
| <span className="text-slate-400">{"{"}</span> | |
| {Object.keys(preparedFields).length > 0 ? ( | |
| Object.entries(preparedFields).map(([key, value]) => | |
| renderSection(key, value, 1) | |
| ) | |
| ) : ( | |
| <div className="pl-4 text-slate-400 italic">No fields extracted</div> | |
| )} | |
| <span className="text-slate-400">{"}"}</span> | |
| </div> | |
| ) : ( | |
| <pre className="text-sm text-slate-600 whitespace-pre-wrap"> | |
| {objectToXML(fields).split("\n").map((line, i) => ( | |
| <div key={i} className="hover:bg-slate-50 px-2 -mx-2 rounded"> | |
| {line.includes("<") ? ( | |
| <> | |
| {line.split(/(<\/?[\w\s=".-]+>)/g).map((part, j) => { | |
| if (part.startsWith("</")) { | |
| return ( | |
| <span key={j} className="text-rose-500"> | |
| {part} | |
| </span> | |
| ); | |
| } | |
| if (part.startsWith("<")) { | |
| return ( | |
| <span key={j} className="text-indigo-500"> | |
| {part} | |
| </span> | |
| ); | |
| } | |
| return ( | |
| <span key={j} className="text-slate-700"> | |
| {part} | |
| </span> | |
| ); | |
| })} | |
| </> | |
| ) : ( | |
| line | |
| )} | |
| </div> | |
| ))} | |
| </pre> | |
| )} | |
| </div> | |
| )} | |
| </div> | |
| {/* Confidence Footer */} | |
| {isComplete && extractionResult && ( | |
| <div className="px-5 py-3 border-t border-slate-100 bg-slate-50/50"> | |
| <div className="flex items-center justify-between text-xs"> | |
| <div className="flex items-center gap-4"> | |
| <div className="flex items-center gap-1.5"> | |
| <div className={cn( | |
| "h-2 w-2 rounded-full", | |
| confidence >= 90 ? "bg-emerald-500" : confidence >= 70 ? "bg-amber-500" : "bg-red-500" | |
| )} /> | |
| <span className="text-slate-500">Confidence:</span> | |
| <span className="font-semibold text-slate-700"> | |
| {confidence > 0 ? `${confidence.toFixed(1)}%` : "N/A"} | |
| </span> | |
| </div> | |
| <div className="flex items-center gap-1.5"> | |
| <span className="text-slate-500">Fields:</span> | |
| <span className="font-semibold text-slate-700">{fieldsExtracted}</span> | |
| </div> | |
| </div> | |
| <span className="text-slate-400"> | |
| Processed in {totalTime >= 1000 ? `${(totalTime / 1000).toFixed(1)}s` : `${totalTime}ms`} | |
| </span> | |
| </div> | |
| </div> | |
| )} | |
| </div> | |
| ); | |
| } | |