"use client"; import ReactMarkdown from 'react-markdown'; import remarkGfm from 'remark-gfm'; import React from 'react'; // Color mapping for dataset tags const TAG_COLORS = { named: { bg: '#10b98130', border: '#10b981', text: '#34d399', label: 'Named Dataset' }, descriptive: { bg: '#f59e0b30', border: '#f59e0b', text: '#fbbf24', label: 'Descriptive' }, vague: { bg: '#a78bfa30', border: '#a78bfa', text: '#c4b5fd', label: 'Vague' }, 'non-dataset': { bg: '#64748b30', border: '#64748b', text: '#94a3b8', label: 'Non-Dataset' }, }; /** * Highlights all dataset mentions within the markdown text. * Returns the text with tags wrapping each dataset name occurrence. */ function highlightDatasets(text, datasets) { if (!datasets || datasets.length === 0 || !text) return text; // Build a list of {name, tag} sorted by name length descending (longest first to avoid partial overlaps) const mentions = datasets .filter(ds => ds.dataset_name?.text) .map(ds => ({ name: ds.dataset_name.text, tag: ds.dataset_tag || 'non-dataset', })) .sort((a, b) => b.name.length - a.name.length); // Deduplicate by name const seen = new Set(); const uniqueMentions = mentions.filter(m => { if (seen.has(m.name)) return false; seen.add(m.name); return true; }); if (uniqueMentions.length === 0) return text; // Build regex that matches any of the dataset names const escaped = uniqueMentions.map(m => m.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')); const pattern = new RegExp(`(${escaped.join('|')})`, 'gi'); // Create a lookup map for fast tag resolution const nameToTag = {}; uniqueMentions.forEach(m => { nameToTag[m.name.toLowerCase()] = m.tag; }); // Split text by the pattern, preserving the matches const parts = text.split(pattern); return parts.map((part, i) => { const tag = nameToTag[part.toLowerCase()]; if (tag) { const colors = TAG_COLORS[tag] || TAG_COLORS['non-dataset']; return `${part}`; } return part; }).join(''); } export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) { const handleAnnotateClick = () => { const selection = window.getSelection(); if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) { const text = selection.toString().trim(); // Compute the character offset of the selection start within the // .markdown-preview container. This lets us disambiguate when the // same text appears multiple times on the page. let selectionOffset = 0; const container = document.querySelector('.markdown-preview'); if (container) { try { const range = selection.getRangeAt(0); const preCaretRange = document.createRange(); preCaretRange.setStart(container, 0); preCaretRange.setEnd(range.startContainer, range.startOffset); selectionOffset = preCaretRange.toString().length; } catch (e) { // Fallback: offset 0 (will just use first occurrence) selectionOffset = 0; } } onAnnotate(text, selectionOffset); } else { const btn = document.getElementById('annotate-btn'); if (btn) { btn.classList.add('shake'); setTimeout(() => btn.classList.remove('shake'), 500); } } }; // Filter out consensus non-datasets (model + judge both agree) const datasets = (currentPageData?.datasets || []).filter(ds => { if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false; return true; }); const rawText = currentPageData?.input_text || ""; const highlightedText = highlightDatasets(rawText, datasets); // Recursive helper: processes children at any depth so text inside // , , , etc. also gets highlighted. const processChildren = (children) => React.Children.map(children, child => { if (typeof child === 'string') { const highlighted = highlightDatasets(child, datasets); if (highlighted !== child) { return ; } return child; } // If it's a React element with children, recurse into it if (React.isValidElement(child) && child.props?.children) { return React.cloneElement(child, {}, processChildren(child.props.children)); } return child; }); // Build component overrides for all block-level and inline elements const highlightWrapper = (Tag) => ({ children, ...props }) => ( {processChildren(children)} ); const highlightComponents = { p: highlightWrapper('p'), li: highlightWrapper('li'), td: highlightWrapper('td'), th: highlightWrapper('th'), h1: highlightWrapper('h1'), h2: highlightWrapper('h2'), h3: highlightWrapper('h3'), h4: highlightWrapper('h4'), h5: highlightWrapper('h5'), h6: highlightWrapper('h6'), blockquote: highlightWrapper('blockquote'), strong: highlightWrapper('strong'), em: highlightWrapper('em'), }; return (

Markdown Annotation

{/* Dataset legend */} {datasets.length > 0 && (
{Object.entries(TAG_COLORS).map(([tag, colors]) => { const count = datasets.filter(ds => ds.dataset_tag === tag).length; if (count === 0) return null; return ( {colors.label} ({count}) ); })}
)}

Doc {selectedDocIndex}, Page {selectedPage}

{datasets.length > 0 && ( {datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected )}
{loadingPage ? (

Loading page data...

) : currentPageData ? (
{rawText || "No text available."}
) : (

Select a document and page to view extracted text.

)}
); }