Spaces:
Running
Running
| "use client"; | |
| import ReactMarkdown from 'react-markdown'; | |
| import remarkGfm from 'remark-gfm'; | |
| import React from 'react'; | |
// Visual styling for each dataset tag, keyed by tag name. Each entry holds a
// translucent background (`bg`, the border colour with a "30" alpha suffix),
// a `border` colour, a `text` colour and a human-readable `label`.
const TAG_COLORS = Object.fromEntries(
  [
    // [tag, border colour, text colour, label]
    ['named', '#10b981', '#34d399', 'Named Dataset'],
    ['descriptive', '#f59e0b', '#fbbf24', 'Descriptive'],
    ['vague', '#a78bfa', '#c4b5fd', 'Vague'],
    ['non-dataset', '#64748b', '#94a3b8', 'Non-Dataset'],
  ].map(([tag, border, text, label]) => [tag, { bg: `${border}30`, border, text, label }]),
);
/**
 * Highlights all dataset mentions within a piece of plain text.
 *
 * Matching is case-insensitive and tries longer names first, so a name that
 * contains another name is matched as a whole.
 *
 * When at least one mention is found, the result is an HTML string: each
 * mention is wrapped in a styled <mark> element and every other character is
 * HTML-escaped, so the string can be injected as innerHTML without the source
 * text being interpreted as markup. When nothing is highlighted, the input is
 * returned unchanged (not escaped), so callers can detect "no change" with a
 * strict equality check.
 *
 * @param {string} text - Plain text to scan.
 * @param {Array<{dataset_name?: {text?: string}, dataset_tag?: string}>} datasets
 *   Dataset mentions; a missing `dataset_tag` is treated as 'non-dataset'.
 * @returns {string} The original text, or an HTML string with <mark> wrappers.
 */
function highlightDatasets(text, datasets) {
  if (!datasets || datasets.length === 0 || !text) return text;

  const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

  // Longest names first so the regex alternation prefers the longest match.
  const mentions = datasets
    .filter((ds) => typeof ds?.dataset_name?.text === 'string' && ds.dataset_name.text !== '')
    .map((ds) => ({
      name: ds.dataset_name.text,
      tag: ds.dataset_tag || 'non-dataset',
    }))
    .sort((a, b) => b.name.length - a.name.length);

  // Deduplicate case-insensitively (the regex is case-insensitive too); the
  // first occurrence decides the tag. A Map avoids accidental hits on
  // inherited Object properties such as "constructor".
  const tagByName = new Map();
  const names = [];
  for (const { name, tag } of mentions) {
    const key = name.toLowerCase();
    if (tagByName.has(key)) continue;
    tagByName.set(key, tag);
    names.push(name);
  }
  if (names.length === 0) return text;

  // Build a regex that matches any of the dataset names literally.
  const alternatives = names.map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  const pattern = new RegExp(`(${alternatives.join('|')})`, 'gi');

  // With a single capture group, split() alternates between non-matching
  // segments (even indices) and matched mentions (odd indices).
  const parts = text.split(pattern);
  if (parts.length === 1) return text;

  return parts
    .map((part, i) => {
      const safePart = escapeHtml(part);
      if (i % 2 === 0) return safePart;

      const tag = tagByName.get(part.toLowerCase());
      // Can only miss when regex case folding and toLowerCase() disagree.
      if (tag === undefined) return safePart;

      const colors = Object.prototype.hasOwnProperty.call(TAG_COLORS, tag)
        ? TAG_COLORS[tag]
        : TAG_COLORS['non-dataset'];
      return `<mark data-tag="${escapeHtml(tag)}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${colors.label}] ${safePart}">${safePart}</mark>`;
    })
    .join('');
}
| export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) { | |
| const handleAnnotateClick = () => { | |
| const selection = window.getSelection(); | |
| if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) { | |
| const text = selection.toString().trim(); | |
| // Compute the character offset of the selection start within the | |
| // .markdown-preview container. This lets us disambiguate when the | |
| // same text appears multiple times on the page. | |
| let selectionOffset = 0; | |
| const container = document.querySelector('.markdown-preview'); | |
| if (container) { | |
| try { | |
| const range = selection.getRangeAt(0); | |
| const preCaretRange = document.createRange(); | |
| preCaretRange.setStart(container, 0); | |
| preCaretRange.setEnd(range.startContainer, range.startOffset); | |
| selectionOffset = preCaretRange.toString().length; | |
| } catch (e) { | |
| // Fallback: offset 0 (will just use first occurrence) | |
| selectionOffset = 0; | |
| } | |
| } | |
| onAnnotate(text, selectionOffset); | |
| } else { | |
| const btn = document.getElementById('annotate-btn'); | |
| if (btn) { | |
| btn.classList.add('shake'); | |
| setTimeout(() => btn.classList.remove('shake'), 500); | |
| } | |
| } | |
| }; | |
| // Filter out consensus non-datasets (model + judge both agree) | |
| const datasets = (currentPageData?.datasets || []).filter(ds => { | |
| if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false; | |
| return true; | |
| }); | |
| const rawText = currentPageData?.input_text || ""; | |
| const highlightedText = highlightDatasets(rawText, datasets); | |
| // Recursive helper: processes children at any depth so text inside | |
| // <strong>, <em>, <a>, etc. also gets highlighted. | |
| const processChildren = (children) => | |
| React.Children.map(children, child => { | |
| if (typeof child === 'string') { | |
| const highlighted = highlightDatasets(child, datasets); | |
| if (highlighted !== child) { | |
| return <span dangerouslySetInnerHTML={{ __html: highlighted }} />; | |
| } | |
| return child; | |
| } | |
| // If it's a React element with children, recurse into it | |
| if (React.isValidElement(child) && child.props?.children) { | |
| return React.cloneElement(child, {}, processChildren(child.props.children)); | |
| } | |
| return child; | |
| }); | |
| // Build component overrides for all block-level and inline elements | |
| const highlightWrapper = (Tag) => ({ children, ...props }) => ( | |
| <Tag {...props}>{processChildren(children)}</Tag> | |
| ); | |
| const highlightComponents = { | |
| p: highlightWrapper('p'), | |
| li: highlightWrapper('li'), | |
| td: highlightWrapper('td'), | |
| th: highlightWrapper('th'), | |
| h1: highlightWrapper('h1'), | |
| h2: highlightWrapper('h2'), | |
| h3: highlightWrapper('h3'), | |
| h4: highlightWrapper('h4'), | |
| h5: highlightWrapper('h5'), | |
| h6: highlightWrapper('h6'), | |
| blockquote: highlightWrapper('blockquote'), | |
| strong: highlightWrapper('strong'), | |
| em: highlightWrapper('em'), | |
| }; | |
| return ( | |
| <div className="annotator-container"> | |
| <div className="annotator-header"> | |
| <h2>Markdown Annotation</h2> | |
| <button | |
| id="annotate-btn" | |
| onClick={handleAnnotateClick} | |
| className="btn btn-primary" | |
| title="Select text below, then click to annotate" | |
| > | |
| ✍️ Annotate Selection | |
| </button> | |
| </div> | |
| {/* Dataset legend */} | |
| {datasets.length > 0 && ( | |
| <div className="dataset-legend"> | |
| {Object.entries(TAG_COLORS).map(([tag, colors]) => { | |
| const count = datasets.filter(ds => ds.dataset_tag === tag).length; | |
| if (count === 0) return null; | |
| return ( | |
| <span key={tag} className="legend-item" style={{ borderColor: colors.border }}> | |
| <span className="legend-dot" style={{ backgroundColor: colors.border }} /> | |
| {colors.label} ({count}) | |
| </span> | |
| ); | |
| })} | |
| </div> | |
| )} | |
| <div className="markdown-content"> | |
| <div className="markdown-content-header"> | |
| <h3>Doc {selectedDocIndex}, Page {selectedPage}</h3> | |
| {datasets.length > 0 && ( | |
| <span className="dataset-count"> | |
| {datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected | |
| </span> | |
| )} | |
| </div> | |
| {loadingPage ? ( | |
| <div className="loading-spinner-container"> | |
| <div className="loading-spinner" /> | |
| <p>Loading page data...</p> | |
| </div> | |
| ) : currentPageData ? ( | |
| <div className="markdown-preview"> | |
| <ReactMarkdown | |
| remarkPlugins={[remarkGfm]} | |
| components={highlightComponents} | |
| > | |
| {rawText || "No text available."} | |
| </ReactMarkdown> | |
| </div> | |
| ) : ( | |
| <p className="text-muted">Select a document and page to view extracted text.</p> | |
| )} | |
| </div> | |
| </div> | |
| ); | |
| } | |