// data-use-annotation/app/components/MarkdownAnnotator.js
// rafmacalaba's picture
// fix: extend highlighting to headings, strong, em, blockquote
// 58e23bb
"use client";
import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import React from 'react';
// Color mapping for dataset tags
const TAG_COLORS = {
named: { bg: '#10b98130', border: '#10b981', text: '#34d399', label: 'Named Dataset' },
descriptive: { bg: '#f59e0b30', border: '#f59e0b', text: '#fbbf24', label: 'Descriptive' },
vague: { bg: '#a78bfa30', border: '#a78bfa', text: '#c4b5fd', label: 'Vague' },
'non-dataset': { bg: '#64748b30', border: '#64748b', text: '#94a3b8', label: 'Non-Dataset' },
};
/**
* Highlights all dataset mentions within the markdown text.
* Returns the text with <mark> tags wrapping each dataset name occurrence.
*/
function highlightDatasets(text, datasets) {
if (!datasets || datasets.length === 0 || !text) return text;
// Build a list of {name, tag} sorted by name length descending (longest first to avoid partial overlaps)
const mentions = datasets
.filter(ds => ds.dataset_name?.text)
.map(ds => ({
name: ds.dataset_name.text,
tag: ds.dataset_tag || 'non-dataset',
}))
.sort((a, b) => b.name.length - a.name.length);
// Deduplicate by name
const seen = new Set();
const uniqueMentions = mentions.filter(m => {
if (seen.has(m.name)) return false;
seen.add(m.name);
return true;
});
if (uniqueMentions.length === 0) return text;
// Build regex that matches any of the dataset names
const escaped = uniqueMentions.map(m => m.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
const pattern = new RegExp(`(${escaped.join('|')})`, 'gi');
// Create a lookup map for fast tag resolution
const nameToTag = {};
uniqueMentions.forEach(m => { nameToTag[m.name.toLowerCase()] = m.tag; });
// Split text by the pattern, preserving the matches
const parts = text.split(pattern);
return parts.map((part, i) => {
const tag = nameToTag[part.toLowerCase()];
if (tag) {
const colors = TAG_COLORS[tag] || TAG_COLORS['non-dataset'];
return `<mark data-tag="${tag}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${colors.label}] ${part}">${part}</mark>`;
}
return part;
}).join('');
}
export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) {
const handleAnnotateClick = () => {
const selection = window.getSelection();
if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) {
const text = selection.toString().trim();
// Compute the character offset of the selection start within the
// .markdown-preview container. This lets us disambiguate when the
// same text appears multiple times on the page.
let selectionOffset = 0;
const container = document.querySelector('.markdown-preview');
if (container) {
try {
const range = selection.getRangeAt(0);
const preCaretRange = document.createRange();
preCaretRange.setStart(container, 0);
preCaretRange.setEnd(range.startContainer, range.startOffset);
selectionOffset = preCaretRange.toString().length;
} catch (e) {
// Fallback: offset 0 (will just use first occurrence)
selectionOffset = 0;
}
}
onAnnotate(text, selectionOffset);
} else {
const btn = document.getElementById('annotate-btn');
if (btn) {
btn.classList.add('shake');
setTimeout(() => btn.classList.remove('shake'), 500);
}
}
};
// Filter out consensus non-datasets (model + judge both agree)
const datasets = (currentPageData?.datasets || []).filter(ds => {
if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false;
return true;
});
const rawText = currentPageData?.input_text || "";
const highlightedText = highlightDatasets(rawText, datasets);
// Recursive helper: processes children at any depth so text inside
// <strong>, <em>, <a>, etc. also gets highlighted.
const processChildren = (children) =>
React.Children.map(children, child => {
if (typeof child === 'string') {
const highlighted = highlightDatasets(child, datasets);
if (highlighted !== child) {
return <span dangerouslySetInnerHTML={{ __html: highlighted }} />;
}
return child;
}
// If it's a React element with children, recurse into it
if (React.isValidElement(child) && child.props?.children) {
return React.cloneElement(child, {}, processChildren(child.props.children));
}
return child;
});
// Build component overrides for all block-level and inline elements
const highlightWrapper = (Tag) => ({ children, ...props }) => (
<Tag {...props}>{processChildren(children)}</Tag>
);
const highlightComponents = {
p: highlightWrapper('p'),
li: highlightWrapper('li'),
td: highlightWrapper('td'),
th: highlightWrapper('th'),
h1: highlightWrapper('h1'),
h2: highlightWrapper('h2'),
h3: highlightWrapper('h3'),
h4: highlightWrapper('h4'),
h5: highlightWrapper('h5'),
h6: highlightWrapper('h6'),
blockquote: highlightWrapper('blockquote'),
strong: highlightWrapper('strong'),
em: highlightWrapper('em'),
};
return (
<div className="annotator-container">
<div className="annotator-header">
<h2>Markdown Annotation</h2>
<button
id="annotate-btn"
onClick={handleAnnotateClick}
className="btn btn-primary"
title="Select text below, then click to annotate"
>
✍️ Annotate Selection
</button>
</div>
{/* Dataset legend */}
{datasets.length > 0 && (
<div className="dataset-legend">
{Object.entries(TAG_COLORS).map(([tag, colors]) => {
const count = datasets.filter(ds => ds.dataset_tag === tag).length;
if (count === 0) return null;
return (
<span key={tag} className="legend-item" style={{ borderColor: colors.border }}>
<span className="legend-dot" style={{ backgroundColor: colors.border }} />
{colors.label} ({count})
</span>
);
})}
</div>
)}
<div className="markdown-content">
<div className="markdown-content-header">
<h3>Doc {selectedDocIndex}, Page {selectedPage}</h3>
{datasets.length > 0 && (
<span className="dataset-count">
{datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected
</span>
)}
</div>
{loadingPage ? (
<div className="loading-spinner-container">
<div className="loading-spinner" />
<p>Loading page data...</p>
</div>
) : currentPageData ? (
<div className="markdown-preview">
<ReactMarkdown
remarkPlugins={[remarkGfm]}
components={highlightComponents}
>
{rawText || "No text available."}
</ReactMarkdown>
</div>
) : (
<p className="text-muted">Select a document and page to view extracted text.</p>
)}
</div>
</div>
);
}