Spaces:

ai4data
/

data-use-annotation

Running

App Files Files Community

data-use-annotation / app /components /MarkdownAnnotator.js

rafmacalaba's picture

fix: extend highlighting to headings, strong, em, blockquote

58e23bb 20 days ago

history blame contribute delete

8.46 kB

	"use client";

	import ReactMarkdown from 'react-markdown';
	import remarkGfm from 'remark-gfm';
	import React from 'react';

	// Palette used to render each dataset tag. Per tag:
	//   bg     - translucent fill (the accent colour with a "30" alpha suffix)
	//   border - accent colour (underline of a highlight, legend dot/border)
	//   text   - foreground colour of highlighted text
	//   label  - human-readable name shown in the legend and in tooltips
	// Insertion order is significant: the legend iterates the entries in order.
	const TAG_COLORS = Object.fromEntries(
	  [
	    ['named', '#10b981', '#34d399', 'Named Dataset'],
	    ['descriptive', '#f59e0b', '#fbbf24', 'Descriptive'],
	    ['vague', '#a78bfa', '#c4b5fd', 'Vague'],
	    ['non-dataset', '#64748b', '#94a3b8', 'Non-Dataset'],
	  ].map(([tag, border, text, label]) => [tag, { bg: `${border}30`, border, text, label }]),
	);

	// Characters that must be replaced before text is placed into HTML content
	// or into a double-quoted attribute value.
	const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };

	/** Escapes `value` so it is inert when inserted into HTML text or attributes. */
	function escapeHtml(value) {
	  return String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
	}

	/**
	 * Wraps every dataset mention found in `text` in a styled <mark> element.
	 *
	 * Matching is case-insensitive. Longer names are tried first so that a short
	 * name never splits a longer one that contains it. Names differing only in
	 * letter case are treated as the same mention; the first one (after the
	 * longest-first ordering) decides the tag.
	 *
	 * @param {string} text - Plain text to scan.
	 * @param {Array<object>} datasets - Entries whose `dataset_name.text` is the
	 *   mention to look for and whose `dataset_tag` selects the colours
	 *   (a missing tag falls back to 'non-dataset').
	 * @returns {string} `text` itself, untouched, when there is nothing to
	 *   highlight (callers compare with `!==` to detect this). Otherwise an HTML
	 *   string in which each mention is wrapped in <mark> and ALL text, matched
	 *   or not, is HTML-escaped, so the result is safe to assign to innerHTML.
	 */
	function highlightDatasets(text, datasets) {
	  if (!datasets || datasets.length === 0 || !text) return text;

	  // Longest names first so the regex alternation prefers the longest match.
	  const mentions = datasets
	    .filter((ds) => ds?.dataset_name?.text)
	    .map((ds) => ({
	      name: ds.dataset_name.text,
	      tag: ds.dataset_tag || 'non-dataset',
	    }))
	    .sort((a, b) => b.name.length - a.name.length);

	  // A Map (not a plain object) so that words such as "constructor" can never
	  // resolve to an inherited Object.prototype member.
	  const tagByName = new Map();
	  const names = [];
	  for (const { name, tag } of mentions) {
	    const key = name.toLowerCase();
	    if (tagByName.has(key)) continue;
	    tagByName.set(key, tag);
	    names.push(name);
	  }

	  if (names.length === 0) return text;

	  // One alternation with a single capture group; regex metacharacters in the
	  // names are escaped so they match literally.
	  const escapedNames = names.map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
	  const pattern = new RegExp(`(${escapedNames.join('|')})`, 'gi');

	  // With exactly one capture group, split() alternates plain text (even
	  // indices) and matched mentions (odd indices).
	  const parts = text.split(pattern);
	  if (parts.length === 1) return text; // no mention present

	  return parts
	    .map((part, index) => {
	      const safePart = escapeHtml(part);
	      if (index % 2 === 0) return safePart;

	      const tag = tagByName.get(part.toLowerCase());
	      // Can only happen if the regex's case folding disagrees with toLowerCase().
	      if (tag === undefined) return safePart;

	      const colors = Object.prototype.hasOwnProperty.call(TAG_COLORS, tag)
	        ? TAG_COLORS[tag]
	        : TAG_COLORS['non-dataset'];
	      return `<mark data-tag="${escapeHtml(tag)}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${colors.label}] ${safePart}">${safePart}</mark>`;
	    })
	    .join('');
	}

	export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) {
	const handleAnnotateClick = () => {
	const selection = window.getSelection();
	if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) {
	const text = selection.toString().trim();

	// Compute the character offset of the selection start within the
	// .markdown-preview container. This lets us disambiguate when the
	// same text appears multiple times on the page.
	let selectionOffset = 0;
	const container = document.querySelector('.markdown-preview');
	if (container) {
	try {
	const range = selection.getRangeAt(0);
	const preCaretRange = document.createRange();
	preCaretRange.setStart(container, 0);
	preCaretRange.setEnd(range.startContainer, range.startOffset);
	selectionOffset = preCaretRange.toString().length;
	} catch (e) {
	// Fallback: offset 0 (will just use first occurrence)
	selectionOffset = 0;
	}
	}

	onAnnotate(text, selectionOffset);
	} else {
	const btn = document.getElementById('annotate-btn');
	if (btn) {
	btn.classList.add('shake');
	setTimeout(() => btn.classList.remove('shake'), 500);
	}
	}
	};

	// Filter out consensus non-datasets (model + judge both agree)
	const datasets = (currentPageData?.datasets \|\| []).filter(ds => {
	if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false;
	return true;
	});
	const rawText = currentPageData?.input_text \|\| "";
	const highlightedText = highlightDatasets(rawText, datasets);

	// Recursive helper: processes children at any depth so text inside
	// <strong>, <em>, <a>, etc. also gets highlighted.
	const processChildren = (children) =>
	React.Children.map(children, child => {
	if (typeof child === 'string') {
	const highlighted = highlightDatasets(child, datasets);
	if (highlighted !== child) {
	return <span dangerouslySetInnerHTML={{ __html: highlighted }} />;
	}
	return child;
	}
	// If it's a React element with children, recurse into it
	if (React.isValidElement(child) && child.props?.children) {
	return React.cloneElement(child, {}, processChildren(child.props.children));
	}
	return child;
	});

	// Build component overrides for all block-level and inline elements
	const highlightWrapper = (Tag) => ({ children, ...props }) => (
	<Tag {...props}>{processChildren(children)}</Tag>
	);

	const highlightComponents = {
	p: highlightWrapper('p'),
	li: highlightWrapper('li'),
	td: highlightWrapper('td'),
	th: highlightWrapper('th'),
	h1: highlightWrapper('h1'),
	h2: highlightWrapper('h2'),
	h3: highlightWrapper('h3'),
	h4: highlightWrapper('h4'),
	h5: highlightWrapper('h5'),
	h6: highlightWrapper('h6'),
	blockquote: highlightWrapper('blockquote'),
	strong: highlightWrapper('strong'),
	em: highlightWrapper('em'),
	};

	return (
	<div className="annotator-container">
	<div className="annotator-header">
	<h2>Markdown Annotation</h2>
	<button
	id="annotate-btn"
	onClick={handleAnnotateClick}
	className="btn btn-primary"
	title="Select text below, then click to annotate"
	>
	✍️ Annotate Selection
	</button>
	</div>

	{/* Dataset legend */}
	{datasets.length > 0 && (
	<div className="dataset-legend">
	{Object.entries(TAG_COLORS).map(([tag, colors]) => {
	const count = datasets.filter(ds => ds.dataset_tag === tag).length;
	if (count === 0) return null;
	return (
	<span key={tag} className="legend-item" style={{ borderColor: colors.border }}>
	<span className="legend-dot" style={{ backgroundColor: colors.border }} />
	{colors.label} ({count})
	</span>
	);
	})}
	</div>
	)}

	<div className="markdown-content">
	<div className="markdown-content-header">
	<h3>Doc {selectedDocIndex}, Page {selectedPage}</h3>
	{datasets.length > 0 && (
	<span className="dataset-count">
	{datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected
	</span>
	)}
	</div>

	{loadingPage ? (
	<div className="loading-spinner-container">
	<div className="loading-spinner" />
	<p>Loading page data...</p>
	</div>
	) : currentPageData ? (
	<div className="markdown-preview">
	<ReactMarkdown
	remarkPlugins={[remarkGfm]}
	components={highlightComponents}
	>
	{rawText \|\| "No text available."}
	</ReactMarkdown>
	</div>
	) : (
	<p className="text-muted">Select a document and page to view extracted text.</p>
	)}
	</div>
	</div>
	);
	}