Spaces:
Running
Running
File size: 8,458 Bytes
da957b0 d08736d da957b0 58e23bb da957b0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | "use client";
import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import React from 'react';
// Visual style for each dataset tag, keyed by the tag value:
//   bg     - translucent highlight background
//   border - solid accent color (underline / legend dot / legend border)
//   text   - foreground color used for the highlighted text
//   label  - human-readable name shown in tooltips and the legend
const TAG_COLORS = {
  named: {
    bg: '#10b98130',
    border: '#10b981',
    text: '#34d399',
    label: 'Named Dataset',
  },
  descriptive: {
    bg: '#f59e0b30',
    border: '#f59e0b',
    text: '#fbbf24',
    label: 'Descriptive',
  },
  vague: {
    bg: '#a78bfa30',
    border: '#a78bfa',
    text: '#c4b5fd',
    label: 'Vague',
  },
  'non-dataset': {
    bg: '#64748b30',
    border: '#64748b',
    text: '#94a3b8',
    label: 'Non-Dataset',
  },
};
/**
 * Wraps every dataset mention found in `text` in a styled <mark> element and
 * returns the result as an HTML string.
 *
 * Matching is case-insensitive and prefers longer names, so a short name that
 * is a substring of a longer one does not split the longer mention.
 *
 * The result is intended to be injected as raw HTML. Whenever markup is
 * produced, every fragment (matched or not) is HTML-escaped first so that
 * characters such as `<`, `&` or `"` in the page text cannot become live
 * markup or break out of an attribute. When nothing matches, `text` is
 * returned unchanged so callers can detect "no highlight" with `!==`.
 *
 * @param {string} text - Plain text to scan for mentions.
 * @param {Array<{dataset_name?: {text?: string}, dataset_tag?: string}>} datasets
 *   Mentions to look for; entries without a non-empty `dataset_name.text` are
 *   ignored and a missing `dataset_tag` is treated as 'non-dataset'.
 * @returns {string} `text` itself when there is nothing to highlight,
 *   otherwise an HTML string with each mention wrapped in <mark>.
 */
function highlightDatasets(text, datasets) {
  if (!datasets || datasets.length === 0 || !text || typeof text !== 'string') return text;

  const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);

  // Longest names first: regex alternation is ordered, so this makes the
  // longest candidate win at any given position.
  const mentions = datasets
    .filter((ds) => typeof ds?.dataset_name?.text === 'string' && ds.dataset_name.text.length > 0)
    .map((ds) => ({
      name: ds.dataset_name.text,
      tag: ds.dataset_tag || 'non-dataset',
    }))
    .sort((a, b) => b.name.length - a.name.length);

  // Deduplicate by exact name, keeping the first occurrence.
  const seen = new Set();
  const uniqueMentions = mentions.filter((m) => {
    if (seen.has(m.name)) return false;
    seen.add(m.name);
    return true;
  });
  if (uniqueMentions.length === 0) return text;

  // Escape regex metacharacters so names are matched literally.
  const escaped = uniqueMentions.map((m) => m.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  const pattern = new RegExp(`(${escaped.join('|')})`, 'gi');

  // A Map (not a plain object) so that text such as "constructor" or
  // "__proto__" can never resolve to an inherited Object.prototype member.
  const tagByLowerName = new Map();
  uniqueMentions.forEach((m) => {
    tagByLowerName.set(m.name.toLowerCase(), m.tag);
  });

  // Splitting on a single capturing group yields
  // [gap, match, gap, match, ..., gap]: matches sit at the odd indices.
  const parts = text.split(pattern);
  if (parts.length === 1) return text; // no mention present

  return parts
    .map((part, i) => {
      const tag = i % 2 === 1 ? tagByLowerName.get(part.toLowerCase()) : undefined;
      if (!tag) return escapeHtml(part);

      const colors = Object.prototype.hasOwnProperty.call(TAG_COLORS, tag)
        ? TAG_COLORS[tag]
        : TAG_COLORS['non-dataset'];
      const safePart = escapeHtml(part);
      return `<mark data-tag="${escapeHtml(tag)}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${colors.label}] ${safePart}">${safePart}</mark>`;
    })
    .join('');
}
export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) {
const handleAnnotateClick = () => {
const selection = window.getSelection();
if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) {
const text = selection.toString().trim();
// Compute the character offset of the selection start within the
// .markdown-preview container. This lets us disambiguate when the
// same text appears multiple times on the page.
let selectionOffset = 0;
const container = document.querySelector('.markdown-preview');
if (container) {
try {
const range = selection.getRangeAt(0);
const preCaretRange = document.createRange();
preCaretRange.setStart(container, 0);
preCaretRange.setEnd(range.startContainer, range.startOffset);
selectionOffset = preCaretRange.toString().length;
} catch (e) {
// Fallback: offset 0 (will just use first occurrence)
selectionOffset = 0;
}
}
onAnnotate(text, selectionOffset);
} else {
const btn = document.getElementById('annotate-btn');
if (btn) {
btn.classList.add('shake');
setTimeout(() => btn.classList.remove('shake'), 500);
}
}
};
// Filter out consensus non-datasets (model + judge both agree)
const datasets = (currentPageData?.datasets || []).filter(ds => {
if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false;
return true;
});
const rawText = currentPageData?.input_text || "";
const highlightedText = highlightDatasets(rawText, datasets);
// Recursive helper: processes children at any depth so text inside
// <strong>, <em>, <a>, etc. also gets highlighted.
const processChildren = (children) =>
React.Children.map(children, child => {
if (typeof child === 'string') {
const highlighted = highlightDatasets(child, datasets);
if (highlighted !== child) {
return <span dangerouslySetInnerHTML={{ __html: highlighted }} />;
}
return child;
}
// If it's a React element with children, recurse into it
if (React.isValidElement(child) && child.props?.children) {
return React.cloneElement(child, {}, processChildren(child.props.children));
}
return child;
});
// Build component overrides for all block-level and inline elements
const highlightWrapper = (Tag) => ({ children, ...props }) => (
<Tag {...props}>{processChildren(children)}</Tag>
);
const highlightComponents = {
p: highlightWrapper('p'),
li: highlightWrapper('li'),
td: highlightWrapper('td'),
th: highlightWrapper('th'),
h1: highlightWrapper('h1'),
h2: highlightWrapper('h2'),
h3: highlightWrapper('h3'),
h4: highlightWrapper('h4'),
h5: highlightWrapper('h5'),
h6: highlightWrapper('h6'),
blockquote: highlightWrapper('blockquote'),
strong: highlightWrapper('strong'),
em: highlightWrapper('em'),
};
return (
<div className="annotator-container">
<div className="annotator-header">
<h2>Markdown Annotation</h2>
<button
id="annotate-btn"
onClick={handleAnnotateClick}
className="btn btn-primary"
title="Select text below, then click to annotate"
>
✍️ Annotate Selection
</button>
</div>
{/* Dataset legend */}
{datasets.length > 0 && (
<div className="dataset-legend">
{Object.entries(TAG_COLORS).map(([tag, colors]) => {
const count = datasets.filter(ds => ds.dataset_tag === tag).length;
if (count === 0) return null;
return (
<span key={tag} className="legend-item" style={{ borderColor: colors.border }}>
<span className="legend-dot" style={{ backgroundColor: colors.border }} />
{colors.label} ({count})
</span>
);
})}
</div>
)}
<div className="markdown-content">
<div className="markdown-content-header">
<h3>Doc {selectedDocIndex}, Page {selectedPage}</h3>
{datasets.length > 0 && (
<span className="dataset-count">
{datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected
</span>
)}
</div>
{loadingPage ? (
<div className="loading-spinner-container">
<div className="loading-spinner" />
<p>Loading page data...</p>
</div>
) : currentPageData ? (
<div className="markdown-preview">
<ReactMarkdown
remarkPlugins={[remarkGfm]}
components={highlightComponents}
>
{rawText || "No text available."}
</ReactMarkdown>
</div>
) : (
<p className="text-muted">Select a document and page to view extracted text.</p>
)}
</div>
</div>
);
}
|