// Spaces: ai4data / data-use-annotation (Running)
// File size: 8,458 Bytes

"use client";

import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import React from 'react';

// Palette used to render each dataset tag: translucent background, solid
// border/underline colour, text colour, and the human-readable legend label.
// Key order matters: the legend is rendered in this order.
const TAG_COLORS = {
    // Explicitly named datasets
    named: {
        bg: '#10b98130',
        border: '#10b981',
        text: '#34d399',
        label: 'Named Dataset',
    },
    // Datasets referred to by description
    descriptive: {
        bg: '#f59e0b30',
        border: '#f59e0b',
        text: '#fbbf24',
        label: 'Descriptive',
    },
    // Vague data references
    vague: {
        bg: '#a78bfa30',
        border: '#a78bfa',
        text: '#c4b5fd',
        label: 'Vague',
    },
    // Mentions classified as not being a dataset; also the fallback palette
    'non-dataset': {
        bg: '#64748b30',
        border: '#64748b',
        text: '#94a3b8',
        label: 'Non-Dataset',
    },
};

/**
 * Highlights all dataset mentions within a piece of text.
 *
 * Matching is case-insensitive. Longer names are tried first so that a short
 * name never splits a longer one that contains it.
 *
 * The return value is meant to be injected as HTML, so whenever at least one
 * mention is found the WHOLE result is HTML-escaped (matched and unmatched
 * text alike) and each match is wrapped in a styled <mark> element. When
 * nothing is highlighted the original `text` value is returned untouched, so
 * callers can detect "no change" with a strict equality check and keep
 * rendering it as plain text.
 *
 * @param {string} text - Plain text to scan.
 * @param {Array<object>} datasets - Entries are read as
 *   `{ dataset_name: { text }, dataset_tag }`; entries without a non-blank
 *   string name are ignored, a missing tag is treated as 'non-dataset'.
 * @returns {string} `text` itself when there is nothing to highlight,
 *   otherwise an HTML string with <mark> tags around each mention.
 */
function highlightDatasets(text, datasets) {
    if (!datasets || datasets.length === 0 || !text) return text;

    // Build a list of {name, tag} sorted by name length descending (longest first to avoid partial overlaps).
    // Blank names are skipped: they would otherwise match every run of whitespace.
    const mentions = datasets
        .filter(ds => typeof ds?.dataset_name?.text === 'string' && ds.dataset_name.text.trim() !== '')
        .map(ds => ({
            name: ds.dataset_name.text,
            tag: ds.dataset_tag || 'non-dataset',
        }))
        .sort((a, b) => b.name.length - a.name.length);

    // Deduplicate by exact name (first occurrence wins)
    const seen = new Set();
    const uniqueMentions = mentions.filter(m => {
        if (seen.has(m.name)) return false;
        seen.add(m.name);
        return true;
    });

    if (uniqueMentions.length === 0) return text;

    // Build regex that matches any of the dataset names literally
    const escaped = uniqueMentions.map(m => m.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    const pattern = new RegExp(`(${escaped.join('|')})`, 'gi');

    // Lookup from lower-cased name to tag. A Map is used instead of a plain
    // object so that text such as "toString" or "constructor" can never
    // resolve to an inherited Object.prototype member.
    const tagByName = new Map();
    uniqueMentions.forEach(m => { tagByName.set(m.name.toLowerCase(), m.tag); });

    // Split by the pattern, preserving the matches. With a single capturing
    // group the matches sit at the odd indices of the result.
    const parts = text.split(pattern);

    // No match at all: hand back the original value so callers see "unchanged".
    if (parts.length === 1) return text;

    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => htmlEntities[ch]);

    return parts.map((part, i) => {
        // Every fragment ends up inside HTML, so every fragment is escaped.
        const safePart = escapeHtml(part);
        const tag = i % 2 === 1 ? tagByName.get(part.toLowerCase()) : undefined;
        if (!tag) return safePart;

        // Own-property check: an unknown tag must fall back to the
        // 'non-dataset' palette rather than pick up an inherited member.
        const colors = Object.prototype.hasOwnProperty.call(TAG_COLORS, tag)
            ? TAG_COLORS[tag]
            : TAG_COLORS['non-dataset'];
        return `<mark data-tag="${escapeHtml(tag)}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${colors.label}] ${safePart}">${safePart}</mark>`;
    }).join('');
}

export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) {
    const handleAnnotateClick = () => {
        const selection = window.getSelection();
        if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) {
            const text = selection.toString().trim();

            // Compute the character offset of the selection start within the
            // .markdown-preview container. This lets us disambiguate when the
            // same text appears multiple times on the page.
            let selectionOffset = 0;
            const container = document.querySelector('.markdown-preview');
            if (container) {
                try {
                    const range = selection.getRangeAt(0);
                    const preCaretRange = document.createRange();
                    preCaretRange.setStart(container, 0);
                    preCaretRange.setEnd(range.startContainer, range.startOffset);
                    selectionOffset = preCaretRange.toString().length;
                } catch (e) {
                    // Fallback: offset 0 (will just use first occurrence)
                    selectionOffset = 0;
                }
            }

            onAnnotate(text, selectionOffset);
        } else {
            const btn = document.getElementById('annotate-btn');
            if (btn) {
                btn.classList.add('shake');
                setTimeout(() => btn.classList.remove('shake'), 500);
            }
        }
    };

    // Filter out consensus non-datasets (model + judge both agree)
    const datasets = (currentPageData?.datasets || []).filter(ds => {
        if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false;
        return true;
    });
    const rawText = currentPageData?.input_text || "";
    const highlightedText = highlightDatasets(rawText, datasets);

    // Recursive helper: processes children at any depth so text inside
    // <strong>, <em>, <a>, etc. also gets highlighted.
    const processChildren = (children) =>
        React.Children.map(children, child => {
            if (typeof child === 'string') {
                const highlighted = highlightDatasets(child, datasets);
                if (highlighted !== child) {
                    return <span dangerouslySetInnerHTML={{ __html: highlighted }} />;
                }
                return child;
            }
            // If it's a React element with children, recurse into it
            if (React.isValidElement(child) && child.props?.children) {
                return React.cloneElement(child, {}, processChildren(child.props.children));
            }
            return child;
        });

    // Build component overrides for all block-level and inline elements
    const highlightWrapper = (Tag) => ({ children, ...props }) => (
        <Tag {...props}>{processChildren(children)}</Tag>
    );

    const highlightComponents = {
        p: highlightWrapper('p'),
        li: highlightWrapper('li'),
        td: highlightWrapper('td'),
        th: highlightWrapper('th'),
        h1: highlightWrapper('h1'),
        h2: highlightWrapper('h2'),
        h3: highlightWrapper('h3'),
        h4: highlightWrapper('h4'),
        h5: highlightWrapper('h5'),
        h6: highlightWrapper('h6'),
        blockquote: highlightWrapper('blockquote'),
        strong: highlightWrapper('strong'),
        em: highlightWrapper('em'),
    };

    return (
        <div className="annotator-container">
            <div className="annotator-header">
                <h2>Markdown Annotation</h2>
                <button
                    id="annotate-btn"
                    onClick={handleAnnotateClick}
                    className="btn btn-primary"
                    title="Select text below, then click to annotate"
                >
                    ✍️ Annotate Selection
                </button>
            </div>

            {/* Dataset legend */}
            {datasets.length > 0 && (
                <div className="dataset-legend">
                    {Object.entries(TAG_COLORS).map(([tag, colors]) => {
                        const count = datasets.filter(ds => ds.dataset_tag === tag).length;
                        if (count === 0) return null;
                        return (
                            <span key={tag} className="legend-item" style={{ borderColor: colors.border }}>
                                <span className="legend-dot" style={{ backgroundColor: colors.border }} />
                                {colors.label} ({count})
                            </span>
                        );
                    })}
                </div>
            )}

            <div className="markdown-content">
                <div className="markdown-content-header">
                    <h3>Doc {selectedDocIndex}, Page {selectedPage}</h3>
                    {datasets.length > 0 && (
                        <span className="dataset-count">
                            {datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected
                        </span>
                    )}
                </div>

                {loadingPage ? (
                    <div className="loading-spinner-container">
                        <div className="loading-spinner" />
                        <p>Loading page data...</p>
                    </div>
                ) : currentPageData ? (
                    <div className="markdown-preview">
                        <ReactMarkdown
                            remarkPlugins={[remarkGfm]}
                            components={highlightComponents}
                        >
                            {rawText || "No text available."}
                        </ReactMarkdown>
                    </div>
                ) : (
                    <p className="text-muted">Select a document and page to view extracted text.</p>
                )}
            </div>
        </div>
    );
}