File size: 8,458 Bytes
da957b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d08736d
 
 
 
 
da957b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58e23bb
 
 
 
 
 
 
 
 
da957b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"use client";

import ReactMarkdown from 'react-markdown';
import remarkGfm from 'remark-gfm';
import React from 'react';

// Color mapping for dataset tags.
// Each entry carries the CSS colours used for the inline <mark> highlight
// (bg / border / text) plus the human-readable label shown in the legend
// and in the highlight tooltip.
const TAG_COLORS = {
    named: { bg: '#10b98130', border: '#10b981', text: '#34d399', label: 'Named Dataset' },
    descriptive: { bg: '#f59e0b30', border: '#f59e0b', text: '#fbbf24', label: 'Descriptive' },
    vague: { bg: '#a78bfa30', border: '#a78bfa', text: '#c4b5fd', label: 'Vague' },
    'non-dataset': { bg: '#64748b30', border: '#64748b', text: '#94a3b8', label: 'Non-Dataset' },
};

/**
 * Escapes the characters that are significant in HTML text content and in
 * quoted attribute values, so arbitrary text can be embedded in markup.
 *
 * @param {*} value - Value to escape; coerced to a string first.
 * @returns {string} The HTML-escaped string.
 */
function escapeHtml(value) {
    return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

/**
 * Highlights all dataset mentions within a piece of text.
 *
 * Matching is case-insensitive and prefers the longest dataset name when
 * several names overlap. When at least one mention is found, the result is
 * an HTML string: every mention is wrapped in a styled <mark> element and
 * ALL text (matched or not) is HTML-escaped, so the result is safe to inject
 * as markup. When nothing matches, the input is returned unchanged so callers
 * can detect "no highlight" by comparing the result with the input.
 *
 * @param {string} text - Plain text to scan.
 * @param {Array<{dataset_name?: {text?: string}, dataset_tag?: string}>} datasets
 *   Dataset mentions; entries without a non-empty string name are ignored.
 *   A missing tag is treated as 'non-dataset'. If the same name (ignoring
 *   case) appears more than once, the first entry's tag is used.
 * @returns {string} The original text, or an HTML string with <mark> tags.
 */
function highlightDatasets(text, datasets) {
    if (!datasets || datasets.length === 0 || !text) return text;

    // Longest names first so that e.g. "Census 2020" wins over "Census".
    const mentions = datasets
        .filter(ds => typeof ds?.dataset_name?.text === 'string' && ds.dataset_name.text.length > 0)
        .map(ds => ({
            name: ds.dataset_name.text,
            tag: ds.dataset_tag || 'non-dataset',
        }))
        .sort((a, b) => b.name.length - a.name.length);

    // Deduplicate case-insensitively (the regex below is case-insensitive too).
    // A Map avoids accidental hits on Object.prototype keys such as "constructor".
    const tagByName = new Map();
    const alternatives = [];
    for (const { name, tag } of mentions) {
        const key = name.toLowerCase();
        if (tagByName.has(key)) continue;
        tagByName.set(key, tag);
        alternatives.push(name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    }

    if (alternatives.length === 0) return text;

    // Single capturing group: split() then yields [text, match, text, match, ...],
    // i.e. matches always sit at odd indices.
    const pattern = new RegExp(`(${alternatives.join('|')})`, 'gi');
    const parts = text.split(pattern);

    // No mention found: hand back the untouched input (not an escaped copy).
    if (parts.length === 1) return text;

    return parts.map((part, i) => {
        const safePart = escapeHtml(part);
        const tag = i % 2 === 1 ? tagByName.get(part.toLowerCase()) : undefined;
        if (tag === undefined) return safePart;

        // Unknown tags fall back to the 'non-dataset' palette.
        const colors = Object.prototype.hasOwnProperty.call(TAG_COLORS, tag)
            ? TAG_COLORS[tag]
            : TAG_COLORS['non-dataset'];
        return `<mark data-tag="${escapeHtml(tag)}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${escapeHtml(colors.label)}] ${safePart}">${safePart}</mark>`;
    }).join('');
}

export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) {
    const handleAnnotateClick = () => {
        const selection = window.getSelection();
        if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) {
            const text = selection.toString().trim();

            // Compute the character offset of the selection start within the
            // .markdown-preview container. This lets us disambiguate when the
            // same text appears multiple times on the page.
            let selectionOffset = 0;
            const container = document.querySelector('.markdown-preview');
            if (container) {
                try {
                    const range = selection.getRangeAt(0);
                    const preCaretRange = document.createRange();
                    preCaretRange.setStart(container, 0);
                    preCaretRange.setEnd(range.startContainer, range.startOffset);
                    selectionOffset = preCaretRange.toString().length;
                } catch (e) {
                    // Fallback: offset 0 (will just use first occurrence)
                    selectionOffset = 0;
                }
            }

            onAnnotate(text, selectionOffset);
        } else {
            const btn = document.getElementById('annotate-btn');
            if (btn) {
                btn.classList.add('shake');
                setTimeout(() => btn.classList.remove('shake'), 500);
            }
        }
    };

    // Filter out consensus non-datasets (model + judge both agree)
    const datasets = (currentPageData?.datasets || []).filter(ds => {
        if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) return false;
        return true;
    });
    const rawText = currentPageData?.input_text || "";
    const highlightedText = highlightDatasets(rawText, datasets);

    // Recursive helper: processes children at any depth so text inside
    // <strong>, <em>, <a>, etc. also gets highlighted.
    const processChildren = (children) =>
        React.Children.map(children, child => {
            if (typeof child === 'string') {
                const highlighted = highlightDatasets(child, datasets);
                if (highlighted !== child) {
                    return <span dangerouslySetInnerHTML={{ __html: highlighted }} />;
                }
                return child;
            }
            // If it's a React element with children, recurse into it
            if (React.isValidElement(child) && child.props?.children) {
                return React.cloneElement(child, {}, processChildren(child.props.children));
            }
            return child;
        });

    // Build component overrides for all block-level and inline elements
    const highlightWrapper = (Tag) => ({ children, ...props }) => (
        <Tag {...props}>{processChildren(children)}</Tag>
    );

    const highlightComponents = {
        p: highlightWrapper('p'),
        li: highlightWrapper('li'),
        td: highlightWrapper('td'),
        th: highlightWrapper('th'),
        h1: highlightWrapper('h1'),
        h2: highlightWrapper('h2'),
        h3: highlightWrapper('h3'),
        h4: highlightWrapper('h4'),
        h5: highlightWrapper('h5'),
        h6: highlightWrapper('h6'),
        blockquote: highlightWrapper('blockquote'),
        strong: highlightWrapper('strong'),
        em: highlightWrapper('em'),
    };

    return (
        <div className="annotator-container">
            <div className="annotator-header">
                <h2>Markdown Annotation</h2>
                <button
                    id="annotate-btn"
                    onClick={handleAnnotateClick}
                    className="btn btn-primary"
                    title="Select text below, then click to annotate"
                >
                    ✍️ Annotate Selection
                </button>
            </div>

            {/* Dataset legend */}
            {datasets.length > 0 && (
                <div className="dataset-legend">
                    {Object.entries(TAG_COLORS).map(([tag, colors]) => {
                        const count = datasets.filter(ds => ds.dataset_tag === tag).length;
                        if (count === 0) return null;
                        return (
                            <span key={tag} className="legend-item" style={{ borderColor: colors.border }}>
                                <span className="legend-dot" style={{ backgroundColor: colors.border }} />
                                {colors.label} ({count})
                            </span>
                        );
                    })}
                </div>
            )}

            <div className="markdown-content">
                <div className="markdown-content-header">
                    <h3>Doc {selectedDocIndex}, Page {selectedPage}</h3>
                    {datasets.length > 0 && (
                        <span className="dataset-count">
                            {datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected
                        </span>
                    )}
                </div>

                {loadingPage ? (
                    <div className="loading-spinner-container">
                        <div className="loading-spinner" />
                        <p>Loading page data...</p>
                    </div>
                ) : currentPageData ? (
                    <div className="markdown-preview">
                        <ReactMarkdown
                            remarkPlugins={[remarkGfm]}
                            components={highlightComponents}
                        >
                            {rawText || "No text available."}
                        </ReactMarkdown>
                    </div>
                ) : (
                    <p className="text-muted">Select a document and page to view extracted text.</p>
                )}
            </div>
        </div>
    );
}