Spaces:

ai4data
/

data-use-annotation

Sleeping

rafmacalaba committed 24 days ago

Commit

bd79ab1

1 Parent(s): ca6c843

fix: strip markdown from rawText before indexOf for annotation position

Text selected in the browser is plain, but rawText contains markdown
formatting (**bold**, # headings, etc.), so indexOf fails and start/end
come back null. The fix strips markdown before searching and falls back
to a case-insensitive search when no exact match is found.

Files changed (1) hide show

app/page.js +35 -6

app/page.js CHANGED Viewed

@@ -145,31 +145,60 @@ export default function Home() {
     };
     const handleAnnotationSubmit = async ({ dataset_tag }) => {
-        // Find ALL occurrences of the selected text in input_text
         const inputText = currentPageData?.input_text || "";
         const occurrences = [];
         let searchFrom = 0;
-        while (searchFrom < inputText.length) {
-            const idx = inputText.indexOf(selectedText, searchFrom);
             if (idx === -1) break;
             occurrences.push(idx);
             searchFrom = idx + 1;
         }
         let startIdx = null;
         if (occurrences.length === 1) {
             startIdx = occurrences[0];
         } else if (occurrences.length > 1) {
             const container = document.querySelector('.markdown-preview');
-            const visibleLen = container?.textContent?.length || inputText.length;
-            const ratio = inputText.length / visibleLen;
             const estimatedSourcePos = selectionOffset * ratio;
             startIdx = occurrences.reduce((best, idx) =>
                 Math.abs(idx - estimatedSourcePos) < Math.abs(best - estimatedSourcePos) ? idx : best
             );
         }
-        const endIdx = startIdx !== null ? startIdx + selectedText.length : null;
         const payload = {
             dataset_name: {

     };
     const handleAnnotationSubmit = async ({ dataset_tag }) => {
         const inputText = currentPageData?.input_text || "";
+        // Strip markdown formatting so browser-selected plain text can be found
+        const stripMd = (s) => s
+            .replace(/\*\*\*/g, '')  // bold-italic
+            .replace(/\*\*/g, '')     // bold
+            .replace(/\*/g, '')       // italic
+            .replace(/__/g, '')
+            .replace(/_/g, ' ')
+            .replace(/^#{1,6}\s+/gm, '')  // headings
+            .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');  // links
+        const plainText = stripMd(inputText);
+        // Normalize whitespace in selected text (browser may add extra spaces/newlines)
+        const normalizedSelection = selectedText.replace(/\s+/g, ' ').trim();
+        // Find occurrences in the stripped text
         const occurrences = [];
         let searchFrom = 0;
+        while (searchFrom < plainText.length) {
+            const idx = plainText.indexOf(normalizedSelection, searchFrom);
             if (idx === -1) break;
             occurrences.push(idx);
             searchFrom = idx + 1;
         }
+        // If not found with exact match, try case-insensitive
+        if (occurrences.length === 0) {
+            const lowerPlain = plainText.toLowerCase();
+            const lowerSel = normalizedSelection.toLowerCase();
+            let sf = 0;
+            while (sf < lowerPlain.length) {
+                const idx = lowerPlain.indexOf(lowerSel, sf);
+                if (idx === -1) break;
+                occurrences.push(idx);
+                sf = idx + 1;
+            }
+        }
         let startIdx = null;
         if (occurrences.length === 1) {
             startIdx = occurrences[0];
         } else if (occurrences.length > 1) {
             const container = document.querySelector('.markdown-preview');
+            const visibleLen = container?.textContent?.length || plainText.length;
+            const ratio = plainText.length / visibleLen;
             const estimatedSourcePos = selectionOffset * ratio;
             startIdx = occurrences.reduce((best, idx) =>
                 Math.abs(idx - estimatedSourcePos) < Math.abs(best - estimatedSourcePos) ? idx : best
             );
         }
+        const endIdx = startIdx !== null ? startIdx + normalizedSelection.length : null;
         const payload = {
             dataset_name: {