rafmacalaba committed on
Commit
bd79ab1
·
1 Parent(s): ca6c843

fix: strip markdown from rawText before indexOf for annotation position

Browse files

The text selected in the browser is plain, but rawText contains markdown
formatting (**bold**, # headings, etc.), so indexOf fails and start/end
come back null. The fix strips markdown before searching and falls back
to a case-insensitive search when no exact match is found.

Files changed (1) hide show
  1. app/page.js +35 -6
app/page.js CHANGED
@@ -145,31 +145,60 @@ export default function Home() {
145
  };
146
 
147
  const handleAnnotationSubmit = async ({ dataset_tag }) => {
148
- // Find ALL occurrences of the selected text in input_text
149
  const inputText = currentPageData?.input_text || "";
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  const occurrences = [];
151
  let searchFrom = 0;
152
- while (searchFrom < inputText.length) {
153
- const idx = inputText.indexOf(selectedText, searchFrom);
154
  if (idx === -1) break;
155
  occurrences.push(idx);
156
  searchFrom = idx + 1;
157
  }
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  let startIdx = null;
160
  if (occurrences.length === 1) {
161
  startIdx = occurrences[0];
162
  } else if (occurrences.length > 1) {
163
  const container = document.querySelector('.markdown-preview');
164
- const visibleLen = container?.textContent?.length || inputText.length;
165
- const ratio = inputText.length / visibleLen;
166
  const estimatedSourcePos = selectionOffset * ratio;
167
  startIdx = occurrences.reduce((best, idx) =>
168
  Math.abs(idx - estimatedSourcePos) < Math.abs(best - estimatedSourcePos) ? idx : best
169
  );
170
  }
171
 
172
- const endIdx = startIdx !== null ? startIdx + selectedText.length : null;
173
 
174
  const payload = {
175
  dataset_name: {
 
145
  };
146
 
147
  const handleAnnotationSubmit = async ({ dataset_tag }) => {
 
148
  const inputText = currentPageData?.input_text || "";
149
+
150
+ // Strip markdown formatting so browser-selected plain text can be found
151
+ const stripMd = (s) => s
152
+ .replace(/\*\*\*/g, '') // bold-italic
153
+ .replace(/\*\*/g, '') // bold
154
+ .replace(/\*/g, '') // italic
155
+ .replace(/__/g, '')
156
+ .replace(/_/g, ' ')
157
+ .replace(/^#{1,6}\s+/gm, '') // headings
158
+ .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'); // links
159
+
160
+ const plainText = stripMd(inputText);
161
+
162
+ // Normalize whitespace in selected text (browser may add extra spaces/newlines)
163
+ const normalizedSelection = selectedText.replace(/\s+/g, ' ').trim();
164
+
165
+ // Find occurrences in the stripped text
166
  const occurrences = [];
167
  let searchFrom = 0;
168
+ while (searchFrom < plainText.length) {
169
+ const idx = plainText.indexOf(normalizedSelection, searchFrom);
170
  if (idx === -1) break;
171
  occurrences.push(idx);
172
  searchFrom = idx + 1;
173
  }
174
 
175
+ // If not found with exact match, try case-insensitive
176
+ if (occurrences.length === 0) {
177
+ const lowerPlain = plainText.toLowerCase();
178
+ const lowerSel = normalizedSelection.toLowerCase();
179
+ let sf = 0;
180
+ while (sf < lowerPlain.length) {
181
+ const idx = lowerPlain.indexOf(lowerSel, sf);
182
+ if (idx === -1) break;
183
+ occurrences.push(idx);
184
+ sf = idx + 1;
185
+ }
186
+ }
187
+
188
  let startIdx = null;
189
  if (occurrences.length === 1) {
190
  startIdx = occurrences[0];
191
  } else if (occurrences.length > 1) {
192
  const container = document.querySelector('.markdown-preview');
193
+ const visibleLen = container?.textContent?.length || plainText.length;
194
+ const ratio = plainText.length / visibleLen;
195
  const estimatedSourcePos = selectionOffset * ratio;
196
  startIdx = occurrences.reduce((best, idx) =>
197
  Math.abs(idx - estimatedSourcePos) < Math.abs(best - estimatedSourcePos) ? idx : best
198
  );
199
  }
200
 
201
+ const endIdx = startIdx !== null ? startIdx + normalizedSelection.length : null;
202
 
203
  const payload = {
204
  dataset_name: {