dusan-presswhizz committed on
Commit
a072005
·
verified ·
1 Parent(s): 4fbef2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -9
app.py CHANGED
@@ -253,16 +253,27 @@ def embed(texts):
253
 
254
  def inject_anchor_into_sentence(sentence, anchor_text, target_url):
255
  """Wrap anchor if present; otherwise integrate link smoothly."""
256
- def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
 
 
 
 
 
 
257
  n_sent, n_anchor = norm(sentence), norm(anchor_text)
258
 
259
- if n_anchor and n_anchor in n_sent:
260
  # Use word boundaries for more accurate matching
261
- pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
262
- return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
 
 
 
 
 
263
 
264
  # Build a natural integration clause
265
- if sentence.endswith(('.', '!', '?')):
266
  base, punct = sentence[:-1], sentence[-1]
267
  else:
268
  base, punct = sentence, '.'
@@ -428,11 +439,25 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
428
  results = []
429
  for idx in top_idx:
430
  blk = blocks[idx]
 
431
  sents = re.split(r'(?<=[.!?])\s+', blk)
432
- s_embs = embed(sents)
433
- s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
434
- si = int(torch.argmax(s_sims))
435
- best_sent = sents[si]
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
437
 
438
  result = {
 
253
 
254
  def inject_anchor_into_sentence(sentence, anchor_text, target_url):
255
  """Wrap anchor if present; otherwise integrate link smoothly."""
256
+ # Handle empty or invalid inputs
257
+ if not sentence or not anchor_text:
258
+ return sentence, False
259
+
260
+ def norm(x):
261
+ return re.sub(r'[^a-z0-9 ]','',x.lower()) if x else ""
262
+
263
  n_sent, n_anchor = norm(sentence), norm(anchor_text)
264
 
265
+ if n_anchor and n_sent and n_anchor in n_sent:
266
  # Use word boundaries for more accurate matching
267
+ try:
268
+ pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
269
+ result = pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence)
270
+ return result, True
271
+ except:
272
+ # If regex fails, just append the link
273
+ pass
274
 
275
  # Build a natural integration clause
276
+ if len(sentence) > 0 and sentence[-1] in '.!?':
277
  base, punct = sentence[:-1], sentence[-1]
278
  else:
279
  base, punct = sentence, '.'
 
439
  results = []
440
  for idx in top_idx:
441
  blk = blocks[idx]
442
+ # Split sentences more carefully
443
  sents = re.split(r'(?<=[.!?])\s+', blk)
444
+ # Filter out empty sentences
445
+ sents = [s for s in sents if s and len(s.strip()) > 0]
446
+
447
+ if not sents:
448
+ # If no valid sentences, use the whole block
449
+ sents = [blk]
450
+
451
+ try:
452
+ s_embs = embed(sents)
453
+ s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
454
+ si = int(torch.argmax(s_sims))
455
+ best_sent = sents[min(si, len(sents)-1)] # Ensure index is valid
456
+ except Exception as e:
457
+ print(f"Error in sentence embedding: {e}")
458
+ # Fallback to first sentence
459
+ best_sent = sents[0] if sents else blk
460
+
461
  rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
462
 
463
  result = {