Update app.py
Browse files
app.py
CHANGED
|
@@ -253,16 +253,27 @@ def embed(texts):
|
|
| 253 |
|
| 254 |
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
|
| 255 |
"""Wrap anchor if present; otherwise integrate link smoothly."""
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
n_sent, n_anchor = norm(sentence), norm(anchor_text)
|
| 258 |
|
| 259 |
-
if n_anchor and n_anchor in n_sent:
|
| 260 |
# Use word boundaries for more accurate matching
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
# Build a natural integration clause
|
| 265 |
-
if sentence
|
| 266 |
base, punct = sentence[:-1], sentence[-1]
|
| 267 |
else:
|
| 268 |
base, punct = sentence, '.'
|
|
@@ -428,11 +439,25 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
|
|
| 428 |
results = []
|
| 429 |
for idx in top_idx:
|
| 430 |
blk = blocks[idx]
|
|
|
|
| 431 |
sents = re.split(r'(?<=[.!?])\s+', blk)
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
|
| 437 |
|
| 438 |
result = {
|
|
|
|
| 253 |
|
| 254 |
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
|
| 255 |
"""Wrap anchor if present; otherwise integrate link smoothly."""
|
| 256 |
+
# Handle empty or invalid inputs
|
| 257 |
+
if not sentence or not anchor_text:
|
| 258 |
+
return sentence, False
|
| 259 |
+
|
| 260 |
+
def norm(x):
|
| 261 |
+
return re.sub(r'[^a-z0-9 ]','',x.lower()) if x else ""
|
| 262 |
+
|
| 263 |
n_sent, n_anchor = norm(sentence), norm(anchor_text)
|
| 264 |
|
| 265 |
+
if n_anchor and n_sent and n_anchor in n_sent:
|
| 266 |
# Use word boundaries for more accurate matching
|
| 267 |
+
try:
|
| 268 |
+
pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
|
| 269 |
+
result = pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence)
|
| 270 |
+
return result, True
|
| 271 |
+
except:
|
| 272 |
+
# If regex fails, just append the link
|
| 273 |
+
pass
|
| 274 |
|
| 275 |
# Build a natural integration clause
|
| 276 |
+
if len(sentence) > 0 and sentence[-1] in '.!?':
|
| 277 |
base, punct = sentence[:-1], sentence[-1]
|
| 278 |
else:
|
| 279 |
base, punct = sentence, '.'
|
|
|
|
| 439 |
results = []
|
| 440 |
for idx in top_idx:
|
| 441 |
blk = blocks[idx]
|
| 442 |
+
# Split sentences more carefully
|
| 443 |
sents = re.split(r'(?<=[.!?])\s+', blk)
|
| 444 |
+
# Filter out empty sentences
|
| 445 |
+
sents = [s for s in sents if s and len(s.strip()) > 0]
|
| 446 |
+
|
| 447 |
+
if not sents:
|
| 448 |
+
# If no valid sentences, use the whole block
|
| 449 |
+
sents = [blk]
|
| 450 |
+
|
| 451 |
+
try:
|
| 452 |
+
s_embs = embed(sents)
|
| 453 |
+
s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
|
| 454 |
+
si = int(torch.argmax(s_sims))
|
| 455 |
+
best_sent = sents[min(si, len(sents)-1)] # Ensure index is valid
|
| 456 |
+
except Exception as e:
|
| 457 |
+
print(f"Error in sentence embedding: {e}")
|
| 458 |
+
# Fallback to first sentence
|
| 459 |
+
best_sent = sents[0] if sents else blk
|
| 460 |
+
|
| 461 |
rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
|
| 462 |
|
| 463 |
result = {
|