dusan-presswhizz committed on
Commit
8263900
·
verified ·
1 Parent(s): a072005

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -168
app.py CHANGED
@@ -298,191 +298,253 @@ def create_anchor_suggestion(anchor_text, target_url):
298
 
299
def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Fetches the target page to build a context string (title, meta description,
    lead paragraphs), extracts candidate 2-4 word phrases and single important
    words from the source article, then scores each candidate against the
    target context via embedding cosine similarity.

    Args:
        blocks: list of text blocks extracted from the source article.
        target_url: URL of the page the anchor should link to.
        original_anchor: originally requested anchor text; used as a fallback
            context when the target page cannot be fetched, and excluded from
            the candidate set.

    Returns:
        Tuple of (best_anchor, best_sentence), or (None, None) when no
        candidate phrase could be extracted.
    """
    # Get target page context; fall back to the original anchor text when the
    # target page cannot be fetched or parsed.
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        soup = BeautifulSoup(tgt_html, "html.parser")

        # Extract target page title and meta description.
        title = soup.title.get_text().strip() if soup.title else ""
        meta_desc = ""
        meta_tag = soup.find("meta", attrs={"name": "description"})
        if meta_tag:
            meta_desc = meta_tag.get("content", "")

        # Extract key terms from the target page (first few substantial paragraphs).
        target_paragraphs = []
        for p in soup.find_all("p")[:5]:
            text = p.get_text().strip()
            if len(text) > 50:
                target_paragraphs.append(text)
        target_content = " ".join(target_paragraphs[:3])

    except Exception as e:
        print(f"Error fetching target URL: {e}")
        title = ""
        meta_desc = ""
        target_content = original_anchor

    # Extract all potential anchor phrases from the source article.
    all_phrases = set()
    full_text = " ".join(blocks)

    # Common words to exclude (English + Croatian/Serbian stopwords).
    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                 'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                 'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

    # Extract noun phrases and important terms (2-4 words).
    sentences = re.split(r'[.!?]', full_text)
    for sentence in sentences:
        words = sentence.split()

        # Sliding windows of 2-4 words.
        for length in range(2, min(5, len(words) + 1)):
            for i in range(len(words) - length + 1):
                phrase = ' '.join(words[i:i + length])
                phrase_clean = phrase.strip('.,!?;:"\' ')

                # Check if the phrase is meaningful: edges must not be
                # stopwords and the cleaned phrase must be a sensible length.
                first_word = words[i].lower().strip('.,!?;:')
                last_word = words[i + length - 1].lower().strip('.,!?;:')

                if (first_word not in stopwords and
                        last_word not in stopwords and
                        5 < len(phrase_clean) < 50):
                    all_phrases.add(phrase_clean)

        # Also extract single important words (proper nouns, long words).
        for word in words:
            clean_word = word.strip('.,!?;:"\' ')
            # BUGFIX: punctuation-only tokens (e.g. '—', '"') strip to "", and
            # indexing clean_word[0] on an empty string raised IndexError.
            if not clean_word:
                continue
            if (len(clean_word) > 6 or
                    (clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                all_phrases.add(clean_word)

    if not all_phrases:
        return None, None

    # Create the context query from target URL info (truncated for embedding).
    target_context = f"{title} {meta_desc} {target_content}"[:500]

    # Score each phrase based on relevance to the target context.
    target_emb = embed([target_context])[0]

    best_anchor = None
    best_score = -1
    best_sentence = None

    # Evaluate each potential anchor.
    for phrase in all_phrases:
        # Skip if identical to the original anchor (we want something different).
        if phrase.lower() == original_anchor.lower():
            continue

        # Relevance of the phrase itself to the target context.
        phrase_emb = embed([phrase])[0]
        relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()

        # For phrases that appear in the article, blend phrase relevance with
        # the relevance of the sentence the phrase occurs in.
        if phrase.lower() in full_text.lower():
            for block in blocks:
                if phrase.lower() in block.lower():
                    sents = re.split(r'(?<=[.!?])\s+', block)
                    for sent in sents:
                        if phrase.lower() in sent.lower():
                            # Score this sentence-phrase combination.
                            sent_emb = embed([sent])[0]
                            context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                            combined_score = (relevance_score * 0.6) + (context_score * 0.4)

                            if combined_score > best_score:
                                best_score = combined_score
                                best_anchor = phrase
                                best_sentence = sent

    return best_anchor, best_sentence
 
411
def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
    """Suggest sentences in the source article where the anchor link fits best.

    Embeds the article blocks and a query built from the anchor text plus the
    target page's title/domain, picks the top-k most similar blocks, then the
    most similar sentence inside each, and injects the anchor into it.

    Args:
        source_url: URL of the article to insert the link into.
        target_url: URL the anchor should point at.
        anchor_text: desired anchor text.
        top_k: number of candidate blocks to return suggestions for.
        suggest_alternative: when True and the anchor text is absent from the
            article, also propose an alternative anchor/sentence pair.

    Returns:
        List of result dicts (or a single-element list with an "error" key
        when the page yields no text blocks).
    """
    blocks = get_text_blocks(source_url)
    if not blocks:
        return [{"error":"No text blocks found on the page."}]

    # Check if the keyword is present in the article.
    full_text = " ".join(blocks).lower()
    # BUGFIX: '"" in s' is always True, so an empty anchor_text falsely
    # reported the keyword as present.
    keyword_present = anchor_text.lower() in full_text if anchor_text else False

    # Target context: best-effort fetch of the target page title.
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        tt = BeautifulSoup(tgt_html, "html.parser").title
        tgt_title = tt.get_text().strip() if tt else ""
    except Exception:
        tgt_title = ""

    ext = tldextract.extract(target_url)
    tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

    # First, find the best-matching blocks for the original anchor.
    query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"
    q_emb = embed([query])[0]

    blk_embs = embed(blocks)
    sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks), 1))
    top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()

    results = []
    for idx in top_idx:
        blk = blocks[idx]
        # Split sentences more carefully and drop empty ones.
        sents = re.split(r'(?<=[.!?])\s+', blk)
        sents = [s for s in sents if s and len(s.strip()) > 0]

        if not sents:
            # If no valid sentences, use the whole block.
            sents = [blk]

        try:
            s_embs = embed(sents)
            s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents), 1))
            # .item() converts the 0-d tensor to a Python scalar explicitly.
            si = int(torch.argmax(s_sims).item())
            best_sent = sents[min(si, len(sents) - 1)]  # Ensure index is valid
        except Exception as e:
            print(f"Error in sentence embedding: {e}")
            # Fallback to first sentence.
            best_sent = sents[0] if sents else blk

        rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)

        result = {
            "anchor_was_present": exact_found,
            "best_sentence_original": best_sent,
            "best_sentence_with_anchor": rewritten_sent,
            "keyword_in_article": keyword_present
        }

        # If the anchor is not present and an alternative suggestion was
        # requested, look for a completely different anchor and sentence.
        if suggest_alternative and not keyword_present:
            alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)

            if alt_anchor and alt_sentence:
                # Create the sentence with the alternative anchor.
                alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
                result["alternative_anchor"] = alt_anchor
                result["alternative_sentence_original"] = alt_sentence
                result["alternative_sentence"] = alt_rewritten
                result["alternative_exact_match"] = alt_exact

        results.append(result)

    return results
487
  # =========================
488
  # OpenAI helpers with caching
 
298
 
299
def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Post-commit hardened variant: all work is wrapped in a top-level guard so
    any unexpected failure degrades to (None, None) instead of propagating.

    Args:
        blocks: list of text blocks from the source article.
        target_url: URL of the page the anchor should link to.
        original_anchor: originally requested anchor text (fallback context,
            excluded from candidates).

    Returns:
        Tuple of (best_anchor, best_sentence) or (None, None) on failure.
    """
    try:
        # Get target page context; fall back to the original anchor text when
        # the target page cannot be fetched.
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            soup = BeautifulSoup(tgt_html, "html.parser")

            # Extract target page title and meta description.
            title = soup.title.get_text().strip() if soup.title else ""
            meta_desc = ""
            meta_tag = soup.find("meta", attrs={"name": "description"})
            if meta_tag:
                meta_desc = meta_tag.get("content", "")

            # Extract key terms from target page (first few paragraphs).
            target_paragraphs = []
            for p in soup.find_all("p")[:5]:
                text = p.get_text().strip()
                if len(text) > 50:
                    target_paragraphs.append(text)
            target_content = " ".join(target_paragraphs[:3])

        except Exception as e:
            print(f"Error fetching target URL: {e}")
            title = ""
            meta_desc = ""
            target_content = original_anchor

        # Extract all potential anchor phrases from the source article.
        all_phrases = set()
        full_text = " ".join(blocks)

        # Common words to exclude (English + Croatian/Serbian stopwords).
        stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                     'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                     'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                     'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

        # Extract noun phrases and important terms (2-4 words).
        sentences = re.split(r'[.!?]', full_text)
        for sentence in sentences:
            if not sentence:
                continue
            words = sentence.split()

            # Sliding windows of 2-4 words. NOTE: the former bounds guards
            # (i < 0, i+length > len(words)) were dead code — range() already
            # constrains i to [0, len(words)-length] — so they were removed.
            for length in range(2, min(5, len(words) + 1)):
                for i in range(len(words) - length + 1):
                    phrase = ' '.join(words[i:i + length])
                    phrase_clean = phrase.strip('.,!?;:"\' ')

                    # Check if the phrase is meaningful: edges not stopwords,
                    # cleaned phrase of a sensible length.
                    first_word = words[i].lower().strip('.,!?;:')
                    last_word = words[i + length - 1].lower().strip('.,!?;:')

                    if (first_word not in stopwords and
                            last_word not in stopwords and
                            5 < len(phrase_clean) < 50):
                        all_phrases.add(phrase_clean)

            # Also extract single important words (proper nouns, long words).
            for word in words:
                clean_word = word.strip('.,!?;:"\' ')
                # Truthiness check alone guards the clean_word[0] access.
                if clean_word and (len(clean_word) > 6 or
                                   (clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                    all_phrases.add(clean_word)

        if not all_phrases:
            return None, None

        # Create context query from target URL info (truncated for embedding).
        target_context = f"{title} {meta_desc} {target_content}"[:500]

        # Score each phrase based on relevance to target.
        # FIX: was a bare `except:` — also swallowed KeyboardInterrupt/SystemExit.
        try:
            target_emb = embed([target_context])[0]
        except Exception as e:
            print(f"Error embedding target context: {e}")
            return None, None

        best_anchor = None
        best_score = -1
        best_sentence = None

        # Evaluate each potential anchor. FIX: sets iterate in arbitrary
        # order, so `list(all_phrases)[:50]` sampled a non-deterministic
        # subset; sorting makes the capped candidate set reproducible.
        for phrase in sorted(all_phrases)[:50]:  # Limit to 50 to bound embedding cost
            # Skip if identical to the original anchor (we want something different).
            if phrase.lower() == original_anchor.lower():
                continue

            try:
                # Score this phrase against the target context.
                phrase_emb = embed([phrase])[0]
                relevance_score = F.cosine_similarity(phrase_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()

                # Check if this phrase appears in article and find its best context.
                if phrase.lower() in full_text.lower():
                    # Find sentences containing this phrase.
                    for block in blocks:
                        if phrase.lower() in block.lower():
                            sents = re.split(r'(?<=[.!?])\s+', block)
                            for sent in sents:
                                if sent and phrase.lower() in sent.lower():
                                    # Score this sentence-phrase combination.
                                    # FIX: was a bare `except:` here too.
                                    try:
                                        sent_emb = embed([sent])[0]
                                        context_score = F.cosine_similarity(sent_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()
                                        combined_score = (relevance_score * 0.6) + (context_score * 0.4)

                                        if combined_score > best_score:
                                            best_score = combined_score
                                            best_anchor = phrase
                                            best_sentence = sent
                                    except Exception:
                                        continue
            except Exception as e:
                print(f"Error evaluating phrase '{phrase}': {e}")
                continue

        return best_anchor, best_sentence

    except Exception as e:
        print(f"Critical error in find_alternative_anchor: {e}")
        return None, None
 
430
def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
    """Suggest sentences in the source article where the anchor link fits best.

    Post-commit hardened variant: every stage has a fallback so the function
    always returns a usable list of result dicts, even on partial failure.

    Args:
        source_url: URL of the article to insert the link into.
        target_url: URL the anchor should point at.
        anchor_text: desired anchor text.
        top_k: number of candidate blocks to return suggestions for.
        suggest_alternative: when True and the anchor text is absent from the
            article, also propose an alternative anchor/sentence pair.

    Returns:
        List of result dicts; on unrecoverable failure a single-element list
        with an "error" key plus manual-insertion guidance.
    """
    try:
        blocks = get_text_blocks(source_url)
        if not blocks:
            return [{"error":"No text blocks found on the page."}]

        # Check if the keyword is present in the article (empty anchor counts
        # as absent — '"" in s' would otherwise always be True).
        full_text = " ".join(blocks).lower()
        keyword_present = anchor_text.lower() in full_text if anchor_text else False

        # Target context: best-effort fetch of the target page title.
        try:
            tgt_html = requests.get(target_url, timeout=20, headers=UA).text
            tt = BeautifulSoup(tgt_html, "html.parser").title
            tgt_title = tt.get_text().strip() if tt else ""
        except Exception as e:
            print(f"Error fetching target URL: {e}")
            tgt_title = ""

        ext = tldextract.extract(target_url)
        tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

        # First, find the best-matching blocks for the original anchor.
        query = f"{anchor_text} – relevant to: {tgt_title} ({tgt_domain})"

        try:
            q_emb = embed([query])[0]
            blk_embs = embed(blocks)
            sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks), 1))
            top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
        except Exception as e:
            print(f"Error in block embedding/similarity: {e}")
            # BUGFIX: q_emb was left undefined on this path but is used below;
            # define it as None so the sentence-selection step can skip cleanly
            # instead of raising NameError inside the per-block handler.
            q_emb = None
            # Fallback to first block.
            top_idx = [0]

        results = []
        for idx in top_idx:
            try:
                blk = blocks[min(idx, len(blocks) - 1)]  # Ensure valid index

                # Split sentences more carefully; keep only substantial ones.
                sents = re.split(r'(?<=[.!?])\s+', blk)
                sents = [s.strip() for s in sents if s and len(s.strip()) > 10]

                if not sents:
                    # If no valid sentences, use the whole block.
                    sents = [blk]

                best_sent = sents[0]  # Default to first sentence

                try:
                    # Only attempt embedding when the query embedding exists
                    # (the filter above already guarantees non-empty sents).
                    if q_emb is not None and sents:
                        s_embs = embed(sents)
                        s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents), 1))
                        si = int(torch.argmax(s_sims).item())  # .item() -> scalar
                        if 0 <= si < len(sents):
                            best_sent = sents[si]
                except Exception as e:
                    print(f"Error in sentence selection: {e}, using first sentence")
                    # Keep default (first sentence).

                # Ensure best_sent is valid before processing.
                if not best_sent or len(best_sent.strip()) == 0:
                    best_sent = blk if blk else "Unable to extract sentence from this section."

                rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)

                result = {
                    "anchor_was_present": exact_found,
                    "best_sentence_original": best_sent,
                    "best_sentence_with_anchor": rewritten_sent,
                    "keyword_in_article": keyword_present
                }

                # If anchor not present and alternative suggestion requested.
                if suggest_alternative and not keyword_present:
                    try:
                        # Find a completely different anchor and sentence.
                        alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)

                        if alt_anchor and alt_sentence:
                            # Create the sentence with the alternative anchor.
                            alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
                            result["alternative_anchor"] = alt_anchor
                            result["alternative_sentence_original"] = alt_sentence
                            result["alternative_sentence"] = alt_rewritten
                            result["alternative_exact_match"] = alt_exact
                    except Exception as e:
                        print(f"Error finding alternative anchor: {e}")
                        # Continue without alternative.

                results.append(result)

            except Exception as e:
                print(f"Error processing block {idx}: {e}")
                # Add a fallback result so the caller still gets an entry.
                results.append({
                    "anchor_was_present": False,
                    "best_sentence_original": blocks[0] if blocks else "Error extracting content",
                    "best_sentence_with_anchor": f"Error processing content. Please try adding the link manually: <a href='{target_url}'>{anchor_text}</a>",
                    "keyword_in_article": keyword_present
                })

        return results

    except Exception as e:
        print(f"Critical error in suggest_insertions: {e}")
        import traceback
        traceback.print_exc()
        return [{
            "error": f"Error processing the page: {str(e)}",
            "anchor_was_present": False,
            "best_sentence_original": "Error occurred",
            "best_sentence_with_anchor": f"Error occurred. Try manually: <a href='{target_url}'>{anchor_text}</a>",
            "keyword_in_article": False
        }]
549
  # =========================
550
  # OpenAI helpers with caching