dusan-presswhizz committed on
Commit
d107e20
·
verified ·
1 Parent(s): 4bc41f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -49
app.py CHANGED
@@ -226,34 +226,117 @@ def create_anchor_suggestion(anchor_text, target_url):
226
  ]
227
  return suggestions[0]
228
 
229
def extract_potential_anchors(sentence, target_url):
    """Extract potential anchor text phrases from a sentence.

    Candidates are, in order:

    1. every run of 2-5 consecutive whitespace-separated words whose first
       and last word are not stopwords (shorter runs first, left to right);
    2. single words that are longer than 7 characters or start with an
       uppercase letter, and are not stopwords.

    Args:
        sentence: The sentence to mine for anchor candidates.
        target_url: Unused; kept so existing callers keep working.

    Returns:
        A list with at most the first 5 candidates, with surrounding
        ``.,!?;:`` punctuation stripped from each.
    """
    # Remove very common words and extract meaningful phrases
    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'}
    punctuation = '.,!?;:'

    words = sentence.split()
    # Normalise every token once instead of once per phrase it takes part in.
    edge_tokens = [word.lower().strip(punctuation) for word in words]
    phrases = []

    # Generate phrases of 2-5 words
    for length in range(2, min(6, len(words) + 1)):
        for start in range(len(words) - length + 1):
            first_word = edge_tokens[start]
            last_word = edge_tokens[start + length - 1]
            # A token made only of punctuation (e.g. a detached ",") cleans
            # to "" and must not be allowed to open or close an anchor.
            if not first_word or not last_word:
                continue
            if first_word not in stopwords and last_word not in stopwords:
                phrase = ' '.join(words[start:start + length])
                phrases.append(phrase.strip(punctuation))

    # Also add significant single words (proper nouns, long words)
    for word in words:
        clean_word = word.strip(punctuation)
        # Guard against punctuation-only tokens: indexing "" would raise
        # IndexError in the uppercase check below.
        if not clean_word:
            continue
        if (len(clean_word) > 7 or clean_word[0].isupper()) and clean_word.lower() not in stopwords:
            phrases.append(clean_word)

    return phrases[:5]  # Return top 5 potential anchors
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
259
  blocks = get_text_blocks(source_url)
@@ -302,27 +385,14 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
302
 
303
  # If anchor not present and alternative suggestion requested
304
  if suggest_alternative and not keyword_present:
305
- # Extract potential anchor phrases from the best sentence
306
- potential_anchors = extract_potential_anchors(best_sent, target_url)
307
-
308
- # Find the best alternative anchor
309
- best_alternative = None
310
- best_alt_score = -1
311
 
312
- for alt_anchor in potential_anchors:
313
- # Check relevance to target
314
- alt_query = f"{alt_anchor} — relevant to: {tgt_title}"
315
- alt_q_emb = embed([alt_query])[0]
316
- alt_sim = F.cosine_similarity(alt_q_emb.unsqueeze(0), q_emb.unsqueeze(0)).item()
317
-
318
- if alt_sim > best_alt_score:
319
- best_alt_score = alt_sim
320
- best_alternative = alt_anchor
321
-
322
- if best_alternative:
323
- # Create alternative suggestion with the better anchor
324
- alt_rewritten, alt_exact = inject_anchor_into_sentence(best_sent, best_alternative, target_url)
325
- result["alternative_anchor"] = best_alternative
326
  result["alternative_sentence"] = alt_rewritten
327
  result["alternative_exact_match"] = alt_exact
328
 
@@ -531,18 +601,23 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
531
  # Process alternative anchor if requested and original anchor not found
532
  if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
533
  alt_anchor = res["alternative_anchor"]
 
534
  alt_sentence = res["alternative_sentence"]
535
 
 
 
 
 
536
  # Apply GPT rewriting to alternative as well
537
  if smart_rewrite:
538
- alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=language_name)
539
  alt_final = alt_g["sentence_html"]
540
  else:
541
  alt_final = alt_sentence
542
 
543
  # Polish if needed
544
  if not res.get("alternative_exact_match", False):
545
- alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=language_name)
546
  alt_final = alt_polished.get("sentence_html", alt_final)
547
 
548
  alt_output = to_plain_text(alt_final) if plain_text else alt_final
@@ -551,7 +626,7 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
551
  result += f"💡 OPTION 2 - Better anchor suggestion:\n\n"
552
  result += f"Since '{anchor_text}' is not in the article, consider using:\n"
553
  result += f"Suggested anchor: '{alt_anchor}'\n\n"
554
- result += f"Change this sentence:\n{original_sentence}\n\nWith this one:\n{alt_output}"
555
 
556
  return result
557
 
 
226
  ]
227
  return suggestions[0]
228
 
229
def _collect_candidate_phrases(full_text):
    """Return the set of candidate anchor phrases found in *full_text*."""
    strip_chars = '.,!?;:"\' '
    # Common words to exclude. The second group appears to be
    # Serbian/Croatian function words -- presumably for articles in that
    # language; confirm against the languages the tool is used with.
    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                 'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
                 'should', 'je', 'i', 'u', 'na', 'se', 'da', 'su', 'za', 'od', 'sa',
                 'po', 'iz', 'će', 'bi', 'ako', 'ali', 'jer', 'kada', 'gdje', 'što'}

    candidates = set()
    for sentence in re.split(r'[.!?]', full_text):
        words = sentence.split()
        # Clean each token once, with the same character set used for the
        # phrase itself, so a quoted stopword such as '"the' is still
        # recognised as a stopword.
        cleaned = [word.strip(strip_chars) for word in words]
        lowered = [token.lower() for token in cleaned]

        # Extract phrases of 2-4 words
        for length in range(2, min(5, len(words) + 1)):
            for start in range(len(words) - length + 1):
                first_word = lowered[start]
                last_word = lowered[start + length - 1]
                # Punctuation-only tokens clean to "" and may not open or
                # close a phrase.
                if not first_word or not last_word:
                    continue
                if first_word in stopwords or last_word in stopwords:
                    continue
                phrase_clean = ' '.join(words[start:start + length]).strip(strip_chars)
                if 5 < len(phrase_clean) < 50:
                    candidates.add(phrase_clean)

        # Also extract single important words (proper nouns, long words)
        for clean_word in cleaned:
            # Skip empty tokens: indexing "" below would raise IndexError.
            if not clean_word:
                continue
            if (len(clean_word) > 6 or
                    (clean_word[0].isupper() and clean_word.lower() not in stopwords)):
                candidates.add(clean_word)

    return candidates


def _fetch_target_context(target_url, fallback_text):
    """Return up to 500 characters describing the page at *target_url*.

    The text is the page title, its meta description and up to three of its
    first five paragraphs that are longer than 50 characters. If anything
    goes wrong while fetching or parsing, *fallback_text* is used as the
    only content.
    """
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        soup = BeautifulSoup(tgt_html, "html.parser")

        title = soup.title.get_text().strip() if soup.title else ""
        meta_tag = soup.find("meta", attrs={"name": "description"})
        meta_desc = meta_tag.get("content", "") if meta_tag else ""

        paragraph_texts = (p.get_text().strip() for p in soup.find_all("p")[:5])
        target_paragraphs = [text for text in paragraph_texts if len(text) > 50]
        target_content = " ".join(target_paragraphs[:3])
    except Exception as e:
        # Deliberately broad: this is a best-effort enrichment and a failed
        # fetch must not abort the suggestion, only degrade its context.
        print(f"Error fetching target URL: {e}")
        title = ""
        meta_desc = ""
        target_content = fallback_text

    return f"{title} {meta_desc} {target_content}"[:500]


def _target_similarity(text, target_emb):
    """Return the cosine similarity between *text* and the target embedding."""
    # NOTE(review): assumes embed() returns torch tensors and F is
    # torch.nn.functional -- both are defined outside this block.
    text_emb = embed([text])[0]
    return F.cosine_similarity(text_emb.unsqueeze(0), target_emb.unsqueeze(0)).item()


def find_alternative_anchor(blocks, target_url, original_anchor):
    """Find a better anchor text from the article that relates to the target URL.

    Candidate phrases are mined from the article text, then each
    (phrase, containing sentence) pair is scored as
    ``0.6 * sim(phrase, target) + 0.4 * sim(sentence, target)`` and the
    highest-scoring pair wins.

    Args:
        blocks: Text blocks of the source article (list of strings).
        target_url: URL of the page the anchor should link to.
        original_anchor: The anchor the user asked for; it is never
            returned and is used as fallback context if the target page
            cannot be fetched.

    Returns:
        ``(anchor, sentence)`` for the best pair, or ``(None, None)`` when
        the article yields no candidate or no candidate occurs in a
        sentence.
    """
    full_text = " ".join(blocks)
    all_phrases = _collect_candidate_phrases(full_text)

    # Nothing to score: return before paying for a network round-trip.
    if not all_phrases:
        return None, None

    target_context = _fetch_target_context(target_url, original_anchor)
    target_emb = embed([target_context])[0]

    # Split and lowercase the article once rather than once per phrase.
    sentences = [
        (sent, sent.lower())
        for block in blocks
        for sent in re.split(r'(?<=[.!?])\s+', block)
    ]

    original_lower = original_anchor.lower()
    # A sentence's score does not depend on the phrase, so embed it once.
    context_scores = {}

    best_anchor = None
    best_score = -1
    best_sentence = None

    # Sorted so that ties are broken the same way on every run; plain set
    # iteration order varies with string hash randomisation.
    for phrase in sorted(all_phrases):
        phrase_lower = phrase.lower()
        # Skip the original anchor itself (we want something different)
        if phrase_lower == original_lower:
            continue

        relevance_score = None
        for sent, sent_lower in sentences:
            if phrase_lower not in sent_lower:
                continue

            # Embed the phrase lazily: only once it is known to occur in
            # at least one sentence.
            if relevance_score is None:
                relevance_score = _target_similarity(phrase, target_emb)
            if sent not in context_scores:
                context_scores[sent] = _target_similarity(sent, target_emb)

            combined_score = (relevance_score * 0.6) + (context_scores[sent] * 0.4)
            if combined_score > best_score:
                best_score = combined_score
                best_anchor = phrase
                best_sentence = sent

    return best_anchor, best_sentence
340
 
341
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
342
  blocks = get_text_blocks(source_url)
 
385
 
386
  # If anchor not present and alternative suggestion requested
387
  if suggest_alternative and not keyword_present:
388
+ # Find a completely different anchor and sentence
389
+ alt_anchor, alt_sentence = find_alternative_anchor(blocks, target_url, anchor_text)
 
 
 
 
390
 
391
+ if alt_anchor and alt_sentence:
392
+ # Create the sentence with the alternative anchor
393
+ alt_rewritten, alt_exact = inject_anchor_into_sentence(alt_sentence, alt_anchor, target_url)
394
+ result["alternative_anchor"] = alt_anchor
395
+ result["alternative_sentence_original"] = alt_sentence
 
 
 
 
 
 
 
 
 
396
  result["alternative_sentence"] = alt_rewritten
397
  result["alternative_exact_match"] = alt_exact
398
 
 
601
  # Process alternative anchor if requested and original anchor not found
602
  if suggest_alternative_anchor and not res.get("keyword_in_article", True) and res.get("alternative_anchor"):
603
  alt_anchor = res["alternative_anchor"]
604
+ alt_sentence_original = res.get("alternative_sentence_original", res["best_sentence_original"])
605
  alt_sentence = res["alternative_sentence"]
606
 
607
+ # Detect language for alternative sentence
608
+ alt_detected_lang = detect_language(alt_sentence_original)
609
+ alt_language_name = get_language_name(alt_detected_lang)
610
+
611
  # Apply GPT rewriting to alternative as well
612
  if smart_rewrite:
613
+ alt_g = gpt_rewrite(alt_sentence, alt_anchor, target_url, style="neutral", language=alt_language_name)
614
  alt_final = alt_g["sentence_html"]
615
  else:
616
  alt_final = alt_sentence
617
 
618
  # Polish if needed
619
  if not res.get("alternative_exact_match", False):
620
+ alt_polished = gpt_validate_and_polish(alt_final, alt_anchor, target_url, language=alt_language_name)
621
  alt_final = alt_polished.get("sentence_html", alt_final)
622
 
623
  alt_output = to_plain_text(alt_final) if plain_text else alt_final
 
626
  result += f"💡 OPTION 2 - Better anchor suggestion:\n\n"
627
  result += f"Since '{anchor_text}' is not in the article, consider using:\n"
628
  result += f"Suggested anchor: '{alt_anchor}'\n\n"
629
+ result += f"Change this sentence:\n{alt_sentence_original}\n\nWith this one:\n{alt_output}"
630
 
631
  return result
632