dusan-presswhizz committed on
Commit
ab9bead
Β·
verified Β·
1 Parent(s): 0ee888e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +221 -239
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os, re, json, requests, urllib.parse, hashlib, html
2
  from functools import lru_cache
3
- from typing import List, Optional
4
 
5
  # Torch / Transformers
6
  import torch, torch.nn.functional as F
@@ -38,9 +38,9 @@ UA = {
38
  )
39
  }
40
 
41
- # --- OpenAI settings ---
42
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
43
- PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")
44
  FALLBACK_OPENAI_MODEL = "gpt-4o-mini"
45
  OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
46
 
@@ -222,6 +222,59 @@ def get_text_blocks(url: str, max_paragraphs: int = 8) -> List[str]:
222
  print(f"get_text_blocks fatal: {e}")
223
  return []
224
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  # =========================
226
  # Embedding helpers
227
  # =========================
@@ -247,7 +300,6 @@ def embed(texts: List[str]):
247
  def inject_anchor_into_sentence(sentence, anchor_text, target_url):
248
  if not sentence or not anchor_text:
249
  return sentence, False
250
- # prefer exact word-boundary replacement if present
251
  try:
252
  pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
253
  if pattern.search(sentence):
@@ -255,195 +307,141 @@ def inject_anchor_into_sentence(sentence, anchor_text, target_url):
255
  return result, True
256
  except Exception:
257
  pass
258
-
259
- # else append a natural clause
260
  if len(sentence) > 0 and sentence[-1] in '.!?':
261
  base, punct = sentence[:-1], sentence[-1]
262
  else:
263
  base, punct = sentence, '.'
264
- clause = f' with insights from <a href="{target_url}">{anchor_text}</a>'
265
- rewritten = f'{base}{clause}{punct}'
266
  return rewritten, False
267
 
268
  # =========================
269
- # OpenAI helpers (cached)
270
  # =========================
271
- def _openai_chat_cached(cache_key: str, model_name: str, system: str, user_json: dict):
272
- if cache_key in API_RESPONSE_CACHE:
273
- print(f"[GPT] Using cached response for {cache_key[:8]}...")
274
- return API_RESPONSE_CACHE[cache_key]
275
-
276
  if not OPENAI_API_KEY:
277
  raise RuntimeError("OPENAI_API_KEY not set")
278
 
279
  headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
280
  body = {
281
  "model": model_name,
282
- "response_format": {"type": "json_object"},
283
  "messages": [
284
  {"role": "system", "content": system},
285
- {"role": "user", "content": json.dumps(user_json)}
286
- ],
287
- "temperature": 0.6
288
  }
289
  r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
290
  print(f"[GPT] Model={model_name} HTTP {r.status_code}")
291
  r.raise_for_status()
292
  txt = r.json()["choices"][0]["message"]["content"]
293
- result = json.loads(txt)
294
-
295
- API_RESPONSE_CACHE[cache_key] = result
296
- return result
297
-
298
- def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral", language="English"):
299
- if not OPENAI_API_KEY:
300
- print("[GPT] No OPENAI_API_KEY found β†’ using fallback.")
301
- return {"sentence_html": sentence_html}
302
-
303
- cache_key = hashlib.md5(f"{sentence_html}{anchor_text}{target_url}{style}{language}".encode()).hexdigest()
304
- system = (
305
- f"You are a skilled content editor writing in {language}. "
306
- "Integrate the given anchor naturally into ONE sentence of similar length. "
307
- "STRICT: include an <a href> using the EXACT anchor text; no em dashes. "
308
- f"Return JSON with key sentence_html."
309
- )
310
- user = {
311
- "task": "rewrite_for_link_insertion",
312
- "sentence_html": sentence_html,
313
- "anchor_text": anchor_text,
314
- "target_url": target_url,
315
- "style": style,
316
- "language": language,
317
- "preserve_special_chars": True,
318
- "constraints": {"max_extra_words": 20}
319
- }
320
 
 
 
 
 
321
  try:
322
- obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
323
  except Exception as e:
324
  print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
325
- try:
326
- obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
327
- except Exception as e2:
328
- print(f"[GPT] Fallback failed: {e2}. Using original sentence.")
329
- return {"sentence_html": sentence_html}
330
-
331
- out = obj.get("sentence_html", sentence_html)
332
- return {"sentence_html": out}
333
 
334
- def gpt_validate_and_polish(sentence_html, anchor_text, target_url, language="English"):
 
 
 
335
  if not OPENAI_API_KEY:
336
  return {"sentence_html": sentence_html}
337
 
338
- cache_key = hashlib.md5(f"polish_{sentence_html}{anchor_text}{target_url}{language}".encode()).hexdigest()
339
  system = (
340
- f"You are an advanced editor writing in {language}. "
341
- "Input: a draft HTML sentence with an <a> link (anchor text fixed). "
342
- "Polish if natural; else rewrite (max 5 sentences). Keep anchor EXACT, href unchanged; no em dashes. "
343
- "Return JSON with key 'sentence_html'."
344
  )
345
  user = {
 
346
  "sentence_html": sentence_html,
347
  "anchor_text": anchor_text,
348
  "target_url": target_url,
349
- "language": language,
350
- "preserve_special_chars": True
351
  }
 
 
352
 
353
- try:
354
- obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
355
- except Exception:
356
- try:
357
- obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
358
- except Exception:
359
- return {"sentence_html": sentence_html}
360
-
361
- out = obj.get("sentence_html", sentence_html)
362
- soup = BeautifulSoup(out, "html.parser")
363
- if not soup.find("a"):
364
- return {"sentence_html": sentence_html}
365
- return {"sentence_html": out}
366
-
367
- def gpt_get_search_keywords(target_content, target_url):
368
  if not OPENAI_API_KEY:
369
- return ["related content", "learn more", "additional information"]
370
-
371
- content_preview = " ".join(target_content[:5]) if isinstance(target_content, list) else str(target_content)[:3000]
372
- cache_key = hashlib.md5(f"keywords_{target_url}_{content_preview[:500]}".encode()).hexdigest()
373
  system = (
374
- "You are an SEO expert. Identify 5-10 realistic search keywords users would type to find this page. "
375
- "Return JSON: {'keywords': [...]}"
376
  )
377
- user = {
378
- "task": "identify_search_keywords",
379
- "page_content": content_preview,
380
- "url": target_url,
381
- "requirements": {"count": "5-10", "type": "practical"}
382
- }
383
-
384
- try:
385
- obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
386
- except Exception as e:
387
- print(f"[GPT] Keywords extraction failed: {e}")
388
- return ["related content", "learn more", "additional information"]
389
-
390
- return obj.get("keywords", ["related content"])
391
 
392
  def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
393
  if not OPENAI_API_KEY or not keywords:
394
  return None
395
-
396
  source_preview = " ".join(source_blocks[:3])[:500]
397
- cache_key = hashlib.md5(f"generate_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()).hexdigest()
398
  system = (
399
  f"You are a skilled content writer in {language}. Given article paragraphs and keyword candidates "
400
- "for a target link, do: 1) choose ONE best keyword; 2) write 1-3 sentences including it as an <a href>; "
401
- "3) provide the exact source sentence AFTER WHICH to insert. "
402
  "Return JSON keys: chosen_keyword, new_content, insert_after_sentence."
403
  )
404
  user = {
405
  "article_paragraphs": source_blocks[:7],
406
  "available_keywords": keywords,
407
  "target_url": target_url,
408
- "language": language,
409
- "requirements": {"natural_flow": True, "include_link": True}
410
  }
411
-
412
- try:
413
- obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
414
- return obj
415
- except Exception as e:
416
- print(f"[GPT] Content generation failed: {e}")
417
- try:
418
- obj = _openai_chat_cached(cache_key + "_fallback", FALLBACK_OPENAI_MODEL, system, user)
419
- return obj
420
- except Exception:
421
- return None
422
 
423
  def to_plain_text(html_or_text: str) -> str:
424
  text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
425
  return html.unescape(text)
426
 
427
  # =========================
428
- # Core logic (with ANCHOR-FIRST fix)
429
  # =========================
 
 
 
 
 
 
 
 
 
430
  def find_alternative_anchor(blocks, target_url, original_anchor):
431
  try:
432
- print(f"[Alternative] Extracting target page content from {target_url}")
433
- target_blocks = get_text_blocks(target_url, max_paragraphs=5)
434
- if not target_blocks:
435
- print("[Alternative] No content extracted from target page")
436
- return None, None
437
-
438
- keywords = gpt_get_search_keywords(target_blocks, target_url)
439
- print(f"[Alternative] Keywords identified: {keywords}")
440
- if not keywords or not isinstance(keywords, list):
441
  return None, None
442
 
443
  source_text = " ".join(blocks[:2])
444
- detected_lang = detect_language(source_text)
445
- language_name = get_language_name(detected_lang)
446
- print(f"[Alternative] Detected language: {language_name}")
447
 
448
  result = gpt_generate_content_with_keyword(
449
  source_blocks=blocks,
@@ -454,7 +452,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
454
  if not result:
455
  return None, None
456
 
457
- chosen_keyword = result.get("chosen_keyword", keywords[0] if keywords else original_anchor)
458
  new_content = result.get("new_content", "")
459
  insert_after_sentence = result.get("insert_after_sentence", "")
460
 
@@ -469,7 +467,7 @@ def find_alternative_anchor(blocks, target_url, original_anchor):
469
  return chosen_keyword, f"{position_text}\n\n{new_content}" if position_text else new_content
470
 
471
  except Exception as e:
472
- print(f"[Alternative] Critical error: {e}")
473
  return None, None
474
 
475
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
@@ -483,109 +481,83 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alt
483
  print("="*50)
484
 
485
  full_text = " ".join(blocks)
486
- anchor_text_lower = anchor_text.lower() if anchor_text else ""
487
-
488
- # Is the anchor anywhere in the article?
489
  keyword_present = _contains_anchor(full_text, anchor_text)
490
 
491
- # If anchor is present, FORCE using the first block that contains it
 
 
 
 
 
 
492
  if keyword_present:
493
- print("Anchor present in article β†’ using anchor-first strategy.")
494
  anchor_block_indices = [i for i, b in enumerate(blocks) if _contains_anchor(b, anchor_text)]
495
  top_idx = [anchor_block_indices[0]] if anchor_block_indices else [0]
496
  else:
497
- # No anchor present: use similarity search to choose the best block
498
- print("Anchor NOT present β†’ using similarity strategy.")
499
- # Get a bit of target context for the query
500
- try:
501
- tgt_html = requests.get(target_url, timeout=20, headers=UA).text
502
- tt = BeautifulSoup(tgt_html, "html.parser").title
503
- tgt_title = tt.get_text().strip() if tt else ""
504
- except Exception as e:
505
- print(f"Error fetching target URL: {e}")
506
- tgt_title = ""
507
- ext = tldextract.extract(target_url)
508
- tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
509
- query = f"{anchor_text} β€” relevant to: {tgt_title} ({tgt_domain})"
510
-
511
  try:
512
  q_emb = embed([query])[0]
513
  blk_embs = embed(blocks)
514
  sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
515
  top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
516
  except Exception as e:
517
- print(f"Error in block embedding/similarity: {e}")
518
  top_idx = [0]
519
 
520
  results = []
521
  for idx in top_idx:
522
- try:
523
- idx = min(idx, len(blocks)-1)
524
- blk = blocks[idx]
525
-
526
- # Split into sentences (also split on newlines)
527
- sents = re.split(r'(?<=[.!?])\s+|\n+', blk)
528
- sents = [s.strip() for s in sents if s and len(s.strip()) > 10]
529
- if not sents:
530
- sents = [blk]
531
-
532
- # If anchor is present overall and in this block, pick the sentence that CONTAINS the anchor
533
- best_sent = None
534
- if keyword_present and _contains_anchor(blk, anchor_text):
535
- anchor_sents = [s for s in sents if _contains_anchor(s, anchor_text)]
536
- if anchor_sents:
537
- best_sent = anchor_sents[0]
538
-
539
- # Otherwise, fall back to embedding-based sentence choice
540
- if best_sent is None:
541
- try:
542
- # Build a lightweight query
543
- query_sent = f"{anchor_text} context"
544
- q_emb_s = embed([query_sent])[0]
545
- s_embs = embed(sents)
546
- s_sims = F.cosine_similarity(s_embs, q_emb_s.repeat(len(sents),1))
547
- si = int(torch.argmax(s_sims).item())
548
- best_sent = sents[si]
549
- except Exception as e:
550
- print(f"Error in sentence selection: {e}, using first sentence")
551
- best_sent = sents[0]
552
-
553
- if not best_sent or len(best_sent.strip()) == 0:
554
- best_sent = blk if blk else "Unable to extract sentence from this section."
555
-
556
- # Anchor presence in the selected sentence
557
- anchor_found_in_sentence = _contains_anchor(best_sent, anchor_text)
558
-
559
- rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
560
-
561
- result = {
562
- "anchor_was_present": anchor_found_in_sentence,
563
- "best_sentence_original": best_sent,
564
- "best_sentence_with_anchor": rewritten_sent,
565
- "keyword_in_article": keyword_present
566
- }
567
-
568
- if suggest_alternative and not keyword_present:
569
- try:
570
- alt_anchor, alt_content = find_alternative_anchor(blocks, target_url, anchor_text)
571
- if alt_anchor and alt_content:
572
- result["alternative_anchor"] = alt_anchor
573
- result["alternative_sentence_original"] = ""
574
- result["alternative_sentence"] = alt_content
575
- result["alternative_exact_match"] = True
576
- except Exception as e:
577
- print(f"Error generating alternative content: {e}")
578
-
579
- results.append(result)
580
-
581
- except Exception as e:
582
- print(f"Error processing block {idx}: {e}")
583
- results.append({
584
- "anchor_was_present": False,
585
- "best_sentence_original": blocks[0] if blocks else "Error extracting content",
586
- "best_sentence_with_anchor": f"Error processing content. Please try adding the link manually: <a href='{target_url}'>{anchor_text}</a>",
587
- "keyword_in_article": keyword_present
588
- })
589
 
590
  return results
591
 
@@ -632,35 +604,45 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, sug
632
  anchor_was_present = res.get("anchor_was_present", False)
633
  keyword_in_article = res.get("keyword_in_article", False)
634
 
 
 
 
 
 
 
 
 
 
635
  if keyword_in_article:
636
- if anchor_was_present:
637
- final_html = draft_html
638
- else:
639
- final_html = draft_html
640
- if smart_rewrite:
641
- g = gpt_rewrite(final_html, anchor_text, target_url, style="neutral", language=language_name)
642
- final_html = g["sentence_html"]
643
- polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
644
- final_html = polished.get("sentence_html", final_html)
645
-
646
- final_output = to_plain_text(final_html) if plain_text else final_html
647
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
648
  result += "πŸ”— Add link here:\n\n"
649
  result += f"{final_output}"
650
  else:
651
- final_html = draft_html
652
- if smart_rewrite:
653
- g = gpt_rewrite(final_html, anchor_text, target_url, style="neutral", language=language_name)
654
- final_html = g["sentence_html"]
655
- polished = gpt_validate_and_polish(final_html, anchor_text, target_url, language=language_name)
656
- final_html = polished.get("sentence_html", final_html)
657
- final_output = to_plain_text(final_html) if plain_text else final_html
658
-
659
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
660
  result += "πŸ”— Result 1 - Suggested placement:\n\n"
661
  result += f"Change this sentence: {original_sentence}\n\n"
662
  result += f"With this one: {final_output}"
663
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
664
  return result
665
 
666
  def clear_cache():
@@ -714,10 +696,10 @@ with gr.Blocks(title=f"Link Insertion Helper β€’ GPT: {gpt_status}") as demo:
714
 
715
  gr.Markdown("""
716
  ### Features:
717
- - 🌍 **Auto Language Detection**: Preserves special characters (Δ‡, č, Ε‘, ΕΎ, Δ‘, etc.)
718
- - πŸ’Ύ **Smart Caching**: Caches embeddings and API responses for faster repeated queries
719
- - 🎯 **Anchor-First Placement**: If the anchor exists, pick the exact sentence containing it
720
- - 🧠 **Similarity Fallback**: If not found, suggest the most relevant sentence via embeddings
721
  - 🧰 **Robust Extraction**: Trafilatura + BS4; optional PDF/Cloudflare handling
722
  """)
723
 
 
1
  import os, re, json, requests, urllib.parse, hashlib, html
2
  from functools import lru_cache
3
+ from typing import List, Optional, Tuple
4
 
5
  # Torch / Transformers
6
  import torch, torch.nn.functional as F
 
38
  )
39
  }
40
 
41
+ # --- OpenAI settings (simplified for GPT-5) ---
42
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
43
+ PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # simplified per your request
44
  FALLBACK_OPENAI_MODEL = "gpt-4o-mini"
45
  OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
46
 
 
222
  print(f"get_text_blocks fatal: {e}")
223
  return []
224
 
225
# -------- target context helpers --------
def get_target_context(url: str) -> Tuple[str, str, str, List[str]]:
    """Fetch *url* and return (title, meta_description, h1, content_blocks).

    Each element degrades to an empty string/list when the page cannot be
    fetched or the corresponding tag is absent.
    """
    title, meta, h1 = "", "", ""
    blocks: List[str] = []
    try:
        resp = _fetch_bytes(url)
        if not resp:
            return title, meta, h1, blocks
        soup = BeautifulSoup(resp.text, "html.parser")
        if soup.title and soup.title.get_text():
            title = soup.title.get_text().strip()
        md = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property": "og:description"})
        if md and md.get("content"):
            meta = md["content"].strip()
        h1_tag = soup.find("h1")
        if h1_tag:
            h1 = h1_tag.get_text(" ", strip=True)
    except Exception as e:
        print(f"[target] soup err: {e}")

    # text blocks via trafilatura/BS4 too
    tb = get_text_blocks(url, max_paragraphs=6)
    if tb:
        blocks = tb
    return title, meta, h1, blocks
253
def keyword_fallback_from_title_domain(title: str, url: str) -> List[str]:
    """Heuristic keyword guesses derived from the page title and domain name.

    Used when GPT keyword extraction returns nothing; always yields at most
    8 candidates, falling back to generic phrases when both inputs are empty.
    """
    ext = tldextract.extract(url)
    brand = (ext.domain or "").replace("-", " ").strip()

    candidates: List[str] = []
    if title:
        normalized = _norm(title)
        # crude noun-ish picks: non-trivial tokens from the title
        candidates.extend([tok for tok in normalized.split() if len(tok) >= 4][:6])
    if brand:
        # domain-derived guesses
        candidates.extend((brand, f"{brand} reviews", f"{brand} guide"))

    # order-preserving dedupe of stripped, non-empty candidates
    deduped = list(dict.fromkeys(c.strip() for c in candidates if c.strip()))

    # some generic fallbacks if still empty
    if not deduped:
        deduped = ["learn more", "full guide", "product details"]
    return deduped[:8]
277
+
278
  # =========================
279
  # Embedding helpers
280
  # =========================
 
300
  def inject_anchor_into_sentence(sentence, anchor_text, target_url):
301
  if not sentence or not anchor_text:
302
  return sentence, False
 
303
  try:
304
  pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
305
  if pattern.search(sentence):
 
307
  return result, True
308
  except Exception:
309
  pass
310
+ # else append a natural clause (no em dashes)
 
311
  if len(sentence) > 0 and sentence[-1] in '.!?':
312
  base, punct = sentence[:-1], sentence[-1]
313
  else:
314
  base, punct = sentence, '.'
315
+ rewritten = f'{base} {anchor_text}.' if anchor_text.lower().startswith("http") else f'{base} <a href="{target_url}">{anchor_text}</a>{punct}'
 
316
  return rewritten, False
317
 
318
  # =========================
319
+ # OpenAI helpers (SIMPLE BODY for GPT-5)
320
  # =========================
321
def _openai_chat_simple(model_name: str, system: str, user_json: dict):
    """POST a minimal chat-completions request (model + messages only;
    no response_format / max_tokens etc.).

    Returns the parsed JSON content of the reply, or {"text": raw} when the
    model answers with plain text. Raises RuntimeError when no API key is
    configured and requests.HTTPError on non-2xx responses.
    """
    if not OPENAI_API_KEY:
        raise RuntimeError("OPENAI_API_KEY not set")

    payload = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user_json, ensure_ascii=False)},
        ],
    }
    auth_headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
    resp = requests.post(OPENAI_CHAT_URL, headers=auth_headers, json=payload, timeout=60)
    print(f"[GPT] Model={model_name} HTTP {resp.status_code}")
    resp.raise_for_status()
    content = resp.json()["choices"][0]["message"]["content"]
    try:
        return json.loads(content)
    except Exception:
        # if model returns plain text, wrap it
        return {"text": content}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
def _openai_chat_cached(cache_key: str, model_name: str, system: str, user_json: dict):
    """Call _openai_chat_simple with an in-memory cache and one fallback retry.

    Successful results are memoized in API_RESPONSE_CACHE under *cache_key*;
    failures are never cached. When the requested model fails and is not
    already FALLBACK_OPENAI_MODEL, the request is retried once against the
    fallback; a second failure propagates to the caller.
    """
    if cache_key in API_RESPONSE_CACHE:
        print(f"[GPT] Using cached response for {cache_key[:8]}...")
        return API_RESPONSE_CACHE[cache_key]
    try:
        result = _openai_chat_simple(model_name, system, user_json)
    except Exception as e:
        # Don't re-send the identical request when the preferred model
        # already IS the fallback model.
        if model_name == FALLBACK_OPENAI_MODEL:
            raise
        print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
        result = _openai_chat_simple(FALLBACK_OPENAI_MODEL, system, user_json)
    API_RESPONSE_CACHE[cache_key] = result
    return result
 
 
 
 
 
357
 
358
def gpt_rewrite(sentence_html, anchor_text, target_url, language="English", target_context: str = ""):
    """Target-aware rewrite of *sentence_html* so it naturally carries the anchor.

    No 'avoid click here' restriction (supports generic anchors). Degrades
    gracefully: returns the input sentence unchanged when no API key is set
    or when both the preferred and fallback models fail, so the UI never
    crashes on an API outage. Always returns {"sentence_html": ...}.
    """
    if not OPENAI_API_KEY:
        return {"sentence_html": sentence_html}

    cache_key = hashlib.md5(f"rw_{sentence_html}{anchor_text}{target_url}{language}{target_context}".encode()).hexdigest()
    system = (
        f"You are a precise editor writing in {language}. "
        "Integrate the provided anchor naturally into the sentence (or add a short clause). "
        "Keep tone and length similar; no em dashes. Return JSON with key 'sentence_html' only."
    )
    user = {
        "task": "rewrite_for_link_insertion",
        "sentence_html": sentence_html,
        "anchor_text": anchor_text,
        "target_url": target_url,
        "target_context": target_context,
        "language": language
    }
    try:
        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        # Both models failed: keep the original sentence instead of raising.
        print(f"[GPT] Rewrite failed: {e}. Using original sentence.")
        return {"sentence_html": sentence_html}
    return {"sentence_html": obj.get("sentence_html", obj.get("text", sentence_html))}
381
 
382
def gpt_get_search_keywords_from_context(ctx_text: str, target_url: str) -> List[str]:
    """Ask the model for 5-10 realistic search keywords for the target page.

    Returns [] when no API key is configured, when the API call fails, or
    when the model response lacks a usable 'keywords' list — callers treat
    an empty list as "use the heuristic fallback".
    """
    if not OPENAI_API_KEY:
        return []
    cache_key = hashlib.md5(f"kw_{target_url}_{ctx_text[:600]}".encode()).hexdigest()
    system = (
        "You are an SEO assistant. From the provided target page context, return 5-10 realistic keyword phrases "
        "users would search for to find it. Return JSON {'keywords': [...] } only."
    )
    user = {"url": target_url, "context": ctx_text}
    try:
        obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        # API outage must not abort alternative-anchor generation.
        print(f"[GPT] Keyword extraction failed: {e}")
        return []
    keywords = obj.get("keywords", [])
    # Guard against malformed model output (e.g. a plain string instead of a list).
    return keywords if isinstance(keywords, list) else []
 
 
 
 
 
 
 
 
 
 
 
393
 
394
def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
    """Pick the best keyword and draft 1-2 linked sentences plus a placement hint.

    Returns the model's dict (keys: chosen_keyword, new_content,
    insert_after_sentence) or None when prerequisites are missing or the
    API call fails — callers check `if not result`.
    """
    if not OPENAI_API_KEY or not keywords:
        return None
    source_preview = " ".join(source_blocks[:3])[:500]
    cache_key = hashlib.md5(f"gen_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()).hexdigest()
    system = (
        f"You are a skilled content writer in {language}. Given article paragraphs and keyword candidates "
        "for a target link, do: 1) choose ONE best keyword; 2) write 1-2 natural sentences that include it "
        "as an <a href> to target_url; 3) provide the exact source sentence AFTER WHICH to insert. "
        "Return JSON keys: chosen_keyword, new_content, insert_after_sentence."
    )
    user = {
        "article_paragraphs": source_blocks[:7],
        "available_keywords": keywords,
        "target_url": target_url,
        "language": language
    }
    try:
        return _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        # Preserve the documented None-on-failure contract instead of raising.
        print(f"[GPT] Content generation failed: {e}")
        return None
 
 
 
 
 
 
 
 
 
413
 
414
def to_plain_text(html_or_text: str) -> str:
    """Strip all HTML tags and unescape entities, returning plain text."""
    soup = BeautifulSoup(html_or_text, "html.parser")
    stripped = soup.get_text(separator=" ", strip=True)
    return html.unescape(stripped)
417
 
418
  # =========================
419
+ # Core logic (ANCHOR-FIRST + TARGET-AWARE)
420
  # =========================
421
def build_target_context_string(target_url: str) -> str:
    """Summarize the target page (title/meta/H1/first body blocks) for prompts.

    Empty fields are omitted; the combined string is capped at 2000 chars.
    """
    title, meta, h1, blocks = get_target_context(target_url)
    parts = []
    for label, value in (("Title", title), ("Meta", meta), ("H1", h1)):
        if value:
            parts.append(f"{label}: {value}")
    if blocks:
        parts.append("Body: " + " ".join(blocks[:3]))
    return "\n".join(parts)[:2000]
429
+
430
  def find_alternative_anchor(blocks, target_url, original_anchor):
431
  try:
432
+ ctx = build_target_context_string(target_url)
433
+ print(f"[Alt] Target context len={len(ctx)}")
434
+ keywords = gpt_get_search_keywords_from_context(ctx, target_url)
435
+ if not keywords:
436
+ # Heuristic fallback from title/domain if GPT/ctx is weak
437
+ title, _, _, _ = get_target_context(target_url)
438
+ keywords = keyword_fallback_from_title_domain(title, target_url)
439
+
440
+ if not keywords:
441
  return None, None
442
 
443
  source_text = " ".join(blocks[:2])
444
+ language_name = get_language_name(detect_language(source_text))
 
 
445
 
446
  result = gpt_generate_content_with_keyword(
447
  source_blocks=blocks,
 
452
  if not result:
453
  return None, None
454
 
455
+ chosen_keyword = result.get("chosen_keyword", keywords[0])
456
  new_content = result.get("new_content", "")
457
  insert_after_sentence = result.get("insert_after_sentence", "")
458
 
 
467
  return chosen_keyword, f"{position_text}\n\n{new_content}" if position_text else new_content
468
 
469
  except Exception as e:
470
+ print(f"[Alt] Critical error: {e}")
471
  return None, None
472
 
473
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
 
481
  print("="*50)
482
 
483
  full_text = " ".join(blocks)
 
 
 
484
  keyword_present = _contains_anchor(full_text, anchor_text)
485
 
486
+ # Build target-aware query
487
+ t_title, t_meta, t_h1, _ = get_target_context(target_url)
488
+ ext = tldextract.extract(target_url)
489
+ tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
490
+ query = f"{anchor_text} β€” relevant to: {t_title or t_h1} | {t_meta} ({tgt_domain})"
491
+
492
+ # Choose candidate block indices
493
  if keyword_present:
494
+ print("Anchor present β†’ use the first block containing it.")
495
  anchor_block_indices = [i for i, b in enumerate(blocks) if _contains_anchor(b, anchor_text)]
496
  top_idx = [anchor_block_indices[0]] if anchor_block_indices else [0]
497
  else:
498
+ print("Anchor NOT present β†’ similarity search with target context.")
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  try:
500
  q_emb = embed([query])[0]
501
  blk_embs = embed(blocks)
502
  sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
503
  top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
504
  except Exception as e:
505
+ print(f"Similarity error: {e}")
506
  top_idx = [0]
507
 
508
  results = []
509
  for idx in top_idx:
510
+ idx = min(idx, len(blocks)-1)
511
+ blk = blocks[idx]
512
+
513
+ # Split into sentences (also split on newlines)
514
+ sents = re.split(r'(?<=[.!?])\s+|\n+', blk)
515
+ sents = [s.strip() for s in sents if s and len(s.strip()) > 10]
516
+ if not sents:
517
+ sents = [blk]
518
+
519
+ # If anchor is present in block, pick the sentence that contains it
520
+ best_sent = None
521
+ if keyword_present and _contains_anchor(blk, anchor_text):
522
+ anchor_sents = [s for s in sents if _contains_anchor(s, anchor_text)]
523
+ if anchor_sents:
524
+ best_sent = anchor_sents[0]
525
+
526
+ # Otherwise, choose via sentence-level similarity against target-aware mini query
527
+ if best_sent is None:
528
+ try:
529
+ q_emb_s = embed([f"{anchor_text} {t_title} {t_h1}"])[0]
530
+ s_embs = embed(sents)
531
+ s_sims = F.cosine_similarity(s_embs, q_emb_s.repeat(len(sents),1))
532
+ si = int(torch.argmax(s_sims).item())
533
+ best_sent = sents[si]
534
+ except Exception as e:
535
+ print(f"Sentence selection error: {e}")
536
+ best_sent = sents[0]
537
+
538
+ if not best_sent or len(best_sent.strip()) == 0:
539
+ best_sent = blk
540
+
541
+ # Inject anchor (or append clause)
542
+ rewritten_sent, _ = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
543
+
544
+ result = {
545
+ "anchor_was_present": _contains_anchor(best_sent, anchor_text),
546
+ "best_sentence_original": best_sent,
547
+ "best_sentence_with_anchor": rewritten_sent,
548
+ "keyword_in_article": keyword_present
549
+ }
550
+
551
+ # Alternative anchor & content
552
+ if suggest_alternative and not keyword_present:
553
+ alt_anchor, alt_content = find_alternative_anchor(blocks, target_url, anchor_text)
554
+ if alt_anchor and alt_content:
555
+ result["alternative_anchor"] = alt_anchor
556
+ result["alternative_sentence_original"] = ""
557
+ result["alternative_sentence"] = alt_content
558
+ result["alternative_exact_match"] = True
559
+
560
+ results.append(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
 
562
  return results
563
 
 
604
  anchor_was_present = res.get("anchor_was_present", False)
605
  keyword_in_article = res.get("keyword_in_article", False)
606
 
607
+ final_html = draft_html
608
+ if smart_rewrite:
609
+ # Pass target context to the rewrite so it aligns with the target page topic
610
+ ctx = build_target_context_string(target_url)
611
+ g = gpt_rewrite(final_html, anchor_text, target_url, language=language_name, target_context=ctx)
612
+ final_html = g["sentence_html"]
613
+
614
+ final_output = to_plain_text(final_html) if plain_text else final_html
615
+
616
  if keyword_in_article:
 
 
 
 
 
 
 
 
 
 
 
617
  result = warn + f"βœ… **Anchor text '{anchor_text}' found in article!**\n\n"
618
  result += "πŸ”— Add link here:\n\n"
619
  result += f"{final_output}"
620
  else:
 
 
 
 
 
 
 
 
621
  result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
622
  result += "πŸ”— Result 1 - Suggested placement:\n\n"
623
  result += f"Change this sentence: {original_sentence}\n\n"
624
  result += f"With this one: {final_output}"
625
 
626
+ # Show alternative if available
627
+ if suggest_alternative_anchor and res.get("alternative_anchor"):
628
+ alt_anchor = res["alternative_anchor"]
629
+ alt_content = res.get("alternative_sentence", "")
630
+ if alt_content:
631
+ if "[Insert after:" in alt_content:
632
+ parts = alt_content.split("\n\n", 1)
633
+ position_info = parts[0] if len(parts) > 0 else ""
634
+ actual_content = parts[1] if len(parts) > 1 else alt_content
635
+ else:
636
+ position_info = ""
637
+ actual_content = alt_content
638
+ alt_output = to_plain_text(actual_content) if plain_text else actual_content
639
+ result += f"\n\n{'='*50}\n\n"
640
+ result += "πŸ”— Result 2 - Suggested new anchor and placement:\n"
641
+ result += f"πŸ’‘ Using keyword: '{alt_anchor}'\n"
642
+ if position_info and "[Insert after:" in position_info:
643
+ result += f"πŸ“ {position_info}\n"
644
+ result += f"\n{alt_output}"
645
+
646
  return result
647
 
648
  def clear_cache():
 
696
 
697
  gr.Markdown("""
698
  ### Features:
699
+ - 🌍 **Auto Language Detection** (Δ‡, č, Ε‘, ΕΎ, Δ‘ preserved)
700
+ - 🎯 **Anchor-First** if present; otherwise **Target-Aware** similarity
701
+ - 🧠 **Target-Aware Rewrite** (uses title/meta/H1/body from the target page)
702
+ - πŸ”„ **Alternative Anchor** with GPT + heuristic fallback (always tries to return Result 2)
703
  - 🧰 **Robust Extraction**: Trafilatura + BS4; optional PDF/Cloudflare handling
704
  """)
705