import os, re, json, requests, urllib.parse, hashlib, html
from functools import lru_cache
from typing import List, Optional, Tuple

# Torch / Transformers
import torch, torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Parsing / Extraction
from bs4 import BeautifulSoup
import tldextract
import trafilatura

# Optional fallbacks
try:
    import cloudscraper
    HAS_CLOUDSCRAPER = True
except Exception:
    HAS_CLOUDSCRAPER = False

try:
    from pdfminer.high_level import extract_text as pdf_extract_text
    HAS_PDFMINER = True
except Exception:
    HAS_PDFMINER = False

# UI
import gradio as gr

# =========================
# Config
# =========================
MODEL = "michiyasunaga/LinkBERT-base"
UA = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# --- OpenAI settings (simplified for GPT-5) ---
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")   # per your request
FALLBACK_OPENAI_MODEL  = "gpt-4o-mini"
OPENAI_CHAT_URL        = "https://api.openai.com/v1/chat/completions"

# Caches
EMBEDDING_CACHE = {}
API_RESPONSE_CACHE = {}

# =========================
# Load LinkBERT (sentence-level embeddings)
# =========================
tok = AutoTokenizer.from_pretrained(MODEL)
enc = AutoModel.from_pretrained(MODEL)

# =========================
# Language Detection
# =========================
from langdetect import detect, LangDetectException

def detect_language(text: str) -> str:
    try:
        return detect(text)
    except LangDetectException:
        return 'en'

def get_language_name(lang_code: str) -> str:
    lang_map = {
        'en': 'English', 'es': 'Spanish', 'fr': 'French', 'de': 'German',
        'it': 'Italian', 'pt': 'Portuguese', 'ru': 'Russian', 'ja': 'Japanese',
        'ko': 'Korean', 'zh': 'Chinese', 'ar': 'Arabic', 'hi': 'Hindi',
        'sr': 'Serbian', 'hr': 'Croatian', 'bs': 'Bosnian', 'sl': 'Slovenian',
        'mk': 'Macedonian', 'bg': 'Bulgarian', 'cs': 'Czech', 'sk': 'Slovak',
        'pl': 'Polish', 'uk': 'Ukrainian', 'ro': 'Romanian', 'hu': 'Hungarian'
    }
    return lang_map.get(lang_code, 'English')

# =========================
# Helpers
# =========================
def looks_like_url(text: str) -> bool:
    if not text:
        return False
    text = text.strip()
    if re.match(r'^(https?://)', text, flags=re.I):
        return True
    parts = urllib.parse.urlparse("http://" + text if "://" not in text else text)
    return bool(parts.netloc and "." in parts.netloc)

def normalize_url(url: str) -> str:
    if not url:
        return url
    if not re.match(r'^https?://', url, flags=re.I):
        return "https://" + url
    return url

def _norm(s: str) -> str:
    return re.sub(r'\s+', ' ', re.sub(r'[^a-z0-9 ]', ' ', s.lower())).strip()

def _contains_anchor(text: str, anchor: str) -> bool:
    if not text or not anchor:
        return False
    t = _norm(text)
    a = _norm(anchor)
    return a in t

# =========================
# Robust fetching + text extraction
# =========================
def _fetch_bytes(url: str, timeout: int = 25) -> Optional[requests.Response]:
    sess = requests.Session()
    sess.headers.update({
        "User-Agent": UA["User-Agent"],
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
    })
    try:
        r = sess.get(url, timeout=timeout, allow_redirects=True)
        print(f"[fetch] requests: {r.status_code} {len(r.content)} bytes from {r.url}")
        if r.ok and r.content:
            return r
    except Exception as e:
        print(f"[fetch] requests error: {e}")

    if HAS_CLOUDSCRAPER:
        try:
            scraper = cloudscraper.create_scraper(browser={'custom': UA["User-Agent"]})
            r = scraper.get(url, timeout=timeout, allow_redirects=True)
            print(f"[fetch] cloudscraper: {r.status_code} {len(r.content)} bytes from {r.url}")
            if r.ok and r.content:
                return r
        except Exception as e:
            print(f"[fetch] cloudscraper error: {e}")

    return None

def _split_to_blocks(raw: str, max_paragraphs: int) -> List[str]:
    raw = re.sub(r'\r', '\n', raw)
    raw = re.sub(r'\n{3,}', '\n\n', raw)
    chunks = [c.strip() for c in re.split(r'\n\s*\n', raw) if c.strip()]
    blocks: List[str] = []
    for c in chunks:
        merged = re.sub(r'\s*\n\s*', ' ', c)
        if len(merged) >= 40:
            blocks.append(merged)
            if len(blocks) >= max_paragraphs:
                break
    return blocks

def get_text_blocks(url: str, max_paragraphs: int = 8) -> List[str]:
    try:
        if re.search(r'\.pdf($|\?)', url, flags=re.I):
            if HAS_PDFMINER:
                try:
                    r = _fetch_bytes(url)
                    if not r:
                        print("PDF fetch failed.")
                        return []
                    txt = pdf_extract_text(fp=bytes(r.content))
                    blocks = _split_to_blocks(txt or "", max_paragraphs)
                    print(f"PDF extracted {len(blocks)} blocks")
                    return blocks
                except Exception as pe:
                    print(f"PDF extract error: {pe}")
                    return []
            else:
                print("PDF detected but pdfminer.six not installed.")
                return []

        r = _fetch_bytes(url)
        if not r:
            print("No response fetched (blocked or network).")
            return []

        try:
            txt = trafilatura.extract(
                r.content,
                base_url=r.url,
                include_comments=False,
                include_tables=False,
                deduplicate=True,
                output_format="txt",
                favor_precision=False
            )
        except Exception as te:
            print(f"Trafilatura extract error: {te}")
            txt = None

        if txt:
            blocks = _split_to_blocks(txt, max_paragraphs)
            if blocks:
                print(f"Trafilatura extracted {len(blocks)} blocks")
                return blocks

        soup = BeautifulSoup(r.text, "html.parser")
        for tag in soup(["script", "style", "noscript", "header", "nav", "aside", "form", "footer"]):
            tag.decompose()

        paras = [p.get_text(" ", strip=True) for p in soup.find_all(["p", "li"]) if p.get_text(strip=True)]
        combined: List[str] = []
        buf: List[str] = []
        for p in paras:
            buf.append(p)
            if len(" ".join(buf)) >= 120:
                combined.append(" ".join(buf))
                buf = []
                if len(combined) >= max_paragraphs:
                    break
        if buf and len(combined) < max_paragraphs:
            if len(" ".join(buf)) >= 40:
                combined.append(" ".join(buf))

        if combined:
            print(f"BeautifulSoup fallback collected {len(combined)} blocks")
            return combined

        print("No usable text extracted after all fallbacks.")
        return []

    except Exception as e:
        print(f"get_text_blocks fatal: {e}")
        return []

# -------- target context helpers --------
def get_target_context(url: str) -> Tuple[str, str, str, List[str]]:
    """
    Return (title, meta_description, h1, content_blocks)
    """
    title = ""; meta = ""; h1 = ""; blocks: List[str] = []
    try:
        r = _fetch_bytes(url)
        if not r:
            return title, meta, h1, blocks
        soup = BeautifulSoup(r.text, "html.parser")
        if soup.title and soup.title.get_text():
            title = soup.title.get_text().strip()
        md = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property":"og:description"})
        if md and md.get("content"):
            meta = md["content"].strip()
        h1_tag = soup.find("h1")
        if h1_tag:
            h1 = h1_tag.get_text(" ", strip=True)
    except Exception as e:
        print(f"[target] soup err: {e}")

    tb = get_text_blocks(url, max_paragraphs=6)
    if tb:
        blocks = tb
    return title, meta, h1, blocks

def keyword_fallback_from_title_domain(title: str, url: str) -> List[str]:
    ext = tldextract.extract(url)
    brand = (ext.domain or "").replace("-", " ").strip()
    base = []
    if title:
        t = _norm(title)
        tokens = [w for w in t.split() if len(w) >= 4]
        base.extend(tokens[:6])
    if brand:
        base.extend([brand, f"{brand} reviews", f"{brand} guide"])
    seen = set(); out=[]
    for k in base:
        k2 = k.strip()
        if k2 and k2 not in seen:
            out.append(k2); seen.add(k2)
    if not out:
        out = ["learn more", "full guide", "product details"]
    return out[:8]

# =========================
# Extract paragraph sentences ONLY (no headings)
# =========================
def _paragraph_sentences_from_html(url: str) -> List[str]:
    """
    Return a flat list of sentences taken only from <p> tags of the source page.
    Excludes headings/lists to avoid proposing H tags.
    """
    sents: List[str] = []
    try:
        r = _fetch_bytes(url)
        if not r:
            return sents
        soup = BeautifulSoup(r.text, "html.parser")
        paras = [p.get_text(" ", strip=True) for p in soup.find_all("p") if p.get_text(strip=True)]
        for p in paras:
            split = re.split(r'(?<=[.!?])\s+|\n+', p)
            for s in split:
                s = s.strip()
                if len(s) >= 10:
                    sents.append(s)
    except Exception as e:
        print(f"[p-sents] error: {e}")
    return sents

def _sentence_contains_anchor(s: str, anchor: str) -> bool:
    return _contains_anchor(s, anchor)

# =========================
# Embedding helpers
# =========================
def mean_pool(last_hidden_state, mask):
    x = last_hidden_state
    mask = mask.unsqueeze(-1)
    return (x * mask).sum(1) / mask.sum(1)

@lru_cache(maxsize=1000)
def embed_cached(text_tuple):
    texts = list(text_tuple)
    batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        out = enc(**batch)
    return mean_pool(out.last_hidden_state, batch["attention_mask"])

def embed(texts: List[str]):
    return embed_cached(tuple(texts))

# =========================
# Anchor injection helper
# =========================
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
    """
    If the sentence already has the anchor text → wrap it; else append a short clause.
    (Used only when anchor exists in article.)
    """
    if not sentence or not anchor_text:
        return sentence, False
    try:
        pattern = re.compile(r'\b' + re.escape(anchor_text) + r'\b', re.IGNORECASE)
        if pattern.search(sentence):
            result = pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence)
            return result, True
    except Exception:
        pass
    if len(sentence) > 0 and sentence[-1] in '.!?':
        base, punct = sentence[:-1], sentence[-1]
    else:
        base, punct = sentence, '.'
    rewritten = f'{base} <a href="{target_url}">{anchor_text}</a>{punct}'
    return rewritten, False

# =========================
# OpenAI helpers (SIMPLE BODY for GPT-5)
# =========================
def _openai_chat_simple(model_name: str, system: str, user_json: dict):
    """
    Minimal body: model + messages only (no response_format/max_tokens/etc.).
    """
    if not OPENAI_API_KEY:
        raise RuntimeError("OPENAI_API_KEY not set")

    headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
    body = {
        "model": model_name,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": json.dumps(user_json, ensure_ascii=False)}
        ]
    }
    r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
    print(f"[GPT] Model={model_name} HTTP {r.status_code}")
    r.raise_for_status()
    txt = r.json()["choices"][0]["message"]["content"]
    try:
        return json.loads(txt)
    except Exception:
        return {"text": txt}

def _openai_chat_cached(cache_key: str, model_name: str, system: str, user_json: dict):
    if cache_key in API_RESPONSE_CACHE:
        print(f"[GPT] Using cached response for {cache_key[:8]}...")
        return API_RESPONSE_CACHE[cache_key]
    try:
        result = _openai_chat_simple(model_name, system, user_json)
    except Exception as e:
        print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
        result = _openai_chat_simple(FALLBACK_OPENAI_MODEL, system, user_json)
    API_RESPONSE_CACHE[cache_key] = result
    return result

# =========================
# Target-aware paragraph generators
# =========================
def build_target_context_string(target_url: str) -> str:
    title, meta, h1, blocks = get_target_context(target_url)
    ctx_parts = []
    if title: ctx_parts.append(f"Title: {title}")
    if meta:  ctx_parts.append(f"Meta: {meta}")
    if h1:    ctx_parts.append(f"H1: {h1}")
    if blocks: ctx_parts.append("Body: " + " ".join(blocks[:3]))
    return "\n".join(ctx_parts)[:2000]

def gpt_generate_insert_paragraph(anchor_text: str, target_url: str, language: str,
                                  insert_after_sentence: str, article_context: List[str],
                                  target_context: str) -> str:
    """
    Generate 1–3 sentences paragraph (HTML) that includes the exact anchor as a link,
    written to fit right after the given sentence.
    """
    if not OPENAI_API_KEY:
        # simple fallback
        return f'<p>For more details, see <a href="{target_url}">{anchor_text}</a>.</p>'

    cache_key = hashlib.md5(
        f"para_{anchor_text}_{target_url}_{language}_{insert_after_sentence}_{' '.join(article_context)[:400]}_{target_context[:400]}".encode()
    ).hexdigest()

    system = (
        f"You are a precise copywriter in {language}. "
        "Write a short paragraph (1–3 sentences) that fits naturally into the article context, "
        "goes immediately AFTER the given sentence, and includes an <a href> with the EXACT provided anchor text "
        "pointing to the target URL. No em dashes. Output JSON with key 'paragraph_html'."
    )
    user = {
        "insert_after_sentence": insert_after_sentence,
        "article_context": article_context[:8],
        "target_context": target_context,
        "anchor_text": anchor_text,
        "target_url": target_url
    }
    obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    return obj.get("paragraph_html", obj.get("text", f'<p><a href="{target_url}">{anchor_text}</a></p>'))

def gpt_get_search_keywords_from_context(ctx_text: str, target_url: str) -> List[str]:
    if not OPENAI_API_KEY:
        return []
    cache_key = hashlib.md5(f"kw_{target_url}_{ctx_text[:600]}".encode()).hexdigest()
    system = (
        "You are an SEO assistant. From the provided target page context, return 5-10 realistic keyword phrases "
        "users would search for to find it. Return JSON {'keywords': [...] } only."
    )
    user = {"url": target_url, "context": ctx_text}
    obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    return obj.get("keywords", [])

def gpt_generate_content_with_keyword(source_blocks, keywords, target_url, language="English"):
    if not OPENAI_API_KEY or not keywords:
        return None
    source_preview = " ".join(source_blocks[:3])[:500]
    cache_key = hashlib.md5(f"gen_{source_preview}_{str(keywords)}_{target_url}_{language}".encode()).hexdigest()
    system = (
        f"You are a skilled content writer in {language}. Given article paragraphs and keyword candidates "
        "for a target link, do: 1) choose ONE best keyword; 2) write 1–2 natural sentences that include it "
        "as an <a href> to target_url; 3) provide the exact source sentence AFTER WHICH to insert. "
        "Return JSON keys: chosen_keyword, new_content, insert_after_sentence."
    )
    user = {
        "article_paragraphs": source_blocks[:7],
        "available_keywords": keywords,
        "target_url": target_url,
        "language": language
    }
    obj = _openai_chat_cached(cache_key, PREFERRED_OPENAI_MODEL, system, user)
    return obj

# =========================
# Alternative anchor pipeline
# =========================
def find_alternative_anchor(blocks, target_url, original_anchor):
    try:
        ctx = build_target_context_string(target_url)
        print(f"[Alt] Target context len={len(ctx)}")
        keywords = gpt_get_search_keywords_from_context(ctx, target_url)
        if not keywords:
            title, _, _, _ = get_target_context(target_url)
            keywords = keyword_fallback_from_title_domain(title, target_url)

        if not keywords:
            return None, None

        source_text = " ".join(blocks[:2])
        language_name = get_language_name(detect_language(source_text))

        result = gpt_generate_content_with_keyword(
            source_blocks=blocks,
            keywords=keywords,
            target_url=target_url,
            language=language_name
        )
        if not result:
            return None, None

        chosen_keyword = result.get("chosen_keyword", keywords[0])
        new_content = result.get("new_content", "")
        insert_after_sentence = result.get("insert_after_sentence", "")

        if insert_after_sentence:
            if len(insert_after_sentence) > 100:
                position_text = f"[Insert after: ...{insert_after_sentence[-80:]}]"
            else:
                position_text = f"[Insert after: {insert_after_sentence}]"
        else:
            position_text = ""

        return chosen_keyword, f"{position_text}\n\n{new_content}" if position_text else new_content

    except Exception as e:
        print(f"[Alt] Critical error: {e}")
        return None, None

# =========================
# Main selector (paragraph-only, anchor-first, add-paragraph if missing)
# =========================
def suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=False):
    """
    - Pull sentences only from <p> tags.
    - If anchor exists → return the exact sentence with anchor injection.
    - If anchor doesn't exist → propose ADDITIONAL PARAGRAPH with [Insert after: …] marker.
    """
    try:
        para_sents = _paragraph_sentences_from_html(source_url)
        if not para_sents:
            blocks = get_text_blocks(source_url)
            if not blocks:
                return [{"error": f"No text blocks found on the page: {source_url}"}]
            para_sents = []
            for blk in blocks:
                for s in re.split(r'(?<=[.!?])\s+|\n+', blk):
                    s = s.strip()
                    if len(s) >= 10:
                        para_sents.append(s)
            if not para_sents:
                return [{"error": f"No sentences found on the page: {source_url}"}]

        keyword_present = any(_sentence_contains_anchor(s, anchor_text) for s in para_sents)

        t_title, t_meta, t_h1, _ = get_target_context(target_url)
        ext = tldextract.extract(target_url)
        tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
        sent_query = f"{anchor_text} — relevant to: {t_title or t_h1} | {t_meta} ({tgt_domain})"

        # Use full article blocks as context for paragraph generation when needed
        article_blocks_ctx = get_text_blocks(source_url) or []

        results = []

        if keyword_present:
            # Use the FIRST sentence that contains the anchor (exact edit)
            best_sent = next(s for s in para_sents if _sentence_contains_anchor(s, anchor_text))
            rewritten_sent, _ = inject_anchor_into_sentence(best_sent, anchor_text, target_url)

            results.append({
                "anchor_was_present": True,
                "best_sentence_original": best_sent,
                "best_sentence_with_anchor": rewritten_sent,
                "keyword_in_article": True
            })

        else:
            # Choose the best insertion point sentence by similarity
            try:
                q = embed([sent_query])[0]
                s_embs = embed(para_sents)
                sims = F.cosine_similarity(s_embs, q.repeat(len(para_sents), 1))
                si = int(torch.argmax(sims).item())
                insert_after_sentence = para_sents[si]
            except Exception as e:
                print(f"Sentence similarity error: {e}")
                insert_after_sentence = para_sents[0]

            # Generate a NEW PARAGRAPH (not a sentence change) with the specified anchor
            language_name = get_language_name(detect_language(" ".join(para_sents[:2]) or "en"))
            target_ctx = build_target_context_string(target_url)
            paragraph_html = gpt_generate_insert_paragraph(
                anchor_text=anchor_text,
                target_url=target_url,
                language=language_name,
                insert_after_sentence=insert_after_sentence,
                article_context=article_blocks_ctx,
                target_context=target_ctx
            )

            position_text = insert_after_sentence
            results.append({
                "anchor_was_present": False,
                "best_sentence_original": position_text,   # we use this field as the insert-after pointer
                "best_sentence_with_anchor": paragraph_html,  # the new paragraph HTML to add
                "keyword_in_article": False,
                "is_new_paragraph": True
            })

            # Alternative anchor block (Result 2)
            if suggest_alternative:
                alt_anchor, alt_content = find_alternative_anchor(article_blocks_ctx, target_url, anchor_text)
                if alt_anchor and alt_content:
                    results[-1]["alternative_anchor"] = alt_anchor
                    results[-1]["alternative_sentence_original"] = ""
                    results[-1]["alternative_sentence"] = alt_content
                    results[-1]["alternative_exact_match"] = True

        return results

    except Exception as e:
        print(f"Critical error in suggest_insertions: {e}")
        return [{
            "error": f"Error processing the page: {str(e)}",
            "anchor_was_present": False,
            "best_sentence_original": "Error occurred",
            "best_sentence_with_anchor": f"Error occurred. Try manually: <a href='{target_url}'>{anchor_text}</a>",
            "keyword_in_article": False
        }]

# =========================
# Gradio UI
# =========================
def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor):
    if not source_url or not target_url or not anchor_text:
        return "❌ Please provide Source URL, Target URL, and Anchor Text."

    warn = ""
    if looks_like_url(anchor_text) and not looks_like_url(target_url):
        anchor_text, target_url = target_url, anchor_text
        warn = "ℹ️ Detected swapped inputs. I used the URL as Target URL and the text as Anchor.\n\n"

    source_url = normalize_url(source_url)
    target_url = normalize_url(target_url)

    try:
        results = suggest_insertions(source_url, target_url, anchor_text, top_k=1, suggest_alternative=suggest_alternative_anchor)
        res = results[0]
    except Exception as e:
        return f"❌ Error processing the page: {str(e)}"

    if "error" in res:
        return f"❌ {res['error']}"

    original_sentence = res['best_sentence_original']
    draft_html = res["best_sentence_with_anchor"]
    anchor_was_present = res.get("anchor_was_present", False)
    keyword_in_article = res.get("keyword_in_article", False)
    is_new_paragraph = res.get("is_new_paragraph", False)

    # Optional polish only when we are changing an existing sentence (not needed for new paragraph usually)
    final_html = draft_html
    if smart_rewrite and not is_new_paragraph and anchor_was_present:
        language_name = get_language_name(detect_language(original_sentence))
        g = gpt_rewrite(final_html, anchor_text, target_url, language=language_name, target_context=build_target_context_string(target_url))
        final_html = g["sentence_html"]

    final_output = to_plain_text(final_html) if plain_text else final_html

    if keyword_in_article and not is_new_paragraph:
        result = warn + f"✅ **Anchor text '{anchor_text}' found in article!**\n\n"
        result += "🔗 Add link here:\n\n"
        result += f"{final_output}"
    else:
        # NEW DEFAULT: add paragraph after a sentence
        result = warn + f"⚠️ **Anchor text '{anchor_text}' not found in article**\n\n"
        result += "🔗 Result 1 — **Add this NEW paragraph** after the sentence below:\n\n"
        result += f"📍 [Insert after:] {original_sentence}\n\n"
        result += f"{final_output}"

        if suggest_alternative_anchor and res.get("alternative_anchor"):
            alt_anchor = res["alternative_anchor"]
            alt_content = res.get("alternative_sentence", "")
            if alt_content:
                if "[Insert after:" in alt_content:
                    parts = alt_content.split("\n\n", 1)
                    position_info = parts[0] if len(parts) > 0 else ""
                    actual_content = parts[1] if len(parts) > 1 else alt_content
                else:
                    position_info = ""
                    actual_content = alt_content
                alt_output = to_plain_text(actual_content) if plain_text else actual_content
                result += f"\n\n{'='*50}\n\n"
                result += "🔗 Result 2 — **Suggested new anchor & paragraph**:\n"
                result += f"💡 Using keyword: '{alt_anchor}'\n"
                if position_info and "[Insert after:" in position_info:
                    result += f"📍 {position_info}\n"
                result += f"\n{alt_output}"

    return result

def to_plain_text(html_or_text: str) -> str:
    text = BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
    return html.unescape(text)

def clear_cache():
    global EMBEDDING_CACHE, API_RESPONSE_CACHE
    EMBEDDING_CACHE.clear()
    API_RESPONSE_CACHE.clear()
    embed_cached.cache_clear()
    return "✅ Cache cleared successfully!"

# Show GPT status in the header
gpt_status = "ON" if OPENAI_API_KEY else "OFF"
title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"

with gr.Blocks(title=f"Link Insertion Helper • GPT: {gpt_status}") as demo:
    gr.Markdown(f"# Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}")
    gr.Markdown("Suggests the best place to add your link with intelligent language detection and caching.")

    with gr.Row():
        with gr.Column():
            source_url = gr.Textbox(label="Source URL", placeholder="https://example.com/article")
            target_url = gr.Textbox(label="Target URL", placeholder="https://example.com/target")
            anchor_text = gr.Textbox(label="Anchor Text", placeholder="your anchor text")

            with gr.Row():
                smart_rewrite = gr.Checkbox(label="Smart rewrite (GPT)", value=True)
                plain_text = gr.Checkbox(label="Plain text (no URL)", value=True)
                suggest_alternative_anchor = gr.Checkbox(
                    label="Suggest alternative anchor",
                    value=True,
                    info="Also propose a second option with a different anchor and its own paragraph"
                )

            with gr.Row():
                submit_btn = gr.Button("Process", variant="primary")
                clear_cache_btn = gr.Button("Clear Cache", variant="secondary")

        with gr.Column():
            output = gr.Textbox(label="Result", lines=14)
            cache_status = gr.Textbox(label="Cache Status", interactive=False)

    submit_btn.click(
        fn=run_tool,
        inputs=[source_url, target_url, anchor_text, smart_rewrite, plain_text, suggest_alternative_anchor],
        outputs=output
    )

    clear_cache_btn.click(
        fn=clear_cache,
        outputs=cache_status
    )

    gr.Markdown("""
### Features:
- 🧩 **Paragraph-Only Selection**: Never proposes headings; picks sentences from <p> tags only
- 🎯 **Anchor-First**: If anchor exists, returns the exact sentence containing it
- ➕ **No Anchor? Add a Paragraph**: Result 1 always gives a new paragraph with [Insert after:]
- 🧠 **Target-Aware**: Uses title/meta/H1/body of the target URL for relevance
- 🔄 **Alternative Anchor**: Optional Result 2 with a different anchor + ready paragraph
- 🧰 Robust extraction: Trafilatura + BS4; optional Cloudflare/PDF handling
""")

if __name__ == "__main__":
    demo.launch()