# Streamlit Tagline Generator for "About Us" pages (Hugging Face Space)
# ---------------------------------------------------------------
# Deploy this file as `app.py` in a Hugging Face Space (Streamlit SDK).
#
# 🔧 Setup (on Hugging Face):
# 1) Create a new Space → SDK: Streamlit → Python.
# 2) Add this file as `app.py`.
# 3) In **Settings → Variables & secrets**, add a secret named one of:
#       - HUGGINGFACE_API_TOKEN  (preferred)
#       - HF_TOKEN               (fallback)
# 4) (Optional) In `README.md`, paste these Requirements (or keep them here):
#
# Requirements (auto-installed if you add a `requirements.txt`):
#   streamlit>=1.36.0
#   requests>=2.31.0
#   beautifulsoup4>=4.12.2
#   lxml>=5.2.2
#   huggingface_hub>=0.23.0
#   pandas>=2.2.2
#
# A `requirements.txt` is strongly recommended: without it the Space may not have every
# package imported below (for example `beautifulsoup4` and `lxml`) installed.
# ---------------------------------------------------------------

import os
import re
import json
import time
import random
from typing import List, Dict, Optional

import requests
import pandas as pd
import streamlit as st
from bs4 import BeautifulSoup

try:
    # Lightweight client for Inference API
    from huggingface_hub import InferenceClient
except Exception:
    InferenceClient = None  # We'll gracefully handle if missing

# Page title passed to `st.set_page_config` below.
APP_TITLE = "About→Taglines: LLM-Powered Tagline Generator"
# Pre-filled value of the sidebar "About page URL" input.
DEFAULT_URL = "https://www.codestratlabs.com/#about"

# Sensible, widely available open-instruct model on HF Inference API.
# You may change this to any chat/instruct model you have access to.
# Used as the pre-filled value of the sidebar model-id input.
DEFAULT_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Set the page title and icon and use the wide layout.
st.set_page_config(page_title=APP_TITLE, page_icon="🪄", layout="wide")

# -----------------------------
# Helper: find HF token
# -----------------------------

def get_hf_token() -> Optional[str]:
    """Return the first Hugging Face API token that is configured, or ``None``.

    The candidate names are tried in a fixed order. For each name the
    Streamlit secret is consulted first and the environment variable of the
    same name second; the first non-empty value wins.
    """
    candidate_names = (
        "HUGGINGFACE_API_TOKEN",
        "HF_TOKEN",
        "HUGGINGFACEHUB_API_TOKEN",
        "HF_API_TOKEN",
    )
    for name in candidate_names:
        try:
            secret_value = st.secrets[name] if name in st.secrets else None
        except Exception:
            # Any failure while reading the secrets store is ignored on
            # purpose so the environment lookup below still runs.
            secret_value = None
        if secret_value:
            return secret_value

        env_value = os.getenv(name)
        if env_value:
            return env_value
    return None

# -----------------------------
# Web scraping utilities
# -----------------------------

# Browser-like User-Agent header value sent by `fetch_about_text` with its page request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0 Safari/537.36"
)

# Names of the HTML tags whose text `fetch_about_text` collects from the page.
ALLOWED_TAGS = {
    "p", "h1", "h2", "h3", "h4", "li", "blockquote", "em", "strong", "span"
}


# Pre-compiled patterns used when cleaning extracted text chunks.
_WHITESPACE_RE = re.compile(r"\s+")
_COPYRIGHT_RE = re.compile(r"(©|Copyright).*?\d{4}.*", flags=re.I)


def _clean_chunk(text: str) -> str:
    """Collapse whitespace in one text chunk and cut any copyright notice from it."""
    text = _WHITESPACE_RE.sub(" ", text)
    return _COPYRIGHT_RE.sub("", text).strip()


def _collect_chunks(nodes) -> List[str]:
    """Return cleaned text of the allowed tags in *nodes*, without nested duplicates."""
    seen_ids = set()
    chunks = []
    for node in nodes:
        tags = node.find_all(ALLOWED_TAGS)
        if node.name in ALLOWED_TAGS:
            # `find_all` only searches descendants; include the node itself too.
            tags = [node, *tags]
        for tag in tags:
            # The text of a tag already contains the text of everything nested
            # in it, so a tag is skipped when it (or one of its ancestors) has
            # been collected before. Tags are tracked by identity.
            if id(tag) in seen_ids or any(id(parent) in seen_ids for parent in tag.parents):
                continue
            seen_ids.add(id(tag))
            chunk = _clean_chunk(tag.get_text(" ", strip=True))
            if chunk:
                chunks.append(chunk)
    return chunks


def fetch_about_text(url: str, timeout: int = 15) -> str:
    """Fetch the page, extract readable text, and lightly clean it.

    Elements whose ``id`` or ``class`` contains "about" (case-insensitive) are
    tried first. If they yield no text, ``<main>`` (or ``<body>``) is used
    instead. Not bulletproof but good enough for most marketing About pages.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The extracted text as a single whitespace-normalized string, cut to
        at most 4000 characters. Empty when nothing could be extracted.

    Raises:
        requests.RequestException: If the request fails or the server answers
            with an error status (via ``raise_for_status``).
    """
    headers = {"User-Agent": USER_AGENT}
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "lxml")

    # Try common About selectors first: id or class containing 'about'
    about_like = soup.select('[id*="about" i], [class*="about" i]')
    chunks = _collect_chunks(about_like)

    # Fallback: main content. Also used when the "about" matches carry no
    # extractable text (for example when only a navigation link matched).
    if not chunks:
        main = soup.find("main") or soup.body
        if main:
            chunks = _collect_chunks([main])

    # Chunks are cleaned one by one, so a copyright notice only removes the
    # remainder of its own chunk instead of everything that follows it.
    text = " ".join(chunks)

    # Limit to a sane context length for prompting
    return text[:4000]

# -----------------------------
# Prompting & generation
# -----------------------------

# Content of the "system" chat message sent ahead of the user prompt.
SYSTEM_PROMPT = (
    "You are a world-class brand copywriter. Given a company 'About Us' "
    "description and some creative directions, craft concise, memorable, and "
    "distinctive marketing taglines that would perform well on landing pages, "
    "social headers, and ads. Always return valid JSON."
)

# User prompt, filled in with ``str.format``. Placeholders: n, max_words, tones,
# audience, traits, language, style_rule, about.
# - The literal is delimited with ''' so the """ around {about} is plain text
#   and does not terminate the string.
# - The braces of the JSON schema are doubled ({{ and }}) so ``format`` emits
#   them verbatim instead of treating them as replacement fields.
USER_PROMPT_TEMPLATE = (
'''Write {n} creative marketing taglines for the brand described below.
Constraints:
- Each tagline max {max_words} words.
- Tone(s): {tones}.
- Target audience: {audience}.
- Brand traits to emphasize: {traits}.
- Language: {language}.
- Avoid clichés. Avoid generic buzzwords. Prefer clarity over fluff.
- Make each line unique; avoid repeating structures.
- {style_rule}


Company About (verbatim, possibly trimmed):
"""{about}"""


Return JSON with this exact schema:
{{
"taglines": [
{{
"line": string,
"explanation": string
}}
]
}}
'''
)

# Maps each sidebar "Style bias" option to the extra constraint that is
# substituted for {style_rule} in the user prompt.
STYLE_RULES = {
    "One-liners": "Only produce single-line taglines; do not add subheads.",
    "Slogan + Subhead": (
        "Produce single-line slogan candidates; keep explanations focused on the angle."
    ),
    "Alliterative": "Favor gentle alliteration (not forced).",
    "Bold & Punchy": "Favor short, high-impact phrasing.",
}


def call_hf_inference(model: str, messages: List[Dict[str, str]], temperature: float = 0.7, max_new_tokens: int = 512, top_p: float = 0.9, seed: Optional[int] = None) -> str:
    """Send chat messages to a Hugging Face hosted model and return its reply text.

    When ``huggingface_hub`` is importable, its ``InferenceClient`` chat
    completion API is used. Otherwise the request is sent as a plain
    text-generation call over HTTP.

    Args:
        model: Model id on the Hugging Face Hub.
        messages: Chat messages as dicts with ``role`` and ``content`` keys.
        temperature: Sampling temperature.
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold.
        seed: Optional sampling seed; omitted from the request when ``None``.

    Returns:
        The generated text. In the HTTP fallback, a response of unexpected
        shape is returned JSON-encoded (best effort).

    Raises:
        RuntimeError: If no API token is configured.
        requests.HTTPError: In the HTTP fallback, on an error status code.
    """
    token = get_hf_token()
    if not token:
        raise RuntimeError(
            "No Hugging Face API token found. Set HUGGINGFACE_API_TOKEN (or HF_TOKEN) as a Space secret."
        )

    if InferenceClient is None:
        # Minimal fallback via raw HTTP to Inference API.
        # The response handling below expects the text-generation shape
        # ([{"generated_text": ...}]), whose request takes `inputs` as one
        # string, so the chat messages are flattened into a single prompt.
        prompt = "\n\n".join(m.get("content", "") for m in messages if m.get("content"))
        parameters = {
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_tokens,
            # Return only the completion; an echoed prompt would contain the
            # JSON schema and confuse JSON extraction in the caller.
            "return_full_text": False,
        }
        if seed is not None:
            parameters["seed"] = seed

        api_url = f"https://api-inference.huggingface.co/models/{model}"
        headers = {"Authorization": f"Bearer {token}"}
        payload = {"inputs": prompt, "parameters": parameters}
        resp = requests.post(api_url, headers=headers, json=payload, timeout=60)
        resp.raise_for_status()
        data = resp.json()
        # Best-effort extraction
        try:
            return data[0]["generated_text"]
        except (KeyError, IndexError, TypeError):
            return json.dumps(data)

    client = InferenceClient(model=model, token=token)

    # `chat_completion` is the client's native chat method; it takes the list
    # of role/content dicts directly.
    generated = client.chat_completion(
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_new_tokens,
        seed=seed,
    )
    # `content` can be missing; keep the declared `str` return type.
    return generated.choices[0].message.content or ""


# -----------------------------
# UI
# -----------------------------

# Page heading followed by a collapsed help panel describing the workflow.
st.title("🪄 Tagline Generator from About Us (LLM)")
with st.expander("How it works", expanded=False):
    st.markdown(
        "1. Paste an About page URL (or raw text).\n"
        "2. Choose tone, style, and constraints.\n"
        "3. Click **Generate** to get multiple tagline options.\n"
        "4. Copy, edit, or download as CSV."
    )

# Sidebar: input source plus every generation setting. The values bound here
# (url, raw_text, n, max_words, tones, ...) are read by the main flow below.
with st.sidebar:
    st.header("Input Source")
    url = st.text_input("About page URL", value=DEFAULT_URL)
    st.caption("Tip: Works best with dedicated About/Company pages.")
    st.divider()
    # Pasted text takes precedence over the URL in the main flow.
    raw_text = st.text_area(
        "…or paste About text (overrides URL if provided)",
        height=160,
        placeholder="Paste company description here…"
    )

    st.header("Creative Controls")
    n = st.slider("# of taglines", min_value=3, max_value=30, value=12)
    max_words = st.slider("Max words per tagline", min_value=3, max_value=12, value=7)

    tone_options = [
        "Bold & Punchy",
        "Credible & Trustworthy",
        "Visionary & Innovative",
        "Friendly & Helpful",
        "Premium & Sophisticated",
        "Playful & Witty",
        "Tech-forward & Precise",
    ]
    # Falls back to a single default tone when the selection is empty.
    tones = st.multiselect("Tone(s)", tone_options, default=["Bold & Punchy", "Tech-forward & Precise"]) or ["Clear & Confident"]

    # The options here are the keys of STYLE_RULES.
    style_choice = st.selectbox("Style bias", ["One-liners", "Slogan + Subhead", "Alliterative", "Bold & Punchy"], index=0)

    audience = st.text_input("Target audience", value="B2B founders, product & growth leaders")
    traits = st.text_input("Brand traits to highlight", value="AI-native, reliable delivery, measurable impact")

    language = st.text_input("Language (e.g., English, Hindi)", value="English")

    temperature = st.slider("Creativity (temperature)", 0.0, 1.5, 0.8, 0.1)
    top_p = st.slider("Nucleus sampling (top_p)", 0.1, 1.0, 0.9, 0.05)
    # The seed field is disabled unless the checkbox is ticked.
    seed_toggle = st.checkbox("Use seed for reproducibility", value=False)
    seed_val = st.number_input("Seed", min_value=0, max_value=10_000_000, value=42, step=1, disabled=not seed_toggle)

    st.divider()
    st.subheader("Model")
    model = st.text_input("HF Inference model id", value=DEFAULT_MODEL)
    st.caption("Use any instruct/chat model available on the Inference API.")

def _iter_json_objects(text: str):
    """Yield every JSON object (dict) that can be decoded somewhere inside *text*."""
    decoder = json.JSONDecoder()
    pos = text.find("{")
    while pos != -1:
        try:
            obj, consumed = decoder.raw_decode(text[pos:])
        except json.JSONDecodeError:
            # No complete object starts here (e.g. truncated output); try the next brace.
            pos = text.find("{", pos + 1)
            continue
        if isinstance(obj, dict):
            yield obj
        pos = text.find("{", pos + consumed)


def _parse_taglines(output_text: str, max_words: int) -> List[Dict[str, str]]:
    """Turn raw model output into rows with "Tagline" and "Why it works" keys.

    JSON is preferred: the first object holding a ``taglines`` list is used.
    If the enclosing object cannot be decoded (for instance because the output
    was cut off), complete ``{"line": ...}`` items are salvaged individually.
    Only when no such JSON is present is the text split heuristically.
    """
    items = []
    found_json = False
    for obj in _iter_json_objects(output_text):
        listed = obj.get("taglines")
        if isinstance(listed, list):
            items = listed
            found_json = True
            break
        if "line" in obj:
            items.append(obj)
            found_json = True

    rows: List[Dict[str, str]] = []
    for item in items:
        # Model output is untrusted: tolerate bare strings, skip anything else.
        if isinstance(item, dict):
            line, expl = item.get("line"), item.get("explanation")
        elif isinstance(item, str):
            line, expl = item, ""
        else:
            continue
        line = str(line or "").strip()
        expl = str(expl or "").strip()
        if line:
            rows.append({"Tagline": line, "Why it works": expl})
    if found_json:
        return rows

    # Fallback: heuristic split by lines/bullets
    for piece in re.split(r"[\n•\-\d\)]\s+", output_text):
        piece = piece.strip().strip('"').strip()
        if 0 < len(piece.split()) <= max_words and 3 <= len(piece) <= 90:
            rows.append({"Tagline": piece, "Why it works": ""})
    return rows


col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("1) Fetch About content")
    about_text = None
    pasted_text = raw_text.strip()
    if pasted_text:
        about_text = pasted_text
        st.success("Using pasted About text.")
    else:
        if st.button("Fetch from URL", use_container_width=True):
            try:
                with st.spinner("Fetching & parsing About page..."):
                    fetched_text = fetch_about_text(url)
            except Exception as e:
                st.session_state.pop("fetched_about", None)
                st.error(f"Fetch failed: {e}")
            else:
                # The script is re-run on every interaction and the button is
                # only True for the run caused by its click, so the fetched
                # text is kept in session state for the later "Generate" run.
                st.session_state["fetched_about"] = {"url": url, "text": fetched_text}
                if fetched_text:
                    st.success(f"Fetched ~{len(fetched_text)} chars of About content.")
                else:
                    st.warning("Couldn't extract meaningful About text—try pasting it manually.")

        # Reuse stored text only while the URL field still shows the URL it
        # was fetched from.
        cached = st.session_state.get("fetched_about")
        if cached and cached["url"] == url:
            about_text = cached["text"] or None

    about_holder = st.empty()
    if about_text:
        with about_holder.container():
            st.text_area("About content used for generation", about_text, height=220)

with col2:
    st.subheader("2) Generate Taglines")
    can_generate = st.button("🪄 Generate", use_container_width=True)

st.markdown("---")

# Results are kept in session state so they survive re-runs that are not a
# new generation (for example the one triggered by the download button).
results_df = st.session_state.get("results_df")
if can_generate:
    # A new generation replaces whatever was shown before.
    results_df = None
    st.session_state.pop("results_df", None)

    if not about_text:
        st.warning("Please paste About text or click 'Fetch from URL' first.")
    else:
        # Build messages for chat API
        style_rule = STYLE_RULES.get(style_choice, "")
        user_prompt = USER_PROMPT_TEMPLATE.format(
            n=n,
            max_words=max_words,
            tones=", ".join(tones),
            audience=audience,
            traits=traits,
            language=language,
            style_rule=style_rule,
            about=about_text,
        )

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]

        with st.spinner("Asking the model for ideas…"):
            try:
                seed_used = int(seed_val) if seed_toggle else None
                output_text = call_hf_inference(
                    model=model,
                    messages=messages,
                    temperature=temperature,
                    max_new_tokens=768,
                    top_p=top_p,
                    seed=seed_used,
                )
            except Exception as e:
                st.error(f"Generation failed: {e}")
                output_text = ""

        taglines = _parse_taglines(output_text or "", max_words)

        if not taglines:
            st.warning("No taglines parsed. Try increasing max tokens or adjust constraints.")
        else:
            results_df = pd.DataFrame(taglines)
            st.session_state["results_df"] = results_df

if results_df is not None and not results_df.empty:
    st.subheader("Results")
    st.dataframe(results_df, use_container_width=True, hide_index=True)

    csv_bytes = results_df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        data=csv_bytes,
        file_name="taglines.csv",
        mime="text/csv",
        use_container_width=True,
    )

    st.markdown(":sparkles: Tip: Click *Generate* again for fresh variations; toggle a seed for reproducibility.")

# Footer: horizontal rule followed by a collapsible list of troubleshooting hints.
st.markdown("---")

with st.expander("Troubleshooting"):
    st.markdown(
        "- **401/403 errors** → Ensure your Space has a valid `HUGGINGFACE_API_TOKEN` secret with access to the selected model.\n"
        "- **Empty results** → Paste the About text manually, increase `max words`, or try a different model.\n"
        "- **Slow output** → Reduce `# of taglines` or switch to a lighter model.\n"
        "- **Different language** → Change the *Language* field; the model will write in that language."
    )