github-actions[bot] committed
Commit 417d5c2 · 1 Parent(s): 5a83556

sync: automatic content update from github

Files changed (7)
  1. .gitattributes +0 -35
  2. README.md +0 -10
  3. gpt.py +71 -0
  4. headshot_scraper.py +580 -0
  5. index.html +0 -19
  6. streamlit_app.py +237 -0
  7. style.css +0 -28
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md DELETED
@@ -1,10 +0,0 @@
- ---
- title: Sales Creator Catalog
- emoji: 😻
- colorFrom: pink
- colorTo: red
- sdk: static
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
gpt.py ADDED
@@ -0,0 +1,71 @@
+ """Helper for interacting with the Creator Catalog custom GPT."""
+
+ from __future__ import annotations
+
+ import os
+ from typing import Dict, List, Optional
+
+ from openai import OpenAI
+
+
+ def get_env(name: str) -> Optional[str]:
+     """Return an env var, preferring HF space secret prefix."""
+     return os.environ.get(f"REPO_SECRET_{name}") or os.environ.get(name)
+
+
+ DEFAULT_INSTRUCTIONS = (
+     "You are the Creator Catalog assistant. Provide concise, practical answers "
+     "that help curate creator data, headshots, and related metadata."
+ )
+
+
+ class CustomGPT:
+     """Wrapper around the OpenAI API for a custom GPT."""
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         instructions: Optional[str] = None,
+         temperature: float = 0.4,
+     ) -> None:
+         self.api_key = get_env("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("Missing OPENAI_API_KEY (or REPO_SECRET_OPENAI_API_KEY)")
+
+         self.base_url = get_env("OPENAI_BASE_URL")
+         self.model = model or get_env("CUSTOM_GPT_MODEL") or "gpt-4o-mini"
+         self.instructions = (
+             instructions or get_env("CUSTOM_GPT_INSTRUCTIONS") or DEFAULT_INSTRUCTIONS
+         )
+         self.temperature = temperature
+         self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+     def build_messages(
+         self,
+         prompt: str,
+         history: Optional[List[Dict[str, str]]] = None,
+     ) -> List[Dict[str, str]]:
+         messages: List[Dict[str, str]] = [
+             {"role": "system", "content": self.instructions}
+         ]
+
+         if history:
+             messages.extend(history)
+
+         messages.append({"role": "user", "content": prompt})
+         return messages
+
+     def run(
+         self,
+         prompt: str,
+         history: Optional[List[Dict[str, str]]] = None,
+         temperature: Optional[float] = None,
+     ) -> str:
+         """Send the prompt + history to the custom GPT and return the reply text."""
+
+         response = self.client.chat.completions.create(
+             model=self.model,
+             messages=self.build_messages(prompt, history),
+             temperature=temperature if temperature is not None else self.temperature,
+         )
+         return response.choices[0].message.content or ""
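
For context, a minimal usage sketch of the new CustomGPT wrapper, assuming OPENAI_API_KEY (or REPO_SECRET_OPENAI_API_KEY) is already set in the environment; the prompt and history below are illustrative only:

from gpt import CustomGPT

# Assumes OPENAI_API_KEY (or REPO_SECRET_OPENAI_API_KEY) is exported before running.
gpt = CustomGPT(temperature=0.2)

# Illustrative prior turns; the wrapper prepends its system instructions automatically.
history = [
    {"role": "user", "content": "Which sites still need headshots?"},
    {"role": "assistant", "content": "damndelicious.net is still missing one."},
]

reply = gpt.run("Draft a short note requesting the missing headshot.", history=history)
print(reply)
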
headshot_scraper.py ADDED
@@ -0,0 +1,580 @@
+ import re
+ import os
+ import io
+ import requests
+ from urllib.parse import urljoin, urlparse
+ from bs4 import BeautifulSoup
+ from PIL import Image
+ from typing import Optional, List, Dict, Any
+
+ HEADERS = {
+     "User-Agent": (
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+         "AppleWebKit/537.36 (KHTML, like Gecko) "
+         "Chrome/120.0 Safari/537.36"
+     )
+ }
+
+ # ------------------ Basic utils ------------------ #
+
+ def validate_url(url: str) -> str:
+     if not url or not isinstance(url, str):
+         raise ValueError("URL must be a non-empty string.")
+     parsed = urlparse(url)
+     if parsed.scheme not in ("http", "https"):
+         raise ValueError("URL must start with http:// or https://")
+     blocked_hosts = {"localhost", "127.0.0.1", "0.0.0.0", "::1"}
+     if parsed.hostname in blocked_hosts:
+         raise ValueError("Local addresses are not allowed.")
+     if parsed.hostname and parsed.hostname.startswith(("10.", "192.168.", "172.16.")):
+         raise ValueError("Private network addresses are not allowed.")
+     return url
+
+
+ def allowed_by_robots(url: str) -> bool:
+     parsed = urlparse(url)
+     robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+     try:
+         r = requests.get(robots_url, headers=HEADERS, timeout=5)
+         if not r.ok:
+             return True
+         content = r.text.lower()
+         lines = content.splitlines()
+         applies = False
+         disallows: List[str] = []
+         for line in lines:
+             line = line.strip()
+             if not line or line.startswith("#"):
+                 continue
+             if line.startswith("user-agent:"):
+                 ua = line.split(":", 1)[1].strip()
+                 applies = ua in ("*", "scraper-bot")
+             elif applies and line.startswith("disallow:"):
+                 rule = line.split(":", 1)[1].strip()
+                 disallows.append(rule)
+         path = parsed.path or "/"
+         for rule in disallows:
+             if rule and path.startswith(rule):
+                 return False
+         return True
+     except Exception:
+         return True
+
+
+ def fetch_html(url: str) -> str:
+     r = requests.get(url, headers=HEADERS, timeout=10)
+     r.raise_for_status()
+     return r.text
+
+
+ def normalize_url(src: str, base: str) -> Optional[str]:
+     try:
+         return urljoin(base, src)
+     except Exception:
+         return None
+
+
+ def is_image_candidate(url: str) -> bool:
+     if not url:
+         return False
+     if url.startswith(("data:", "blob:", "about:")):
+         return False
+     lower = url.lower()
+     if lower.endswith(".svg"):
+         return False
+     exts = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif", ".jfif"]
+     if any(ext in lower for ext in exts):
+         return True
+     return lower.startswith("http://") or lower.startswith("https://")
+
+
+ # ------------------ Author/headshot-focused HTML heuristics ------------------ #
+
+ AUTHOR_SELECTORS: List[str] = [
+     ".author",
+     ".author-info",
+     ".author-bio",
+     ".author-box",
+     ".about",
+     ".about-author",
+     ".byline",
+     ".post-author",
+     ".entry-author",
+     ".site-author",
+     ".profile",
+     ".profile-card",
+     ".user-info",
+     "[rel='author']",
+     "[itemprop='author']",
+     "[itemtype*='Person']",
+ ]
+
+
+ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
+     """
+     Find images likely to be author / headshot images.
+     Returns a list of dicts:
+     [{url, score, source, width_attr, height_attr, width_px, height_px}, ...]
+     """
+     candidates: List[Dict[str, Any]] = []
+     seen = set()
+
+     def add_image(src: Optional[str], source: str, score_boost: float = 0.0, tag=None) -> None:
+         if not src:
+             return
+         abs_url = normalize_url(src, base_url)
+         if not abs_url or abs_url in seen:
+             return
+         if not is_image_candidate(abs_url):
+             return
+
+         score = 0.0 + score_boost
+         width_attr = None
+         height_attr = None
+
+         if tag is not None:
+             width_attr = tag.get("width")
+             height_attr = tag.get("height")
+             try:
+                 w = int(width_attr or 0)
+                 h = int(height_attr or 0)
+                 if w > 0 and h > 0:
+                     aspect = float(h) / float(max(w, 1))
+                     if 0.8 <= aspect <= 2.0:
+                         score += 2.0
+                     if w > 800 and w > h * 2:
+                         score -= 2.0
+             except ValueError:
+                 pass
+
+         seen.add(abs_url)
+         candidates.append({
+             "url": abs_url,
+             "score": score,
+             "source": source,
+             "width_attr": width_attr,
+             "height_attr": height_attr,
+             "width_px": None,
+             "height_px": None,
+         })
+
+     # 1) Inside author/about-ish containers
+     for selector in AUTHOR_SELECTORS:
+         for container in soup.select(selector):
+             imgs = container.find_all("img")
+             for img in imgs:
+                 src = (
+                     img.get("src")
+                     or img.get("data-src")
+                     or img.get("data-lazy-src")
+                     or img.get("data-original")
+                 )
+                 add_image(src, "author:" + selector, score_boost=5.0, tag=img)
+
+             style = container.get("style") or ""
+             match = re.search(r"background-image\s*:\s*url\(([^)]+)\)", style, re.I)
+             if match:
+                 raw = match.group(1).strip("'\" ")
+                 add_image(raw, "author-bg:" + selector, score_boost=4.0, tag=None)
+
+     # 2) Fallback: near "about"/"author" text
+     text_blocks = soup.find_all(
+         lambda tag: tag.name in ("p", "div", "section") and tag.get_text(strip=True)
+     )
+     for block in text_blocks:
+         txt = block.get_text(" ", strip=True).lower()
+         if any(
+             phrase in txt
+             for phrase in [
+                 "about me",
+                 "about the author",
+                 "about the creator",
+                 "meet",
+                 "author",
+             ]
+         ):
+             for img in block.find_all("img"):
+                 src = (
+                     img.get("src")
+                     or img.get("data-src")
+                     or img.get("data-lazy-src")
+                     or img.get("data-original")
+                 )
+                 add_image(src, "near-about-text", score_boost=3.0, tag=img)
+
+     return candidates
+
+
+ # ------------------ Size-based filtering via Pillow ------------------ #
+
+ def measure_image_dimensions(url: str, timeout: int = 10) -> (Optional[int], Optional[int]):
+     """
+     Fetch the image and read dimensions with Pillow.
+     Returns (width, height) or (None, None) on failure.
+     """
+     try:
+         resp = requests.get(url, headers=HEADERS, timeout=timeout)
+         resp.raise_for_status()
+         data = io.BytesIO(resp.content)
+         with Image.open(data) as img:
+             return img.width, img.height
+     except Exception:
+         return None, None
+
+
+ def refine_candidates_with_dimensions(
+     candidates: List[Dict[str, Any]],
+     max_to_check: int = 10
+ ) -> List[Dict[str, Any]]:
+     """
+     For up to `max_to_check` candidates, compute real width/height.
+     Adjust scores based on real dimensions.
+     """
+     candidates_sorted = sorted(candidates, key=lambda c: c["score"], reverse=True)
+
+     for idx, c in enumerate(candidates_sorted):
+         if idx >= max_to_check:
+             break
+
+         w, h = measure_image_dimensions(c["url"])
+         c["width_px"] = w
+         c["height_px"] = h
+
+         if not w or not h:
+             continue
+
+         if w < 80 or h < 80:
+             c["score"] -= 4.0
+             continue
+
+         aspect = float(h) / float(max(w, 1))
+
+         if 0.7 <= aspect <= 1.8:
+             c["score"] += 4.0
+
+         if w > 1000 and w > h * 2.5:
+             c["score"] -= 5.0
+
+         if w > 2500 or h > 2500:
+             c["score"] -= 3.0
+
+     return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
+
+
+ def pick_best_author_image(candidates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+     if not candidates:
+         return None
+     return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
+
+
+ # ------------------ Main entry: scrape_author_image ------------------ #
+
+ def scrape_author_image(url: str) -> Dict[str, Any]:
+     """
+     Given a URL, return the most likely author/headshot image.
+     """
+     url = validate_url(url)
+     if not allowed_by_robots(url):
+         raise PermissionError("Blocked by robots.txt")
+
+     html = fetch_html(url)
+     soup = BeautifulSoup(html, "lxml")
+
+     title_tag = (
+         soup.select_one("meta[property='og:title']")
+         or soup.select_one("meta[name='twitter:title']")
+         or soup.find("h1")
+         or soup.title
+     )
+     if title_tag:
+         if hasattr(title_tag, "get"):
+             title = title_tag.get("content", "") or title_tag.get_text("", strip=True)
+         else:
+             title = title_tag.get_text("", strip=True)
+     else:
+         title = urlparse(url).hostname
+
+     candidates = extract_author_candidates(soup, url)
+     candidates_refined = refine_candidates_with_dimensions(candidates, max_to_check=10)
+     best = pick_best_author_image(candidates_refined)
+
+     return {
+         "title": title,
+         "author_image_url": best["url"] if best else None,
+         "debug_candidates": candidates_refined,
+     }
+
+
+ # ------------------ Download helpers ------------------ #
+
+ def download_image(
+     image_url: str,
+     out_dir: str = "author_images",
+     filename: Optional[str] = None
+ ) -> str:
+     """
+     Download a single image URL to `out_dir`, return local file path.
+     If `filename` is provided, it is used instead of the remote filename.
+     """
+     os.makedirs(out_dir, exist_ok=True)
+
+     parsed = urlparse(image_url)
+     fallback = os.path.basename(parsed.path) or "image"
+
+     if filename:
+         base_name = filename
+     else:
+         base_name = fallback
+
+     # Ensure file extension
+     if not any(base_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]):
+         base_name += ".jpg"
+
+     file_path = os.path.join(out_dir, base_name)
+
+     resp = requests.get(image_url, headers=HEADERS, timeout=15)
+     resp.raise_for_status()
+
+     with open(file_path, "wb") as f:
+         f.write(resp.content)
+
+     return file_path
+
+
+ def download_author_image(page_url: str, out_dir: str = "author_images") -> Dict[str, Any]:
+     """
+     High-level helper:
+     1. Scrape the page for the best author/headshot image.
+     2. Download that image to `out_dir`.
+     """
+     result = scrape_author_image(page_url)
+     author_url = result["author_image_url"]
+
+     if not author_url:
+         return {
+             "title": result["title"],
+             "author_image_url": None,
+             "local_path": None,
+         }
+
+     local_path = download_image(author_url, out_dir=out_dir)
+     return {
+         "title": result["title"],
+         "author_image_url": author_url,
+         "local_path": local_path,
+     }
+
+
+ # ---------- Site normalization & about-page discovery ---------- #
+
+ def normalize_site_input(site: str) -> str:
+     """
+     Allow user to input:
+     - damndelicious.net
+     - https://damndelicious.net
+     - http://www.damndelicious.net/
+     and normalize to a base URL like:
+     - https://damndelicious.net
+     """
+     site = site.strip()
+
+     if not site:
+         raise ValueError("Site must be a non-empty string.")
+
+     if not site.startswith(("http://", "https://")):
+         site = "https://" + site
+
+     parsed = urlparse(site)
+     netloc = parsed.netloc
+     if netloc.startswith("www."):
+         netloc = netloc[4:]
+
+     base_url = f"{parsed.scheme}://{netloc}"
+     return base_url
+
+
+ ABOUT_PATH_GUESSES: List[str] = [
+     "/about",
+     "/about/",
+     "/about-me",
+     "/about-me/",
+     "/about-us",
+     "/about-us/",
+     "/about-the-author",
+     "/about-the-author/",
+     "/about-the-creator",
+     "/about-the-creator/",
+     "/meet-the-team",
+     "/meet-the-team/",
+     "/meet-the-author",
+     "/meet-the-author/",
+ ]
+
+ ABOUT_TEXT_KEYWORDS: List[str] = [
+     "about",
+     "about me",
+     "about us",
+     "about the author",
+     "about the creator",
+     "our story",
+     "my story",
+     "meet",
+     "meet the author",
+     "meet the team",
+     "bio",
+ ]
+
+
+ def url_is_html(url: str) -> bool:
+     """
+     Quick check that a URL returns HTML (not 404, not an image, etc.).
+     """
+     try:
+         r = requests.get(url, headers=HEADERS, timeout=8)
+         if not r.ok:
+             return False
+         ctype = r.headers.get("Content-Type", "").lower()
+         return "text/html" in ctype or "application/xhtml+xml" in ctype
+     except Exception:
+         return False
+
+
+ def find_about_like_urls(base_url: str, max_links: int = 20) -> List[str]:
+     """
+     Strategy:
+     1. Try common about paths relative to base_url.
+     2. Fetch the homepage and look for internal <a> links whose
+        text or href contains about-ish keywords.
+     """
+     candidates: List[str] = []
+
+     # 1) Common about paths
+     for path in ABOUT_PATH_GUESSES:
+         candidates.append(urljoin(base_url, path))
+
+     # 2) Extract from homepage
+     try:
+         html = fetch_html(base_url)
+     except Exception:
+         html = ""
+
+     if html:
+         soup = BeautifulSoup(html, "lxml")
+         for a in soup.find_all("a", href=True):
+             href = a["href"]
+             text = a.get_text(" ", strip=True).lower()
+             href_lower = href.lower()
+
+             if any(kw in text for kw in ABOUT_TEXT_KEYWORDS) or any(
+                 kw in href_lower for kw in ["about", "our-story", "my-story", "meet"]
+             ):
+                 abs_url = normalize_url(href, base_url)
+                 if not abs_url:
+                     continue
+                 candidates.append(abs_url)
+
+     # Deduplicate while preserving order
+     seen = set()
+     unique: List[str] = []
+     for c in candidates:
+         if c not in seen:
+             seen.add(c)
+             unique.append(c)
+
+     return unique[:max_links]
+
+
+ def pick_best_about_url(site_input: str) -> Optional[str]:
+     """
+     Given a site name or URL, try to find the 'best' about page URL.
+     """
+     base_url = normalize_site_input(site_input)
+     base_url = validate_url(base_url)
+
+     if not allowed_by_robots(base_url):
+         raise PermissionError("Blocked by robots.txt for base site.")
+
+     candidates = find_about_like_urls(base_url)
+
+     for cand in candidates:
+         if not url_is_html(cand):
+             continue
+         return cand
+
+     return base_url
+
+
+ def clean_site_name(site: str) -> str:
+     """
+     Convert a site or URL into a safe filename component.
+     Examples:
+     https://www.damndelicious.net -> damndelicious_net
+     foodblog.com -> foodblog_com
+     """
+     site = site.strip().lower()
+     if site.startswith("http://"):
+         site = site[7:]
+     elif site.startswith("https://"):
+         site = site[8:]
+     site = site.split("/")[0]
+     return site.replace(".", "_").replace("-", "_")
+
+
+ def download_author_image_for_site(
+     site_input: str,
+     out_dir: str = "author_images"
+ ) -> Dict[str, Any]:
+     """
+     1. Convert site input into a normalized base URL.
+     2. Locate the site's best 'About' page.
+     3. Extract the author image.
+     4. Download it using a filename that includes the site name.
+     """
+     base_url = normalize_site_input(site_input)
+     about_url = pick_best_about_url(site_input)
+
+     if not about_url:
+         return {
+             "site_base_url": base_url,
+             "about_url": None,
+             "title": None,
+             "author_image_url": None,
+             "local_path": None,
+         }
+
+     info = scrape_author_image(about_url)
+     author_url = info["author_image_url"]
+
+     if not author_url:
+         return {
+             "site_base_url": base_url,
+             "about_url": about_url,
+             "title": info["title"],
+             "author_image_url": None,
+             "local_path": None,
+         }
+
+     safe_name = clean_site_name(base_url)
+     filename = safe_name + "_author"
+
+     local_path = download_image(author_url, out_dir=out_dir, filename=filename)
+
+     return {
+         "site_base_url": base_url,
+         "about_url": about_url,
+         "title": info["title"],
+         "author_image_url": author_url,
+         "local_path": local_path,
+     }
+
+
+ # ------------------ Optional CLI test ------------------ #
+
+ if __name__ == "__main__":
+     site_or_url = input("Enter site (e.g. 'damndelicious.net' or full URL): ").strip()
+     result = download_author_image_for_site(site_or_url, out_dir="author_images")
+     print("\nBase site:", result["site_base_url"])
+     print("About URL:", result["about_url"])
+     print("Page title:", result["title"])
+     print("Headshot URL:", result["author_image_url"])
+     print("Saved to:", result["local_path"])
index.html DELETED
@@ -1,19 +0,0 @@
- <!doctype html>
- <html>
- <head>
-     <meta charset="utf-8" />
-     <meta name="viewport" content="width=device-width" />
-     <title>My static Space</title>
-     <link rel="stylesheet" href="style.css" />
- </head>
- <body>
-     <div class="card">
-         <h1>Welcome to your static Space!</h1>
-         <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-         <p>
-             Also don't forget to check the
-             <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-         </p>
-     </div>
- </body>
- </html>
streamlit_app.py ADDED
@@ -0,0 +1,237 @@
+ import os
+ import streamlit as st
+ import snowflake.connector
+ from cryptography.hazmat.primitives import serialization
+
+ from headshot_scraper import download_author_image_for_site
+ from gpt import CustomGPT
+
+ # ------------------------------
+ # Helper to fetch env with HF prefix fallback
+ # ------------------------------
+
+
+ def get_env(name: str):
+     """Try HF space secrets (REPO_SECRET_name), else fallback to plain name."""
+     return os.environ.get(f"REPO_SECRET_{name}") or os.environ.get(name)
+
+
+ # ------------------------------
+ # Snowflake connection
+ # ------------------------------
+
+
+ def connect_to_snowflake():
+     pem = get_env("snowflake_private_key")
+
+     if pem is None:
+         st.warning("⚠️ Missing Snowflake private key. Add it as a HF Secret.")
+         return None
+
+     try:
+         private_key = serialization.load_pem_private_key(
+             pem.encode(),
+             password=None,
+         )
+     except Exception as e:
+         st.error(f"❌ Could not load Snowflake private key: {e}")
+         return None
+
+     try:
+         conn = snowflake.connector.connect(
+             user=get_env("snowflake_user"),
+             account=get_env("snowflake_account_identifier"),
+             private_key=private_key,
+             role=get_env("snowflake_role"),
+             warehouse=get_env("snowflake_warehouse"),
+             database=get_env("snowflake_database"),
+             schema=get_env("snowflake_schema"),
+         )
+         return conn
+     except Exception as e:
+         st.error(f"❌ Snowflake connection failed: {e}")
+         return None
+
+
+ def fetch_sites(conn):
+     """
+     Return a list of dicts:
+     [{"site_name": ..., "url": ...}, ...]
+     """
+     try:
+         cur = conn.cursor()
+         cur.execute(
+             """
+             SELECT DISTINCT
+                 site_name,
+                 url  -- Replace with actual URL column if different
+             FROM analytics.adthrive.SITE_EXTENDED
+             WHERE site_name IS NOT NULL
+               AND url IS NOT NULL
+             ORDER BY site_name
+             """
+         )
+         rows = cur.fetchall()
+         return [{"site_name": r[0], "url": r[1]} for r in rows]
+     except Exception as e:
+         st.error(f"Failed to fetch site list: {e}")
+         return []
+
+
+ # ------------------------------
+ # Streamlit UI setup
+ # ------------------------------
+
+ st.set_page_config(page_title="Headshot Scraper", page_icon="🧑‍🍳", layout="wide")
+
+ st.title("Headshot / Author Image Scraper")
+ st.write(
+     "Select a site from Snowflake (by name) or enter one manually. "
+     "The scraper will use the stored URL to find the About page and extract the headshot."
+ )
+
+ # Initialize session state for last_result (so results persist across reruns)
+ if "last_result" not in st.session_state:
+     st.session_state["last_result"] = None
+ if "chat_history" not in st.session_state:
+     st.session_state["chat_history"] = []
+
+ # ------------------------------
+ # Snowflake: connect + dropdown
+ # ------------------------------
+
+ st.write("🔑 Connecting to Snowflake…")
+ conn = connect_to_snowflake()
+
+ sites = []
+ selected_site_name = ""
+ selected_site_url = ""
+
+ if conn:
+     st.success(f"Connected to Snowflake as {get_env('snowflake_user')}")
+     sites = fetch_sites(conn)
+
+     site_name_options = [""] + [s["site_name"] for s in sites]
+     selected_site_name = st.selectbox("Select site by name:", site_name_options)
+
+     if selected_site_name:
+         match = next((s for s in sites if s["site_name"] == selected_site_name), None)
+         if match:
+             selected_site_url = match["url"]
+             st.caption(f"URL from Snowflake: {selected_site_url}")
+         else:
+             st.warning("No URL found for the selected site.")
+ else:
+     st.warning("Snowflake connection not available. Manual entry only.")
+
+ # ------------------------------
+ # Manual URL entry fallback
+ # ------------------------------
+
+ manual_entry = st.text_input(
+     "Or enter a site manually:",
+     placeholder="damndelicious.net",
+ )
+
+ # Final URL to be used (Snowflake URL takes precedence)
+ site_or_url = selected_site_url if selected_site_url else manual_entry
+
+ # ------------------------------
+ # Scrape button (updates session_state)
+ # ------------------------------
+
+ if st.button("Scrape headshot"):
+     if not site_or_url.strip():
+         st.error("Please select or enter a site.")
+     else:
+         with st.spinner("Scraping…"):
+             try:
+                 result = download_author_image_for_site(
+                     site_or_url, out_dir="/tmp/author_images"
+                 )
+
+                 # Store result so it persists across reruns
+                 st.session_state["last_result"] = result
+
+             except Exception as e:
+                 st.error(f"Scrape failed: {e}")
+                 st.session_state["last_result"] = None
+
+ # ------------------------------
+ # Display last result (persistent across reruns)
+ # ------------------------------
+
+ result = st.session_state.get("last_result")
+
+ if result:
+     st.subheader("Result")
+
+     st.write(f"**Base site:** {result['site_base_url']}")
+     st.write(f"**About URL:** {result['about_url']}")
+     st.write(f"**Page title:** {result['title']}")
+     st.write(f"**Headshot URL:** {result['author_image_url']}")
+     st.write(f"**Saved file:** {result['local_path']}")
+
+     local_path = result.get("local_path")
+
+     if local_path:
+         st.image(local_path, caption="Detected headshot", width=350)
+
+         # Download button – this will trigger a rerun,
+         # but the result is preserved in st.session_state
+         try:
+             with open(local_path, "rb") as f:
+                 img_bytes = f.read()
+
+             st.download_button(
+                 "⬇️ Download Image",
+                 data=img_bytes,
+                 file_name=os.path.basename(local_path),
+                 mime="image/jpeg",
+             )
+         except Exception as e:
+             st.warning(f"Could not prepare download: {e}")
+     else:
+         st.warning("No headshot found for this site.")
+
+
+ # ------------------------------
+ # Custom GPT helper
+ # ------------------------------
+
+ st.divider()
+ st.header("Creator Catalog GPT")
+ st.caption(
+     "Chat with the custom GPT using your OpenAI credentials. "
+     "Set REPO_SECRET_OPENAI_API_KEY (and optional OPENAI_BASE_URL, CUSTOM_GPT_MODEL, "
+     "CUSTOM_GPT_INSTRUCTIONS) as secrets in the Hugging Face Space."
+ )
+
+ prompt = st.text_area(
+     "Ask the GPT a question",
+     key="gpt_prompt",
+     placeholder="E.g., summarize the most recent scraping result",
+ )
+
+ if st.button("Send to GPT"):
+     if not prompt.strip():
+         st.error("Please enter a question or prompt for the GPT.")
+     else:
+         try:
+             client = CustomGPT()
+             reply = client.run(prompt, history=st.session_state["chat_history"])
+
+             st.session_state["chat_history"].extend(
+                 [
+                     {"role": "user", "content": prompt},
+                     {"role": "assistant", "content": reply},
+                 ]
+             )
+         except Exception as e:
+             st.error(f"GPT request failed: {e}")
+
+ if st.session_state["chat_history"]:
+     st.subheader("Conversation")
+     for message in st.session_state["chat_history"]:
+         prefix = "You" if message["role"] == "user" else "GPT"
+         st.markdown(f"**{prefix}:** {message['content']}")
style.css DELETED
@@ -1,28 +0,0 @@
- body {
-     padding: 2rem;
-     font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
- }
-
- h1 {
-     font-size: 16px;
-     margin-top: 0;
- }
-
- p {
-     color: rgb(107, 114, 128);
-     font-size: 15px;
-     margin-bottom: 10px;
-     margin-top: 5px;
- }
-
- .card {
-     max-width: 620px;
-     margin: 0 auto;
-     padding: 16px;
-     border: 1px solid lightgray;
-     border-radius: 16px;
- }
-
- .card p:last-child {
-     margin-bottom: 0;
- }