github-actions[bot] committed
Commit 417d5c2 · 1 Parent(s): 5a83556

sync: automatic content update from github

Files changed (7)
  1. .gitattributes +0 -35
  2. README.md +0 -10
  3. gpt.py +71 -0
  4. headshot_scraper.py +580 -0
  5. index.html +0 -19
  6. streamlit_app.py +237 -0
  7. style.css +0 -28
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md DELETED
@@ -1,10 +0,0 @@
- ---
- title: Sales Creator Catalog
- emoji: 😻
- colorFrom: pink
- colorTo: red
- sdk: static
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
gpt.py ADDED
@@ -0,0 +1,71 @@
+ """Helper for interacting with the Creator Catalog custom GPT."""
+
+ from __future__ import annotations
+
+ import os
+ from typing import Dict, List, Optional
+
+ from openai import OpenAI
+
+
+ def get_env(name: str) -> Optional[str]:
+     """Return an env var, preferring HF space secret prefix."""
+     return os.environ.get(f"REPO_SECRET_{name}") or os.environ.get(name)
+
+
+ DEFAULT_INSTRUCTIONS = (
+     "You are the Creator Catalog assistant. Provide concise, practical answers "
+     "that help curate creator data, headshots, and related metadata."
+ )
+
+
+ class CustomGPT:
+     """Wrapper around the OpenAI API for a custom GPT."""
+
+     def __init__(
+         self,
+         model: Optional[str] = None,
+         instructions: Optional[str] = None,
+         temperature: float = 0.4,
+     ) -> None:
+         self.api_key = get_env("OPENAI_API_KEY")
+         if not self.api_key:
+             raise ValueError("Missing OPENAI_API_KEY (or REPO_SECRET_OPENAI_API_KEY)")
+
+         self.base_url = get_env("OPENAI_BASE_URL")
+         self.model = model or get_env("CUSTOM_GPT_MODEL") or "gpt-4o-mini"
+         self.instructions = (
+             instructions or get_env("CUSTOM_GPT_INSTRUCTIONS") or DEFAULT_INSTRUCTIONS
+         )
+         self.temperature = temperature
+         self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+     def build_messages(
+         self,
+         prompt: str,
+         history: Optional[List[Dict[str, str]]] = None,
+     ) -> List[Dict[str, str]]:
+         messages: List[Dict[str, str]] = [
+             {"role": "system", "content": self.instructions}
+         ]
+
+         if history:
+             messages.extend(history)
+
+         messages.append({"role": "user", "content": prompt})
+         return messages
+
+     def run(
+         self,
+         prompt: str,
+         history: Optional[List[Dict[str, str]]] = None,
+         temperature: Optional[float] = None,
+     ) -> str:
+         """Send the prompt + history to the custom GPT and return the reply text."""
+
+         response = self.client.chat.completions.create(
+             model=self.model,
+             messages=self.build_messages(prompt, history),
+             temperature=temperature if temperature is not None else self.temperature,
+         )
+         return response.choices[0].message.content or ""
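
For context, a minimal usage sketch of the new CustomGPT wrapper, assuming OPENAI_API_KEY (or REPO_SECRET_OPENAI_API_KEY) is already set in the environment; the prompt and history below are illustrative only:

from gpt import CustomGPT

# Assumes OPENAI_API_KEY (or REPO_SECRET_OPENAI_API_KEY) is exported before running.
gpt = CustomGPT(temperature=0.2)

# Illustrative prior turns; the wrapper prepends its system instructions automatically.
history = [
    {"role": "user", "content": "Which sites still need headshots?"},
    {"role": "assistant", "content": "damndelicious.net is still missing one."},
]

reply = gpt.run("Draft a short note requesting the missing headshot.", history=history)
print(reply)
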
headshot_scraper.py ADDED
@@ -0,0 +1,580 @@
+ import re
+ import os
+ import io
+ import requests
+ from urllib.parse import urljoin, urlparse
+ from bs4 import BeautifulSoup
+ from PIL import Image
+ from typing import Optional, List, Dict, Any
+
+ HEADERS = {
+     "User-Agent": (
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+         "AppleWebKit/537.36 (KHTML, like Gecko) "
+         "Chrome/120.0 Safari/537.36"
+     )
+ }
+
+ # ------------------ Basic utils ------------------ #
+
+ def validate_url(url: str) -> str:
+     if not url or not isinstance(url, str):
+         raise ValueError("URL must be a non-empty string.")
+     parsed = urlparse(url)
+     if parsed.scheme not in ("http", "https"):
+         raise ValueError("URL must start with http:// or https://")
+     blocked_hosts = {"localhost", "127.0.0.1", "0.0.0.0", "::1"}
+     if parsed.hostname in blocked_hosts:
+         raise ValueError("Local addresses are not allowed.")
+     if parsed.hostname and parsed.hostname.startswith(("10.", "192.168.", "172.16.")):
+         raise ValueError("Private network addresses are not allowed.")
+     return url
+
+
+ def allowed_by_robots(url: str) -> bool:
+     parsed = urlparse(url)
+     robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
+     try:
+         r = requests.get(robots_url, headers=HEADERS, timeout=5)
+         if not r.ok:
+             return True
+         content = r.text.lower()
+         lines = content.splitlines()
+         applies = False
+         disallows: List[str] = []
+         for line in lines:
+             line = line.strip()
+             if not line or line.startswith("#"):
+                 continue
+             if line.startswith("user-agent:"):
+                 ua = line.split(":", 1)[1].strip()
+                 applies = ua in ("*", "scraper-bot")
+             elif applies and line.startswith("disallow:"):
+                 rule = line.split(":", 1)[1].strip()
+                 disallows.append(rule)
+         path = parsed.path or "/"
+         for rule in disallows:
+             if rule and path.startswith(rule):
+                 return False
+         return True
+     except Exception:
+         return True
+
+
+ def fetch_html(url: str) -> str:
+     r = requests.get(url, headers=HEADERS, timeout=10)
+     r.raise_for_status()
+     return r.text
+
+
+ def normalize_url(src: str, base: str) -> Optional[str]:
+     try:
+         return urljoin(base, src)
+     except Exception:
+         return None
+
+
+ def is_image_candidate(url: str) -> bool:
+     if not url:
+         return False
+     if url.startswith(("data:", "blob:", "about:")):
+         return False
+     lower = url.lower()
+     if lower.endswith(".svg"):
+         return False
+     exts = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif", ".jfif"]
+     if any(ext in lower for ext in exts):
+         return True
+     return lower.startswith("http://") or lower.startswith("https://")
+
+
+ # ------------------ Author/headshot-focused HTML heuristics ------------------ #
+
+ AUTHOR_SELECTORS: List[str] = [
+     ".author",
+     ".author-info",
+     ".author-bio",
+     ".author-box",
+     ".about",
+     ".about-author",
+     ".byline",
+     ".post-author",
+     ".entry-author",
+     ".site-author",
+     ".profile",
+     ".profile-card",
+     ".user-info",
+     "[rel='author']",
+     "[itemprop='author']",
+     "[itemtype*='Person']",
+ ]
+
+
+ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
+     """
+     Find images likely to be author / headshot images.
+     Returns a list of dicts:
+     [{url, score, source, width_attr, height_attr, width_px, height_px}, ...]
+     """
+     candidates: List[Dict[str, Any]] = []
+     seen = set()
+
+     def add_image(src: Optional[str], source: str, score_boost: float = 0.0, tag=None) -> None:
+         if not src:
+             return
+         abs_url = normalize_url(src, base_url)
+         if not abs_url or abs_url in seen:
+             return
+         if not is_image_candidate(abs_url):
+             return
+
+         score = 0.0 + score_boost
+         width_attr = None
+         height_attr = None
+
+         if tag is not None:
+             width_attr = tag.get("width")
+             height_attr = tag.get("height")
+             try:
+                 w = int(width_attr or 0)
+                 h = int(height_attr or 0)
+                 if w > 0 and h > 0:
+                     aspect = float(h) / float(max(w, 1))
+                     if 0.8 <= aspect <= 2.0:
+                         score += 2.0
+                     if w > 800 and w > h * 2:
+                         score -= 2.0
+             except ValueError:
+                 pass
+
+         seen.add(abs_url)
+         candidates.append({
+             "url": abs_url,
+             "score": score,
+             "source": source,
+             "width_attr": width_attr,
+             "height_attr": height_attr,
+             "width_px": None,
+             "height_px": None,
+         })
+
+     # 1) Inside author/about-ish containers
+     for selector in AUTHOR_SELECTORS:
+         for container in soup.select(selector):
+             imgs = container.find_all("img")
+             for img in imgs:
+                 src = (
+                     img.get("src")
+                     or img.get("data-src")
+                     or img.get("data-lazy-src")
+                     or img.get("data-original")
+                 )
+                 add_image(src, "author:" + selector, score_boost=5.0, tag=img)
+
+             style = container.get("style") or ""
+             match = re.search(r"background-image\s*:\s*url\(([^)]+)\)", style, re.I)
+             if match:
+                 raw = match.group(1).strip("'\" ")
+                 add_image(raw, "author-bg:" + selector, score_boost=4.0, tag=None)
+
+     # 2) Fallback: near "about"/"author" text
+     text_blocks = soup.find_all(
+         lambda tag: tag.name in ("p", "div", "section") and tag.get_text(strip=True)
+     )
+     for block in text_blocks:
+         txt = block.get_text(" ", strip=True).lower()
+         if any(
+             phrase in txt
+             for phrase in [
+                 "about me",
+                 "about the author",
+                 "about the creator",
+                 "meet",
+                 "author",
+             ]
+         ):
+             for img in block.find_all("img"):
+                 src = (
+                     img.get("src")
+                     or img.get("data-src")
+                     or img.get("data-lazy-src")
+                     or img.get("data-original")
+                 )
+                 add_image(src, "near-about-text", score_boost=3.0, tag=img)
+
+     return candidates
+
+
+ # ------------------ Size-based filtering via Pillow ------------------ #
+
+ def measure_image_dimensions(url: str, timeout: int = 10) -> (Optional[int], Optional[int]):
+     """
+     Fetch the image and read dimensions with Pillow.
+     Returns (width, height) or (None, None) on failure.
+     """
+     try:
+         resp = requests.get(url, headers=HEADERS, timeout=timeout)
+         resp.raise_for_status()
+         data = io.BytesIO(resp.content)
+         with Image.open(data) as img:
+             return img.width, img.height
+     except Exception:
+         return None, None
+
+
+ def refine_candidates_with_dimensions(
+     candidates: List[Dict[str, Any]],
+     max_to_check: int = 10
+ ) -> List[Dict[str, Any]]:
+     """
+     For up to `max_to_check` candidates, compute real width/height.
+     Adjust scores based on real dimensions.
+     """
+     candidates_sorted = sorted(candidates, key=lambda c: c["score"], reverse=True)
+
+     for idx, c in enumerate(candidates_sorted):
+         if idx >= max_to_check:
+             break
+
+         w, h = measure_image_dimensions(c["url"])
+         c["width_px"] = w
+         c["height_px"] = h
+
+         if not w or not h:
+             continue
+
+         if w < 80 or h < 80:
+             c["score"] -= 4.0
+             continue
+
+         aspect = float(h) / float(max(w, 1))
+
+         if 0.7 <= aspect <= 1.8:
+             c["score"] += 4.0
+
+         if w > 1000 and w > h * 2.5:
+             c["score"] -= 5.0
+
+         if w > 2500 or h > 2500:
+             c["score"] -= 3.0
+
+     return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
+
+
+ def pick_best_author_image(candidates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+     if not candidates:
+         return None
+     return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
+
+
+ # ------------------ Main entry: scrape_author_image ------------------ #
+
+ def scrape_author_image(url: str) -> Dict[str, Any]:
+     """
+     Given a URL, return the most likely author/headshot image.
+     """
+     url = validate_url(url)
+     if not allowed_by_robots(url):
+         raise PermissionError("Blocked by robots.txt")
+
+     html = fetch_html(url)
+     soup = BeautifulSoup(html, "lxml")
+
+     title_tag = (
+         soup.select_one("meta[property='og:title']")
+         or soup.select_one("meta[name='twitter:title']")
+         or soup.find("h1")
+         or soup.title
+     )
+     if title_tag:
+         if hasattr(title_tag, "get"):
+             title = title_tag.get("content", "") or title_tag.get_text("", strip=True)
+         else:
+             title = title_tag.get_text("", strip=True)
+     else:
+         title = urlparse(url).hostname
+
+     candidates = extract_author_candidates(soup, url)
+     candidates_refined = refine_candidates_with_dimensions(candidates, max_to_check=10)
+     best = pick_best_author_image(candidates_refined)
+
+     return {
+         "title": title,
+         "author_image_url": best["url"] if best else None,
+         "debug_candidates": candidates_refined,
+     }
+
+
+ # ------------------ Download helpers ------------------ #
+
+ def download_image(
+     image_url: str,
+     out_dir: str = "author_images",
+     filename: Optional[str] = None
+ ) -> str:
+     """
+     Download a single image URL to `out_dir`, return local file path.
+     If `filename` is provided, it is used instead of the remote filename.
+     """
+     os.makedirs(out_dir, exist_ok=True)
+
+     parsed = urlparse(image_url)
+     fallback = os.path.basename(parsed.path) or "image"
+
+     if filename:
+         base_name = filename
+     else:
+         base_name = fallback
+
+     # Ensure file extension
+     if not any(base_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]):
+         base_name += ".jpg"
+
+     file_path = os.path.join(out_dir, base_name)
+
+     resp = requests.get(image_url, headers=HEADERS, timeout=15)
+     resp.raise_for_status()
+
+     with open(file_path, "wb") as f:
+         f.write(resp.content)
+
+     return file_path
+
+
+ def download_author_image(page_url: str, out_dir: str = "author_images") -> Dict[str, Any]:
+     """
+     High-level helper:
+     1. Scrape the page for the best author/headshot image.
+     2. Download that image to `out_dir`.
+     """
+     result = scrape_author_image(page_url)
+     author_url = result["author_image_url"]
+
+     if not author_url:
+         return {
+             "title": result["title"],
+             "author_image_url": None,
+             "local_path": None,
+         }
+
+     local_path = download_image(author_url, out_dir=out_dir)
+     return {
+         "title": result["title"],
+         "author_image_url": author_url,
+         "local_path": local_path,
+     }
+
+
+ # ---------- Site normalization & about-page discovery ---------- #
+
+ def normalize_site_input(site: str) -> str:
+     """
+     Allow user to input:
+     - damndelicious.net
+     - https://damndelicious.net
+     - http://www.damndelicious.net/
+     and normalize to a base URL like:
+     - https://damndelicious.net
+     """
+     site = site.strip()
+
+     if not site:
+         raise ValueError("Site must be a non-empty string.")
+
+     if not site.startswith(("http://", "https://")):
+         site = "https://" + site
+
+     parsed = urlparse(site)
+     netloc = parsed.netloc
+     if netloc.startswith("www."):
+         netloc = netloc[4:]
+
+     base_url = f"{parsed.scheme}://{netloc}"
+     return base_url
+
+
+ ABOUT_PATH_GUESSES: List[str] = [
+     "/about",
+     "/about/",
+     "/about-me",
+     "/about-me/",
+     "/about-us",
+     "/about-us/",
+     "/about-the-author",
+     "/about-the-author/",
+     "/about-the-creator",
+     "/about-the-creator/",
+     "/meet-the-team",
+     "/meet-the-team/",
+     "/meet-the-author",
+     "/meet-the-author/",
+ ]
+
+ ABOUT_TEXT_KEYWORDS: List[str] = [
+     "about",
+     "about me",
+     "about us",
+     "about the author",
+     "about the creator",
+     "our story",
+     "my story",
+     "meet",
+     "meet the author",
+     "meet the team",
+     "bio",
+ ]
+
+
+ def url_is_html(url: str) -> bool:
+     """
+     Quick check that a URL returns HTML (not 404, not an image, etc.).
+     """
+     try:
+         r = requests.get(url, headers=HEADERS, timeout=8)
+         if not r.ok:
+             return False
+         ctype = r.headers.get("Content-Type", "").lower()
+         return "text/html" in ctype or "application/xhtml+xml" in ctype
+     except Exception:
+         return False
+
+
+ def find_about_like_urls(base_url: str, max_links: int = 20) -> List[str]:
+     """
+     Strategy:
+     1. Try common about paths relative to base_url.
+     2. Fetch the homepage and look for internal <a> links whose
+        text or href contains about-ish keywords.
+     """
+     candidates: List[str] = []
+
+     # 1) Common about paths
+     for path in ABOUT_PATH_GUESSES:
+         candidates.append(urljoin(base_url, path))
+
+     # 2) Extract from homepage
+     try:
+         html = fetch_html(base_url)
+     except Exception:
+         html = ""
+
+     if html:
+         soup = BeautifulSoup(html, "lxml")
+         for a in soup.find_all("a", href=True):
+             href = a["href"]
+             text = a.get_text(" ", strip=True).lower()
+             href_lower = href.lower()
+
+             if any(kw in text for kw in ABOUT_TEXT_KEYWORDS) or any(
+                 kw in href_lower for kw in ["about", "our-story", "my-story", "meet"]
+             ):
+                 abs_url = normalize_url(href, base_url)
+                 if not abs_url:
+                     continue
+                 candidates.append(abs_url)
+
+     # Deduplicate while preserving order
+     seen = set()
+     unique: List[str] = []
+     for c in candidates:
+         if c not in seen:
+             seen.add(c)
+             unique.append(c)
+
+     return unique[:max_links]
+
+
+ def pick_best_about_url(site_input: str) -> Optional[str]:
+     """
+     Given a site name or URL, try to find the 'best' about page URL.
+     """
+     base_url = normalize_site_input(site_input)
+     base_url = validate_url(base_url)
+
+     if not allowed_by_robots(base_url):
+         raise PermissionError("Blocked by robots.txt for base site.")
+
+     candidates = find_about_like_urls(base_url)
+
+     for cand in candidates:
+         if not url_is_html(cand):
+             continue
+         return cand
+
+     return base_url
+
+
+ def clean_site_name(site: str) -> str:
+     """
+     Convert a site or URL into a safe filename component.
+     Examples:
+     https://www.damndelicious.net -> damndelicious_net
+     foodblog.com -> foodblog_com
+     """
+     site = site.strip().lower()
+     if site.startswith("http://"):
+         site = site[7:]
+     elif site.startswith("https://"):
+         site = site[8:]
+     site = site.split("/")[0]
+     return site.replace(".", "_").replace("-", "_")
+
+
+ def download_author_image_for_site(
+     site_input: str,
+     out_dir: str = "author_images"
+ ) -> Dict[str, Any]:
+     """
+     1. Convert site input into a normalized base URL.
+     2. Locate the site's best 'About' page.
+     3. Extract the author image.
+     4. Download it using a filename that includes the site name.
+     """
+     base_url = normalize_site_input(site_input)
+     about_url = pick_best_about_url(site_input)
+
+     if not about_url:
+         return {
+             "site_base_url": base_url,
+             "about_url": None,
+             "title": None,
+             "author_image_url": None,
+             "local_path": None,
+         }
+
+     info = scrape_author_image(about_url)
+     author_url = info["author_image_url"]
+
+     if not author_url:
+         return {
+             "site_base_url": base_url,
+             "about_url": about_url,
+             "title": info["title"],
+             "author_image_url": None,
+             "local_path": None,
+         }
+
+     safe_name = clean_site_name(base_url)
+     filename = safe_name + "_author"
+
+     local_path = download_image(author_url, out_dir=out_dir, filename=filename)
+
+     return {
+         "site_base_url": base_url,
+         "about_url": about_url,
+         "title": info["title"],
+         "author_image_url": author_url,
+         "local_path": local_path,
+     }
+
+
+ # ------------------ Optional CLI test ------------------ #
+
+ if __name__ == "__main__":
+     site_or_url = input("Enter site (e.g. 'damndelicious.net' or full URL): ").strip()
+     result = download_author_image_for_site(site_or_url, out_dir="author_images")
+     print("\nBase site:", result["site_base_url"])
+     print("About URL:", result["about_url"])
+     print("Page title:", result["title"])
+     print("Headshot URL:", result["author_image_url"])
+     print("Saved to:", result["local_path"])
index.html DELETED
@@ -1,19 +0,0 @@
- <!doctype html>
- <html>
- <head>
-     <meta charset="utf-8" />
-     <meta name="viewport" content="width=device-width" />
-     <title>My static Space</title>
-     <link rel="stylesheet" href="style.css" />
- </head>
- <body>
-     <div class="card">
-         <h1>Welcome to your static Space!</h1>
-         <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-         <p>
-             Also don't forget to check the
-             <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-         </p>
-     </div>
- </body>
- </html>
streamlit_app.py ADDED
@@ -0,0 +1,237 @@
+ import os
+ import streamlit as st
+ import snowflake.connector
+ from cryptography.hazmat.primitives import serialization
+
+ from headshot_scraper import download_author_image_for_site
+ from gpt import CustomGPT
+
+ # ------------------------------
+ # Helper to fetch env with HF prefix fallback
+ # ------------------------------
+
+
+ def get_env(name: str):
+     """Try HF space secrets (REPO_SECRET_name), else fallback to plain name."""
+     return os.environ.get(f"REPO_SECRET_{name}") or os.environ.get(name)
+
+
+ # ------------------------------
+ # Snowflake connection
+ # ------------------------------
+
+
+ def connect_to_snowflake():
+     pem = get_env("snowflake_private_key")
+
+     if pem is None:
+         st.warning("⚠️ Missing Snowflake private key. Add it as a HF Secret.")
+         return None
+
+     try:
+         private_key = serialization.load_pem_private_key(
+             pem.encode(),
+             password=None,
+         )
+     except Exception as e:
+         st.error(f"❌ Could not load Snowflake private key: {e}")
+         return None
+
+     try:
+         conn = snowflake.connector.connect(
+             user=get_env("snowflake_user"),
+             account=get_env("snowflake_account_identifier"),
+             private_key=private_key,
+             role=get_env("snowflake_role"),
+             warehouse=get_env("snowflake_warehouse"),
+             database=get_env("snowflake_database"),
+             schema=get_env("snowflake_schema"),
+         )
+         return conn
+     except Exception as e:
+         st.error(f"❌ Snowflake connection failed: {e}")
+         return None
+
+
+ def fetch_sites(conn):
+     """
+     Return a list of dicts:
+     [{"site_name": ..., "url": ...}, ...]
+     """
+     try:
+         cur = conn.cursor()
+         cur.execute(
+             """
+             SELECT DISTINCT
+                 site_name,
+                 url  -- Replace with actual URL column if different
+             FROM analytics.adthrive.SITE_EXTENDED
+             WHERE site_name IS NOT NULL
+               AND url IS NOT NULL
+             ORDER BY site_name
+             """
+         )
+         rows = cur.fetchall()
+         return [{"site_name": r[0], "url": r[1]} for r in rows]
+     except Exception as e:
+         st.error(f"Failed to fetch site list: {e}")
+         return []
+
+
+ # ------------------------------
+ # Streamlit UI setup
+ # ------------------------------
+
+ st.set_page_config(page_title="Headshot Scraper", page_icon="🧑‍🍳", layout="wide")
+
+ st.title("Headshot / Author Image Scraper")
+ st.write(
+     "Select a site from Snowflake (by name) or enter one manually. "
+     "The scraper will use the stored URL to find the About page and extract the headshot."
+ )
+
+ # Initialize session state for last_result (so results persist across reruns)
+ if "last_result" not in st.session_state:
+     st.session_state["last_result"] = None
+ if "chat_history" not in st.session_state:
+     st.session_state["chat_history"] = []
+
+ # ------------------------------
+ # Snowflake: connect + dropdown
+ # ------------------------------
+
+ st.write("🔑 Connecting to Snowflake…")
+ conn = connect_to_snowflake()
+
+ sites = []
+ selected_site_name = ""
+ selected_site_url = ""
+
+ if conn:
+     st.success(f"Connected to Snowflake as {get_env('snowflake_user')}")
+     sites = fetch_sites(conn)
+
+     site_name_options = [""] + [s["site_name"] for s in sites]
+     selected_site_name = st.selectbox("Select site by name:", site_name_options)
+
+     if selected_site_name:
+         match = next((s for s in sites if s["site_name"] == selected_site_name), None)
+         if match:
+             selected_site_url = match["url"]
+             st.caption(f"URL from Snowflake: {selected_site_url}")
+         else:
+             st.warning("No URL found for the selected site.")
+ else:
+     st.warning("Snowflake connection not available. Manual entry only.")
+
+ # ------------------------------
+ # Manual URL entry fallback
+ # ------------------------------
+
+ manual_entry = st.text_input(
+     "Or enter a site manually:",
+     placeholder="damndelicious.net",
+ )
+
+ # Final URL to be used (Snowflake URL takes precedence)
+ site_or_url = selected_site_url if selected_site_url else manual_entry
+
+ # ------------------------------
+ # Scrape button (updates session_state)
+ # ------------------------------
+
+ if st.button("Scrape headshot"):
+     if not site_or_url.strip():
+         st.error("Please select or enter a site.")
+     else:
+         with st.spinner("Scraping…"):
+             try:
+                 result = download_author_image_for_site(
+                     site_or_url, out_dir="/tmp/author_images"
+                 )
+
+                 # Store result so it persists across reruns
+                 st.session_state["last_result"] = result
+
+             except Exception as e:
+                 st.error(f"Scrape failed: {e}")
+                 st.session_state["last_result"] = None
+
+ # ------------------------------
+ # Display last result (persistent across reruns)
+ # ------------------------------
+
+ result = st.session_state.get("last_result")
+
+ if result:
+     st.subheader("Result")
+
+     st.write(f"**Base site:** {result['site_base_url']}")
+     st.write(f"**About URL:** {result['about_url']}")
+     st.write(f"**Page title:** {result['title']}")
+     st.write(f"**Headshot URL:** {result['author_image_url']}")
+     st.write(f"**Saved file:** {result['local_path']}")
+
+     local_path = result.get("local_path")
+
+     if local_path:
+         st.image(local_path, caption="Detected headshot", width=350)
+
+         # Download button – this will trigger a rerun,
+         # but the result is preserved in st.session_state
+         try:
+             with open(local_path, "rb") as f:
+                 img_bytes = f.read()
+
+             st.download_button(
+                 "⬇️ Download Image",
+                 data=img_bytes,
+                 file_name=os.path.basename(local_path),
+                 mime="image/jpeg",
+             )
+         except Exception as e:
+             st.warning(f"Could not prepare download: {e}")
+     else:
+         st.warning("No headshot found for this site.")
+
+
+ # ------------------------------
+ # Custom GPT helper
+ # ------------------------------
+
+ st.divider()
+ st.header("Creator Catalog GPT")
+ st.caption(
+     "Chat with the custom GPT using your OpenAI credentials. "
+     "Set REPO_SECRET_OPENAI_API_KEY (and optional OPENAI_BASE_URL, CUSTOM_GPT_MODEL, "
+     "CUSTOM_GPT_INSTRUCTIONS) as secrets in the Hugging Face Space."
+ )
+
+ prompt = st.text_area(
+     "Ask the GPT a question",
+     key="gpt_prompt",
+     placeholder="E.g., summarize the most recent scraping result",
+ )
+
+ if st.button("Send to GPT"):
+     if not prompt.strip():
+         st.error("Please enter a question or prompt for the GPT.")
+     else:
+         try:
+             client = CustomGPT()
+             reply = client.run(prompt, history=st.session_state["chat_history"])
+
+             st.session_state["chat_history"].extend(
+                 [
+                     {"role": "user", "content": prompt},
+                     {"role": "assistant", "content": reply},
+                 ]
+             )
+         except Exception as e:
+             st.error(f"GPT request failed: {e}")
+
+ if st.session_state["chat_history"]:
+     st.subheader("Conversation")
+     for message in st.session_state["chat_history"]:
+         prefix = "You" if message["role"] == "user" else "GPT"
+         st.markdown(f"**{prefix}:** {message['content']}")
style.css DELETED
@@ -1,28 +0,0 @@
- body {
-     padding: 2rem;
-     font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
- }
-
- h1 {
-     font-size: 16px;
-     margin-top: 0;
- }
-
- p {
-     color: rgb(107, 114, 128);
-     font-size: 15px;
-     margin-bottom: 10px;
-     margin-top: 5px;
- }
-
- .card {
-     max-width: 620px;
-     margin: 0 auto;
-     padding: 16px;
-     border: 1px solid lightgray;
-     border-radius: 16px;
- }
-
- .card p:last-child {
-     margin-bottom: 0;
- }