Spaces:
Sleeping
Sleeping
github-actions[bot]
committed on
Commit
·
88cc214
1
Parent(s):
4adc9cf
sync: automatic content update from github
Browse files
- src/gpt.py +2 -2
- src/headshot_scraper.py +38 -22
src/gpt.py
CHANGED
|
@@ -37,12 +37,12 @@ User must request it.
|
|
| 37 |
3. Approved Data Sources
|
| 38 |
The GPT may only use the following uploaded files (all stored in the `/src` directory of this repo; use those exact paths when loading data):
|
| 39 |
• /src/data.csv
|
| 40 |
-
Primary creator catalog: handles, verticals, formats, follower counts, pageviews, demographics, links, opt-in status, advertiser concerns, brand avoidance flags.
|
| 41 |
• /src/brandpositioninglibrary.txt
|
| 42 |
Brand vertical → subvertical → content priorities. Used for vertical fit scoring.
|
| 43 |
• /src/creatorkeywords.csv
|
| 44 |
Maps creators to brand-relevant keywords used for keyword alignment scoring.
|
| 45 |
-
No other data may be invented. Only these files exist.
|
| 46 |
|
| 47 |
4. Mandatory Initial Filters
|
| 48 |
Applied before anything else:
|
|
|
|
| 37 |
3. Approved Data Sources
|
| 38 |
The GPT may only use the following uploaded files (all stored in the `/src` directory of this repo; use those exact paths when loading data):
|
| 39 |
• /src/data.csv
|
| 40 |
+
Primary creator catalog: handles, verticals, formats, follower counts, pageviews, demographics, links, opt-in status, advertiser concerns, brand avoidance flags. The only sites you may return must be listed in the "Site Name" column of this file, and you must match the site name exactly as written there.
|
| 41 |
• /src/brandpositioninglibrary.txt
|
| 42 |
Brand vertical → subvertical → content priorities. Used for vertical fit scoring.
|
| 43 |
• /src/creatorkeywords.csv
|
| 44 |
Maps creators to brand-relevant keywords used for keyword alignment scoring.
|
| 45 |
+
No other data may be invented or referenced. Only these files exist, and only these sources may be used. If a user asks for a site or data not present in these files, respond that it is unavailable.
|
| 46 |
|
| 47 |
4. Mandatory Initial Filters
|
| 48 |
Applied before anything else:
|
src/headshot_scraper.py
CHANGED
|
@@ -17,6 +17,7 @@ HEADERS = {
|
|
| 17 |
|
| 18 |
# ------------------ Basic utils ------------------ #
|
| 19 |
|
|
|
|
| 20 |
def validate_url(url: str) -> str:
|
| 21 |
if not url or not isinstance(url, str):
|
| 22 |
raise ValueError("URL must be a non-empty string.")
|
|
@@ -110,7 +111,9 @@ AUTHOR_SELECTORS: List[str] = [
|
|
| 110 |
]
|
| 111 |
|
| 112 |
|
| 113 |
-
def extract_author_candidates(
|
|
|
|
|
|
|
| 114 |
"""
|
| 115 |
Find images likely to be author / headshot images.
|
| 116 |
Returns a list of dicts:
|
|
@@ -119,7 +122,9 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
|
|
| 119 |
candidates: List[Dict[str, Any]] = []
|
| 120 |
seen = set()
|
| 121 |
|
| 122 |
-
def add_image(
|
|
|
|
|
|
|
| 123 |
if not src:
|
| 124 |
return
|
| 125 |
abs_url = normalize_url(src, base_url)
|
|
@@ -148,15 +153,17 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
|
|
| 148 |
pass
|
| 149 |
|
| 150 |
seen.add(abs_url)
|
| 151 |
-
candidates.append(
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
| 160 |
|
| 161 |
# 1) Inside author/about-ish containers
|
| 162 |
for selector in AUTHOR_SELECTORS:
|
|
@@ -207,7 +214,10 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
|
|
| 207 |
|
| 208 |
# ------------------ Size-based filtering via Pillow ------------------ #
|
| 209 |
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
| 211 |
"""
|
| 212 |
Fetch the image and read dimensions with Pillow.
|
| 213 |
Returns (width, height) or (None, None) on failure.
|
|
@@ -223,8 +233,7 @@ def measure_image_dimensions(url: str, timeout: int = 10) -> (Optional[int], Opt
|
|
| 223 |
|
| 224 |
|
| 225 |
def refine_candidates_with_dimensions(
|
| 226 |
-
candidates: List[Dict[str, Any]],
|
| 227 |
-
max_to_check: int = 10
|
| 228 |
) -> List[Dict[str, Any]]:
|
| 229 |
"""
|
| 230 |
For up to `max_to_check` candidates, compute real width/height.
|
|
@@ -261,7 +270,9 @@ def refine_candidates_with_dimensions(
|
|
| 261 |
return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
|
| 262 |
|
| 263 |
|
| 264 |
-
def pick_best_author_image(
|
|
|
|
|
|
|
| 265 |
if not candidates:
|
| 266 |
return None
|
| 267 |
return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
|
|
@@ -269,6 +280,7 @@ def pick_best_author_image(candidates: List[Dict[str, Any]]) -> Optional[Dict[st
|
|
| 269 |
|
| 270 |
# ------------------ Main entry: scrape_author_image ------------------ #
|
| 271 |
|
|
|
|
| 272 |
def scrape_author_image(url: str) -> Dict[str, Any]:
|
| 273 |
"""
|
| 274 |
Given a URL, return the most likely author/headshot image.
|
|
@@ -307,10 +319,9 @@ def scrape_author_image(url: str) -> Dict[str, Any]:
|
|
| 307 |
|
| 308 |
# ------------------ Download helpers ------------------ #
|
| 309 |
|
|
|
|
| 310 |
def download_image(
|
| 311 |
-
image_url: str,
|
| 312 |
-
out_dir: str = "author_images",
|
| 313 |
-
filename: Optional[str] = None
|
| 314 |
) -> str:
|
| 315 |
"""
|
| 316 |
Download a single image URL to `out_dir`, return local file path.
|
|
@@ -327,7 +338,10 @@ def download_image(
|
|
| 327 |
base_name = fallback
|
| 328 |
|
| 329 |
# Ensure file extension
|
| 330 |
-
if not any(
|
|
|
|
|
|
|
|
|
|
| 331 |
base_name += ".jpg"
|
| 332 |
|
| 333 |
file_path = os.path.join(out_dir, base_name)
|
|
@@ -341,7 +355,9 @@ def download_image(
|
|
| 341 |
return file_path
|
| 342 |
|
| 343 |
|
| 344 |
-
def download_author_image(
|
|
|
|
|
|
|
| 345 |
"""
|
| 346 |
High-level helper:
|
| 347 |
1. Scrape the page for the best author/headshot image.
|
|
@@ -367,6 +383,7 @@ def download_author_image(page_url: str, out_dir: str = "author_images") -> Dict
|
|
| 367 |
|
| 368 |
# ---------- Site normalization & about-page discovery ---------- #
|
| 369 |
|
|
|
|
| 370 |
def normalize_site_input(site: str) -> str:
|
| 371 |
"""
|
| 372 |
Allow user to input:
|
|
@@ -521,8 +538,7 @@ def clean_site_name(site: str) -> str:
|
|
| 521 |
|
| 522 |
|
| 523 |
def download_author_image_for_site(
|
| 524 |
-
site_input: str,
|
| 525 |
-
out_dir: str = "author_images"
|
| 526 |
) -> Dict[str, Any]:
|
| 527 |
"""
|
| 528 |
1. Convert site input into a normalized base URL.
|
|
|
|
| 17 |
|
| 18 |
# ------------------ Basic utils ------------------ #
|
| 19 |
|
| 20 |
+
|
| 21 |
def validate_url(url: str) -> str:
|
| 22 |
if not url or not isinstance(url, str):
|
| 23 |
raise ValueError("URL must be a non-empty string.")
|
|
|
|
| 111 |
]
|
| 112 |
|
| 113 |
|
| 114 |
+
def extract_author_candidates(
|
| 115 |
+
soup: BeautifulSoup, base_url: str
|
| 116 |
+
) -> List[Dict[str, Any]]:
|
| 117 |
"""
|
| 118 |
Find images likely to be author / headshot images.
|
| 119 |
Returns a list of dicts:
|
|
|
|
| 122 |
candidates: List[Dict[str, Any]] = []
|
| 123 |
seen = set()
|
| 124 |
|
| 125 |
+
def add_image(
|
| 126 |
+
src: Optional[str], source: str, score_boost: float = 0.0, tag=None
|
| 127 |
+
) -> None:
|
| 128 |
if not src:
|
| 129 |
return
|
| 130 |
abs_url = normalize_url(src, base_url)
|
|
|
|
| 153 |
pass
|
| 154 |
|
| 155 |
seen.add(abs_url)
|
| 156 |
+
candidates.append(
|
| 157 |
+
{
|
| 158 |
+
"url": abs_url,
|
| 159 |
+
"score": score,
|
| 160 |
+
"source": source,
|
| 161 |
+
"width_attr": width_attr,
|
| 162 |
+
"height_attr": height_attr,
|
| 163 |
+
"width_px": None,
|
| 164 |
+
"height_px": None,
|
| 165 |
+
}
|
| 166 |
+
)
|
| 167 |
|
| 168 |
# 1) Inside author/about-ish containers
|
| 169 |
for selector in AUTHOR_SELECTORS:
|
|
|
|
| 214 |
|
| 215 |
# ------------------ Size-based filtering via Pillow ------------------ #
|
| 216 |
|
| 217 |
+
|
| 218 |
+
def measure_image_dimensions(
|
| 219 |
+
url: str, timeout: int = 10
|
| 220 |
+
) -> (Optional[int], Optional[int]):
|
| 221 |
"""
|
| 222 |
Fetch the image and read dimensions with Pillow.
|
| 223 |
Returns (width, height) or (None, None) on failure.
|
|
|
|
| 233 |
|
| 234 |
|
| 235 |
def refine_candidates_with_dimensions(
|
| 236 |
+
candidates: List[Dict[str, Any]], max_to_check: int = 10
|
|
|
|
| 237 |
) -> List[Dict[str, Any]]:
|
| 238 |
"""
|
| 239 |
For up to `max_to_check` candidates, compute real width/height.
|
|
|
|
| 270 |
return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
|
| 271 |
|
| 272 |
|
| 273 |
+
def pick_best_author_image(
|
| 274 |
+
candidates: List[Dict[str, Any]],
|
| 275 |
+
) -> Optional[Dict[str, Any]]:
|
| 276 |
if not candidates:
|
| 277 |
return None
|
| 278 |
return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
|
|
|
|
| 280 |
|
| 281 |
# ------------------ Main entry: scrape_author_image ------------------ #
|
| 282 |
|
| 283 |
+
|
| 284 |
def scrape_author_image(url: str) -> Dict[str, Any]:
|
| 285 |
"""
|
| 286 |
Given a URL, return the most likely author/headshot image.
|
|
|
|
| 319 |
|
| 320 |
# ------------------ Download helpers ------------------ #
|
| 321 |
|
| 322 |
+
|
| 323 |
def download_image(
|
| 324 |
+
image_url: str, out_dir: str = "author_images", filename: Optional[str] = None
|
|
|
|
|
|
|
| 325 |
) -> str:
|
| 326 |
"""
|
| 327 |
Download a single image URL to `out_dir`, return local file path.
|
|
|
|
| 338 |
base_name = fallback
|
| 339 |
|
| 340 |
# Ensure file extension
|
| 341 |
+
if not any(
|
| 342 |
+
base_name.lower().endswith(ext)
|
| 343 |
+
for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]
|
| 344 |
+
):
|
| 345 |
base_name += ".jpg"
|
| 346 |
|
| 347 |
file_path = os.path.join(out_dir, base_name)
|
|
|
|
| 355 |
return file_path
|
| 356 |
|
| 357 |
|
| 358 |
+
def download_author_image(
|
| 359 |
+
page_url: str, out_dir: str = "author_images"
|
| 360 |
+
) -> Dict[str, Any]:
|
| 361 |
"""
|
| 362 |
High-level helper:
|
| 363 |
1. Scrape the page for the best author/headshot image.
|
|
|
|
| 383 |
|
| 384 |
# ---------- Site normalization & about-page discovery ---------- #
|
| 385 |
|
| 386 |
+
|
| 387 |
def normalize_site_input(site: str) -> str:
|
| 388 |
"""
|
| 389 |
Allow user to input:
|
|
|
|
| 538 |
|
| 539 |
|
| 540 |
def download_author_image_for_site(
|
| 541 |
+
site_input: str, out_dir: str = "author_images"
|
|
|
|
| 542 |
) -> Dict[str, Any]:
|
| 543 |
"""
|
| 544 |
1. Convert site input into a normalized base URL.
|