github-actions[bot] committed on
Commit
88cc214
·
1 Parent(s): 4adc9cf

sync: automatic content update from github

Browse files
Files changed (2) hide show
  1. src/gpt.py +2 -2
  2. src/headshot_scraper.py +38 -22
src/gpt.py CHANGED
@@ -37,12 +37,12 @@ User must request it.
37
  3. Approved Data Sources
38
  The GPT may only use the following uploaded files (all stored in the `/src` directory of this repo; use those exact paths when loading data):
39
  ✔ /src/data.csv
40
- Primary creator catalog: handles, verticals, formats, follower counts, pageviews, demographics, links, opt-in status, advertiser concerns, brand avoidance flags.
41
  ✔ /src/brandpositioninglibrary.txt
42
  Brand vertical → subvertical → content priorities. Used for vertical fit scoring.
43
  ✔ /src/creatorkeywords.csv
44
  Maps creators to brand-relevant keywords used for keyword alignment scoring.
45
- No other data may be invented. Only these files exist.
46
 
47
  4. Mandatory Initial Filters
48
  Applied before anything else:
 
37
  3. Approved Data Sources
38
  The GPT may only use the following uploaded files (all stored in the `/src` directory of this repo; use those exact paths when loading data):
39
  ✔ /src/data.csv
40
+ Primary creator catalog: handles, verticals, formats, follower counts, pageviews, demographics, links, opt-in status, advertiser concerns, brand avoidance flags. The only sites you may return must be listed in the "Site Name" column of this file, and you must match the site name exactly as written there.
41
  ✔ /src/brandpositioninglibrary.txt
42
  Brand vertical → subvertical → content priorities. Used for vertical fit scoring.
43
  ✔ /src/creatorkeywords.csv
44
  Maps creators to brand-relevant keywords used for keyword alignment scoring.
45
+ No other data may be invented or referenced. Only these files exist, and only these sources may be used. If a user asks for a site or data not present in these files, respond that it is unavailable.
46
 
47
  4. Mandatory Initial Filters
48
  Applied before anything else:
src/headshot_scraper.py CHANGED
@@ -17,6 +17,7 @@ HEADERS = {
17
 
18
  # ------------------ Basic utils ------------------ #
19
 
 
20
  def validate_url(url: str) -> str:
21
  if not url or not isinstance(url, str):
22
  raise ValueError("URL must be a non-empty string.")
@@ -110,7 +111,9 @@ AUTHOR_SELECTORS: List[str] = [
110
  ]
111
 
112
 
113
- def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
 
 
114
  """
115
  Find images likely to be author / headshot images.
116
  Returns a list of dicts:
@@ -119,7 +122,9 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
119
  candidates: List[Dict[str, Any]] = []
120
  seen = set()
121
 
122
- def add_image(src: Optional[str], source: str, score_boost: float = 0.0, tag=None) -> None:
 
 
123
  if not src:
124
  return
125
  abs_url = normalize_url(src, base_url)
@@ -148,15 +153,17 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
148
  pass
149
 
150
  seen.add(abs_url)
151
- candidates.append({
152
- "url": abs_url,
153
- "score": score,
154
- "source": source,
155
- "width_attr": width_attr,
156
- "height_attr": height_attr,
157
- "width_px": None,
158
- "height_px": None,
159
- })
 
 
160
 
161
  # 1) Inside author/about-ish containers
162
  for selector in AUTHOR_SELECTORS:
@@ -207,7 +214,10 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
207
 
208
  # ------------------ Size-based filtering via Pillow ------------------ #
209
 
210
- def measure_image_dimensions(url: str, timeout: int = 10) -> (Optional[int], Optional[int]):
 
 
 
211
  """
212
  Fetch the image and read dimensions with Pillow.
213
  Returns (width, height) or (None, None) on failure.
@@ -223,8 +233,7 @@ def measure_image_dimensions(url: str, timeout: int = 10) -> (Optional[int], Opt
223
 
224
 
225
  def refine_candidates_with_dimensions(
226
- candidates: List[Dict[str, Any]],
227
- max_to_check: int = 10
228
  ) -> List[Dict[str, Any]]:
229
  """
230
  For up to `max_to_check` candidates, compute real width/height.
@@ -261,7 +270,9 @@ def refine_candidates_with_dimensions(
261
  return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
262
 
263
 
264
- def pick_best_author_image(candidates: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
 
 
265
  if not candidates:
266
  return None
267
  return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
@@ -269,6 +280,7 @@ def pick_best_author_image(candidates: List[Dict[str, Any]]) -> Optional[Dict[st
269
 
270
  # ------------------ Main entry: scrape_author_image ------------------ #
271
 
 
272
  def scrape_author_image(url: str) -> Dict[str, Any]:
273
  """
274
  Given a URL, return the most likely author/headshot image.
@@ -307,10 +319,9 @@ def scrape_author_image(url: str) -> Dict[str, Any]:
307
 
308
  # ------------------ Download helpers ------------------ #
309
 
 
310
  def download_image(
311
- image_url: str,
312
- out_dir: str = "author_images",
313
- filename: Optional[str] = None
314
  ) -> str:
315
  """
316
  Download a single image URL to `out_dir`, return local file path.
@@ -327,7 +338,10 @@ def download_image(
327
  base_name = fallback
328
 
329
  # Ensure file extension
330
- if not any(base_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]):
 
 
 
331
  base_name += ".jpg"
332
 
333
  file_path = os.path.join(out_dir, base_name)
@@ -341,7 +355,9 @@ def download_image(
341
  return file_path
342
 
343
 
344
- def download_author_image(page_url: str, out_dir: str = "author_images") -> Dict[str, Any]:
 
 
345
  """
346
  High-level helper:
347
  1. Scrape the page for the best author/headshot image.
@@ -367,6 +383,7 @@ def download_author_image(page_url: str, out_dir: str = "author_images") -> Dict
367
 
368
  # ---------- Site normalization & about-page discovery ---------- #
369
 
 
370
  def normalize_site_input(site: str) -> str:
371
  """
372
  Allow user to input:
@@ -521,8 +538,7 @@ def clean_site_name(site: str) -> str:
521
 
522
 
523
  def download_author_image_for_site(
524
- site_input: str,
525
- out_dir: str = "author_images"
526
  ) -> Dict[str, Any]:
527
  """
528
  1. Convert site input into a normalized base URL.
 
17
 
18
  # ------------------ Basic utils ------------------ #
19
 
20
+
21
  def validate_url(url: str) -> str:
22
  if not url or not isinstance(url, str):
23
  raise ValueError("URL must be a non-empty string.")
 
111
  ]
112
 
113
 
114
+ def extract_author_candidates(
115
+ soup: BeautifulSoup, base_url: str
116
+ ) -> List[Dict[str, Any]]:
117
  """
118
  Find images likely to be author / headshot images.
119
  Returns a list of dicts:
 
122
  candidates: List[Dict[str, Any]] = []
123
  seen = set()
124
 
125
+ def add_image(
126
+ src: Optional[str], source: str, score_boost: float = 0.0, tag=None
127
+ ) -> None:
128
  if not src:
129
  return
130
  abs_url = normalize_url(src, base_url)
 
153
  pass
154
 
155
  seen.add(abs_url)
156
+ candidates.append(
157
+ {
158
+ "url": abs_url,
159
+ "score": score,
160
+ "source": source,
161
+ "width_attr": width_attr,
162
+ "height_attr": height_attr,
163
+ "width_px": None,
164
+ "height_px": None,
165
+ }
166
+ )
167
 
168
  # 1) Inside author/about-ish containers
169
  for selector in AUTHOR_SELECTORS:
 
214
 
215
  # ------------------ Size-based filtering via Pillow ------------------ #
216
 
217
+
218
+ def measure_image_dimensions(
219
+ url: str, timeout: int = 10
220
+ ) -> (Optional[int], Optional[int]):
221
  """
222
  Fetch the image and read dimensions with Pillow.
223
  Returns (width, height) or (None, None) on failure.
 
233
 
234
 
235
  def refine_candidates_with_dimensions(
236
+ candidates: List[Dict[str, Any]], max_to_check: int = 10
 
237
  ) -> List[Dict[str, Any]]:
238
  """
239
  For up to `max_to_check` candidates, compute real width/height.
 
270
  return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
271
 
272
 
273
+ def pick_best_author_image(
274
+ candidates: List[Dict[str, Any]],
275
+ ) -> Optional[Dict[str, Any]]:
276
  if not candidates:
277
  return None
278
  return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
 
280
 
281
  # ------------------ Main entry: scrape_author_image ------------------ #
282
 
283
+
284
  def scrape_author_image(url: str) -> Dict[str, Any]:
285
  """
286
  Given a URL, return the most likely author/headshot image.
 
319
 
320
  # ------------------ Download helpers ------------------ #
321
 
322
+
323
  def download_image(
324
+ image_url: str, out_dir: str = "author_images", filename: Optional[str] = None
 
 
325
  ) -> str:
326
  """
327
  Download a single image URL to `out_dir`, return local file path.
 
338
  base_name = fallback
339
 
340
  # Ensure file extension
341
+ if not any(
342
+ base_name.lower().endswith(ext)
343
+ for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]
344
+ ):
345
  base_name += ".jpg"
346
 
347
  file_path = os.path.join(out_dir, base_name)
 
355
  return file_path
356
 
357
 
358
+ def download_author_image(
359
+ page_url: str, out_dir: str = "author_images"
360
+ ) -> Dict[str, Any]:
361
  """
362
  High-level helper:
363
  1. Scrape the page for the best author/headshot image.
 
383
 
384
  # ---------- Site normalization & about-page discovery ---------- #
385
 
386
+
387
  def normalize_site_input(site: str) -> str:
388
  """
389
  Allow user to input:
 
538
 
539
 
540
  def download_author_image_for_site(
541
+ site_input: str, out_dir: str = "author_images"
 
542
  ) -> Dict[str, Any]:
543
  """
544
  1. Convert site input into a normalized base URL.