Spaces:
Sleeping
Sleeping
github-actions[bot]
committed on
Commit
·
88cc214
1
Parent(s):
4adc9cf
sync: automatic content update from github
Browse files
- src/gpt.py +2 -2
- src/headshot_scraper.py +38 -22
src/gpt.py
CHANGED
|
@@ -37,12 +37,12 @@ User must request it.
|
|
| 37 |
3. Approved Data Sources
|
| 38 |
The GPT may only use the following uploaded files (all stored in the `/src` directory of this repo; use those exact paths when loading data):
|
| 39 |
• /src/data.csv
|
| 40 |
-
Primary creator catalog: handles, verticals, formats, follower counts, pageviews, demographics, links, opt-in status, advertiser concerns, brand avoidance flags.
|
| 41 |
• /src/brandpositioninglibrary.txt
|
| 42 |
Brand vertical → subvertical → content priorities. Used for vertical fit scoring.
|
| 43 |
• /src/creatorkeywords.csv
|
| 44 |
Maps creators to brand-relevant keywords used for keyword alignment scoring.
|
| 45 |
-
No other data may be invented. Only these files exist.
|
| 46 |
|
| 47 |
4. Mandatory Initial Filters
|
| 48 |
Applied before anything else:
|
|
|
|
| 37 |
3. Approved Data Sources
|
| 38 |
The GPT may only use the following uploaded files (all stored in the `/src` directory of this repo; use those exact paths when loading data):
|
| 39 |
• /src/data.csv
|
| 40 |
+
Primary creator catalog: handles, verticals, formats, follower counts, pageviews, demographics, links, opt-in status, advertiser concerns, brand avoidance flags. The only sites you may return must be listed in the "Site Name" column of this file, and you must match the site name exactly as written there.
|
| 41 |
• /src/brandpositioninglibrary.txt
|
| 42 |
Brand vertical → subvertical → content priorities. Used for vertical fit scoring.
|
| 43 |
• /src/creatorkeywords.csv
|
| 44 |
Maps creators to brand-relevant keywords used for keyword alignment scoring.
|
| 45 |
+
No other data may be invented or referenced. Only these files exist, and only these sources may be used. If a user asks for a site or data not present in these files, respond that it is unavailable.
|
| 46 |
|
| 47 |
4. Mandatory Initial Filters
|
| 48 |
Applied before anything else:
|
src/headshot_scraper.py
CHANGED
|
@@ -17,6 +17,7 @@ HEADERS = {
|
|
| 17 |
|
| 18 |
# ------------------ Basic utils ------------------ #
|
| 19 |
|
|
|
|
| 20 |
def validate_url(url: str) -> str:
|
| 21 |
if not url or not isinstance(url, str):
|
| 22 |
raise ValueError("URL must be a non-empty string.")
|
|
@@ -110,7 +111,9 @@ AUTHOR_SELECTORS: List[str] = [
|
|
| 110 |
]
|
| 111 |
|
| 112 |
|
| 113 |
-
def extract_author_candidates(
|
|
|
|
|
|
|
| 114 |
"""
|
| 115 |
Find images likely to be author / headshot images.
|
| 116 |
Returns a list of dicts:
|
|
@@ -119,7 +122,9 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
|
|
| 119 |
candidates: List[Dict[str, Any]] = []
|
| 120 |
seen = set()
|
| 121 |
|
| 122 |
-
def add_image(
|
|
|
|
|
|
|
| 123 |
if not src:
|
| 124 |
return
|
| 125 |
abs_url = normalize_url(src, base_url)
|
|
@@ -148,15 +153,17 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
|
|
| 148 |
pass
|
| 149 |
|
| 150 |
seen.add(abs_url)
|
| 151 |
-
candidates.append(
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
| 160 |
|
| 161 |
# 1) Inside author/about-ish containers
|
| 162 |
for selector in AUTHOR_SELECTORS:
|
|
@@ -207,7 +214,10 @@ def extract_author_candidates(soup: BeautifulSoup, base_url: str) -> List[Dict[s
|
|
| 207 |
|
| 208 |
# ------------------ Size-based filtering via Pillow ------------------ #
|
| 209 |
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
| 211 |
"""
|
| 212 |
Fetch the image and read dimensions with Pillow.
|
| 213 |
Returns (width, height) or (None, None) on failure.
|
|
@@ -223,8 +233,7 @@ def measure_image_dimensions(url: str, timeout: int = 10) -> (Optional[int], Opt
|
|
| 223 |
|
| 224 |
|
| 225 |
def refine_candidates_with_dimensions(
|
| 226 |
-
candidates: List[Dict[str, Any]],
|
| 227 |
-
max_to_check: int = 10
|
| 228 |
) -> List[Dict[str, Any]]:
|
| 229 |
"""
|
| 230 |
For up to `max_to_check` candidates, compute real width/height.
|
|
@@ -261,7 +270,9 @@ def refine_candidates_with_dimensions(
|
|
| 261 |
return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
|
| 262 |
|
| 263 |
|
| 264 |
-
def pick_best_author_image(
|
|
|
|
|
|
|
| 265 |
if not candidates:
|
| 266 |
return None
|
| 267 |
return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
|
|
@@ -269,6 +280,7 @@ def pick_best_author_image(candidates: List[Dict[str, Any]]) -> Optional[Dict[st
|
|
| 269 |
|
| 270 |
# ------------------ Main entry: scrape_author_image ------------------ #
|
| 271 |
|
|
|
|
| 272 |
def scrape_author_image(url: str) -> Dict[str, Any]:
|
| 273 |
"""
|
| 274 |
Given a URL, return the most likely author/headshot image.
|
|
@@ -307,10 +319,9 @@ def scrape_author_image(url: str) -> Dict[str, Any]:
|
|
| 307 |
|
| 308 |
# ------------------ Download helpers ------------------ #
|
| 309 |
|
|
|
|
| 310 |
def download_image(
|
| 311 |
-
image_url: str,
|
| 312 |
-
out_dir: str = "author_images",
|
| 313 |
-
filename: Optional[str] = None
|
| 314 |
) -> str:
|
| 315 |
"""
|
| 316 |
Download a single image URL to `out_dir`, return local file path.
|
|
@@ -327,7 +338,10 @@ def download_image(
|
|
| 327 |
base_name = fallback
|
| 328 |
|
| 329 |
# Ensure file extension
|
| 330 |
-
if not any(
|
|
|
|
|
|
|
|
|
|
| 331 |
base_name += ".jpg"
|
| 332 |
|
| 333 |
file_path = os.path.join(out_dir, base_name)
|
|
@@ -341,7 +355,9 @@ def download_image(
|
|
| 341 |
return file_path
|
| 342 |
|
| 343 |
|
| 344 |
-
def download_author_image(
|
|
|
|
|
|
|
| 345 |
"""
|
| 346 |
High-level helper:
|
| 347 |
1. Scrape the page for the best author/headshot image.
|
|
@@ -367,6 +383,7 @@ def download_author_image(page_url: str, out_dir: str = "author_images") -> Dict
|
|
| 367 |
|
| 368 |
# ---------- Site normalization & about-page discovery ---------- #
|
| 369 |
|
|
|
|
| 370 |
def normalize_site_input(site: str) -> str:
|
| 371 |
"""
|
| 372 |
Allow user to input:
|
|
@@ -521,8 +538,7 @@ def clean_site_name(site: str) -> str:
|
|
| 521 |
|
| 522 |
|
| 523 |
def download_author_image_for_site(
|
| 524 |
-
site_input: str,
|
| 525 |
-
out_dir: str = "author_images"
|
| 526 |
) -> Dict[str, Any]:
|
| 527 |
"""
|
| 528 |
1. Convert site input into a normalized base URL.
|
|
|
|
| 17 |
|
| 18 |
# ------------------ Basic utils ------------------ #
|
| 19 |
|
| 20 |
+
|
| 21 |
def validate_url(url: str) -> str:
|
| 22 |
if not url or not isinstance(url, str):
|
| 23 |
raise ValueError("URL must be a non-empty string.")
|
|
|
|
| 111 |
]
|
| 112 |
|
| 113 |
|
| 114 |
+
def extract_author_candidates(
|
| 115 |
+
soup: BeautifulSoup, base_url: str
|
| 116 |
+
) -> List[Dict[str, Any]]:
|
| 117 |
"""
|
| 118 |
Find images likely to be author / headshot images.
|
| 119 |
Returns a list of dicts:
|
|
|
|
| 122 |
candidates: List[Dict[str, Any]] = []
|
| 123 |
seen = set()
|
| 124 |
|
| 125 |
+
def add_image(
|
| 126 |
+
src: Optional[str], source: str, score_boost: float = 0.0, tag=None
|
| 127 |
+
) -> None:
|
| 128 |
if not src:
|
| 129 |
return
|
| 130 |
abs_url = normalize_url(src, base_url)
|
|
|
|
| 153 |
pass
|
| 154 |
|
| 155 |
seen.add(abs_url)
|
| 156 |
+
candidates.append(
|
| 157 |
+
{
|
| 158 |
+
"url": abs_url,
|
| 159 |
+
"score": score,
|
| 160 |
+
"source": source,
|
| 161 |
+
"width_attr": width_attr,
|
| 162 |
+
"height_attr": height_attr,
|
| 163 |
+
"width_px": None,
|
| 164 |
+
"height_px": None,
|
| 165 |
+
}
|
| 166 |
+
)
|
| 167 |
|
| 168 |
# 1) Inside author/about-ish containers
|
| 169 |
for selector in AUTHOR_SELECTORS:
|
|
|
|
| 214 |
|
| 215 |
# ------------------ Size-based filtering via Pillow ------------------ #
|
| 216 |
|
| 217 |
+
|
| 218 |
+
def measure_image_dimensions(
|
| 219 |
+
url: str, timeout: int = 10
|
| 220 |
+
) -> (Optional[int], Optional[int]):
|
| 221 |
"""
|
| 222 |
Fetch the image and read dimensions with Pillow.
|
| 223 |
Returns (width, height) or (None, None) on failure.
|
|
|
|
| 233 |
|
| 234 |
|
| 235 |
def refine_candidates_with_dimensions(
|
| 236 |
+
candidates: List[Dict[str, Any]], max_to_check: int = 10
|
|
|
|
| 237 |
) -> List[Dict[str, Any]]:
|
| 238 |
"""
|
| 239 |
For up to `max_to_check` candidates, compute real width/height.
|
|
|
|
| 270 |
return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)
|
| 271 |
|
| 272 |
|
| 273 |
+
def pick_best_author_image(
|
| 274 |
+
candidates: List[Dict[str, Any]],
|
| 275 |
+
) -> Optional[Dict[str, Any]]:
|
| 276 |
if not candidates:
|
| 277 |
return None
|
| 278 |
return sorted(candidates, key=lambda c: c["score"], reverse=True)[0]
|
|
|
|
| 280 |
|
| 281 |
# ------------------ Main entry: scrape_author_image ------------------ #
|
| 282 |
|
| 283 |
+
|
| 284 |
def scrape_author_image(url: str) -> Dict[str, Any]:
|
| 285 |
"""
|
| 286 |
Given a URL, return the most likely author/headshot image.
|
|
|
|
| 319 |
|
| 320 |
# ------------------ Download helpers ------------------ #
|
| 321 |
|
| 322 |
+
|
| 323 |
def download_image(
|
| 324 |
+
image_url: str, out_dir: str = "author_images", filename: Optional[str] = None
|
|
|
|
|
|
|
| 325 |
) -> str:
|
| 326 |
"""
|
| 327 |
Download a single image URL to `out_dir`, return local file path.
|
|
|
|
| 338 |
base_name = fallback
|
| 339 |
|
| 340 |
# Ensure file extension
|
| 341 |
+
if not any(
|
| 342 |
+
base_name.lower().endswith(ext)
|
| 343 |
+
for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]
|
| 344 |
+
):
|
| 345 |
base_name += ".jpg"
|
| 346 |
|
| 347 |
file_path = os.path.join(out_dir, base_name)
|
|
|
|
| 355 |
return file_path
|
| 356 |
|
| 357 |
|
| 358 |
+
def download_author_image(
|
| 359 |
+
page_url: str, out_dir: str = "author_images"
|
| 360 |
+
) -> Dict[str, Any]:
|
| 361 |
"""
|
| 362 |
High-level helper:
|
| 363 |
1. Scrape the page for the best author/headshot image.
|
|
|
|
| 383 |
|
| 384 |
# ---------- Site normalization & about-page discovery ---------- #
|
| 385 |
|
| 386 |
+
|
| 387 |
def normalize_site_input(site: str) -> str:
|
| 388 |
"""
|
| 389 |
Allow user to input:
|
|
|
|
| 538 |
|
| 539 |
|
| 540 |
def download_author_image_for_site(
|
| 541 |
+
site_input: str, out_dir: str = "author_images"
|
|
|
|
| 542 |
) -> Dict[str, Any]:
|
| 543 |
"""
|
| 544 |
1. Convert site input into a normalized base URL.
|