Spaces:
Sleeping
Sleeping
acarey5 committed on
Commit ·
70f34e2
1
Parent(s): cde0f3c
small fix
Browse files- app.py +2 -3
- src/jobs/company_loader.py +8 -79
app.py
CHANGED
|
@@ -759,9 +759,8 @@ def analyze_resume(
|
|
| 759 |
msg = (
|
| 760 |
f"Loaded {total_loaded} companies but none have a usable careers URL. "
|
| 761 |
f"CSV columns detected: {col_sample}. "
|
| 762 |
-
"
|
| 763 |
-
"
|
| 764 |
-
"will be used as a fallback when available."
|
| 765 |
)
|
| 766 |
print("[analyze] WARNING:", msg)
|
| 767 |
return (
|
|
|
|
| 759 |
msg = (
|
| 760 |
f"Loaded {total_loaded} companies but none have a usable careers URL. "
|
| 761 |
f"CSV columns detected: {col_sample}. "
|
| 762 |
+
"This app now reads only the opening page column (col 4 / 'Direct links to company career/job openings page'). "
|
| 763 |
+
"Add valid https URLs in that column."
|
|
|
|
| 764 |
)
|
| 765 |
print("[analyze] WARNING:", msg)
|
| 766 |
return (
|
src/jobs/company_loader.py
CHANGED
|
@@ -7,34 +7,6 @@ from src.models import CompanyRecord
|
|
| 7 |
|
| 8 |
|
| 9 |
COMPANY_KEYS = ["company", "company list", "name", "employer", "organization"]
|
| 10 |
-
CAREERS_KEYS = [
|
| 11 |
-
"careers_url",
|
| 12 |
-
"career url",
|
| 13 |
-
"jobs_url",
|
| 14 |
-
"job board",
|
| 15 |
-
"direct links to company career/job openings page",
|
| 16 |
-
"direct links to company career/job openings page ", # trailing-space variant
|
| 17 |
-
]
|
| 18 |
-
OPENINGS_KEYS = [
|
| 19 |
-
# Literal column 4 header used in NSBE CSVs
|
| 20 |
-
"column 4",
|
| 21 |
-
# Full openings header variants
|
| 22 |
-
"direct links to company career/job openings page",
|
| 23 |
-
"direct links to company career/job openings page ",
|
| 24 |
-
# Common generic names
|
| 25 |
-
"careers page",
|
| 26 |
-
"careers link",
|
| 27 |
-
"job postings",
|
| 28 |
-
"job openings",
|
| 29 |
-
"openings url",
|
| 30 |
-
"open positions url",
|
| 31 |
-
]
|
| 32 |
-
|
| 33 |
-
# Substring tokens checked against every column name when no exact match is found.
|
| 34 |
-
_URL_COLUMN_TOKENS = [
|
| 35 |
-
"career", "job", "opening", "openings", "position", "positions",
|
| 36 |
-
"roles", "link", "url", "apply", "recruit",
|
| 37 |
-
]
|
| 38 |
|
| 39 |
|
| 40 |
def _normalize_headers(row: Dict[str, str]) -> Dict[str, str]:
|
|
@@ -55,42 +27,6 @@ def _extract_first_http(text: str) -> str:
|
|
| 55 |
return match.group(0).rstrip(")],.;\"") if match else ""
|
| 56 |
|
| 57 |
|
| 58 |
-
def _first_url(row: Dict[str, str], *, _debug: bool = False) -> str:
|
| 59 |
-
"""Return the first HTTP(S) URL found in a CSV row.
|
| 60 |
-
|
| 61 |
-
Search order:
|
| 62 |
-
1. Exact match against OPENINGS_KEYS / CAREERS_KEYS (already lowercase).
|
| 63 |
-
2. Substring scan of every column key using _URL_COLUMN_TOKENS.
|
| 64 |
-
3. Last-resort: scan ALL column values for any https?:// URL.
|
| 65 |
-
"""
|
| 66 |
-
# Pass 1 — exact-key lookup (lists already lower-cased)
|
| 67 |
-
for key in OPENINGS_KEYS + CAREERS_KEYS:
|
| 68 |
-
url = _extract_first_http(row.get(key, ""))
|
| 69 |
-
if url:
|
| 70 |
-
if _debug:
|
| 71 |
-
print(f" [_first_url] hit (exact) key={key!r}")
|
| 72 |
-
return url
|
| 73 |
-
|
| 74 |
-
# Pass 2 — case-insensitive substring scan of every column name
|
| 75 |
-
for key, value in row.items():
|
| 76 |
-
if any(token in key for token in _URL_COLUMN_TOKENS):
|
| 77 |
-
url = _extract_first_http(value)
|
| 78 |
-
if url:
|
| 79 |
-
if _debug:
|
| 80 |
-
print(f" [_first_url] hit (token) key={key!r}")
|
| 81 |
-
return url
|
| 82 |
-
|
| 83 |
-
# Pass 3 — last resort: scan every column value for an HTTP URL
|
| 84 |
-
for key, value in row.items():
|
| 85 |
-
url = _extract_first_http(value)
|
| 86 |
-
if url:
|
| 87 |
-
if _debug:
|
| 88 |
-
print(f" [_first_url] hit (fallback) key={key!r}")
|
| 89 |
-
return url
|
| 90 |
-
|
| 91 |
-
return ""
|
| 92 |
-
|
| 93 |
-
|
| 94 |
def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
| 95 |
print(f"[company_loader] Reading CSV: {csv_path}")
|
| 96 |
companies: List[CompanyRecord] = []
|
|
@@ -107,14 +43,11 @@ def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
|
| 107 |
|
| 108 |
# Positional column mapping (0-indexed):
|
| 109 |
# col 0 — company name
|
| 110 |
-
# col 3 —
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
direct_url_key = normalized_cols[3] if len(normalized_cols) > 3 else None # col D
|
| 114 |
-
preconf_key = normalized_cols[1] if len(normalized_cols) > 1 else None # col B fallback
|
| 115 |
print(f"[company_loader] name col: index=0 key={name_key!r}")
|
| 116 |
-
print(f"[company_loader] url col: index=3 key={
|
| 117 |
-
print(f"[company_loader] url col: index=1 key={preconf_key!r} (fallback)")
|
| 118 |
|
| 119 |
total_rows = 0
|
| 120 |
skipped_no_name = 0
|
|
@@ -129,17 +62,13 @@ def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
|
| 129 |
skipped_no_name += 1
|
| 130 |
continue
|
| 131 |
|
| 132 |
-
# URL
|
| 133 |
-
careers_url = (
|
| 134 |
-
_extract_first_http(row.get(direct_url_key, ""))
|
| 135 |
-
or _extract_first_http(row.get(preconf_key, ""))
|
| 136 |
-
)
|
| 137 |
if not careers_url:
|
| 138 |
skipped_no_url += 1
|
| 139 |
if skipped_no_url <= 5:
|
| 140 |
-
d_val = row.get(
|
| 141 |
-
|
| 142 |
-
print(f"[company_loader] No URL for '{company}' — col4={d_val!r} col2={p_val!r}")
|
| 143 |
|
| 144 |
companies.append(
|
| 145 |
CompanyRecord(
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
COMPANY_KEYS = ["company", "company list", "name", "employer", "organization"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def _normalize_headers(row: Dict[str, str]) -> Dict[str, str]:
|
|
|
|
| 27 |
return match.group(0).rstrip(")],.;\"") if match else ""
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
| 31 |
print(f"[company_loader] Reading CSV: {csv_path}")
|
| 32 |
companies: List[CompanyRecord] = []
|
|
|
|
| 43 |
|
| 44 |
# Positional column mapping (0-indexed):
|
| 45 |
# col 0 — company name
|
| 46 |
+
# col 3 — Direct links to company career/job openings page (ONLY URL source)
|
| 47 |
+
name_key = normalized_cols[0] if len(normalized_cols) > 0 else None
|
| 48 |
+
opening_page_key = normalized_cols[3] if len(normalized_cols) > 3 else None
|
|
|
|
|
|
|
| 49 |
print(f"[company_loader] name col: index=0 key={name_key!r}")
|
| 50 |
+
print(f"[company_loader] url col: index=3 key={opening_page_key!r} (only source)")
|
|
|
|
| 51 |
|
| 52 |
total_rows = 0
|
| 53 |
skipped_no_name = 0
|
|
|
|
| 62 |
skipped_no_name += 1
|
| 63 |
continue
|
| 64 |
|
| 65 |
+
# URL comes ONLY from column 4 (index 3). Ignore all other columns.
|
| 66 |
+
careers_url = _extract_first_http(row.get(opening_page_key, "")) if opening_page_key else ""
|
|
|
|
|
|
|
|
|
|
| 67 |
if not careers_url:
|
| 68 |
skipped_no_url += 1
|
| 69 |
if skipped_no_url <= 5:
|
| 70 |
+
d_val = row.get(opening_page_key, "<missing>") if opening_page_key else "<no col4>"
|
| 71 |
+
print(f"[company_loader] No URL for '{company}' — col4={d_val!r}")
|
|
|
|
| 72 |
|
| 73 |
companies.append(
|
| 74 |
CompanyRecord(
|