Spaces:
Sleeping
Sleeping
acarey5 committed on
Commit ·
70f34e2
1
Parent(s): cde0f3c
small fix
Browse files- app.py +2 -3
- src/jobs/company_loader.py +8 -79
app.py
CHANGED
|
@@ -759,9 +759,8 @@ def analyze_resume(
|
|
| 759 |
msg = (
|
| 760 |
f"Loaded {total_loaded} companies but none have a usable careers URL. "
|
| 761 |
f"CSV columns detected: {col_sample}. "
|
| 762 |
-
"
|
| 763 |
-
"
|
| 764 |
-
"will be used as a fallback when available."
|
| 765 |
)
|
| 766 |
print("[analyze] WARNING:", msg)
|
| 767 |
return (
|
|
|
|
| 759 |
msg = (
|
| 760 |
f"Loaded {total_loaded} companies but none have a usable careers URL. "
|
| 761 |
f"CSV columns detected: {col_sample}. "
|
| 762 |
+
"This app now reads only the opening page column (col 4 / 'Direct links to company career/job openings page'). "
|
| 763 |
+
"Add valid https URLs in that column."
|
|
|
|
| 764 |
)
|
| 765 |
print("[analyze] WARNING:", msg)
|
| 766 |
return (
|
src/jobs/company_loader.py
CHANGED
|
@@ -7,34 +7,6 @@ from src.models import CompanyRecord
|
|
| 7 |
|
| 8 |
|
| 9 |
COMPANY_KEYS = ["company", "company list", "name", "employer", "organization"]
|
| 10 |
-
CAREERS_KEYS = [
|
| 11 |
-
"careers_url",
|
| 12 |
-
"career url",
|
| 13 |
-
"jobs_url",
|
| 14 |
-
"job board",
|
| 15 |
-
"direct links to company career/job openings page",
|
| 16 |
-
"direct links to company career/job openings page ", # trailing-space variant
|
| 17 |
-
]
|
| 18 |
-
OPENINGS_KEYS = [
|
| 19 |
-
# Literal column 4 header used in NSBE CSVs
|
| 20 |
-
"column 4",
|
| 21 |
-
# Full openings header variants
|
| 22 |
-
"direct links to company career/job openings page",
|
| 23 |
-
"direct links to company career/job openings page ",
|
| 24 |
-
# Common generic names
|
| 25 |
-
"careers page",
|
| 26 |
-
"careers link",
|
| 27 |
-
"job postings",
|
| 28 |
-
"job openings",
|
| 29 |
-
"openings url",
|
| 30 |
-
"open positions url",
|
| 31 |
-
]
|
| 32 |
-
|
| 33 |
-
# Substring tokens checked against every column name when no exact match is found.
|
| 34 |
-
_URL_COLUMN_TOKENS = [
|
| 35 |
-
"career", "job", "opening", "openings", "position", "positions",
|
| 36 |
-
"roles", "link", "url", "apply", "recruit",
|
| 37 |
-
]
|
| 38 |
|
| 39 |
|
| 40 |
def _normalize_headers(row: Dict[str, str]) -> Dict[str, str]:
|
|
@@ -55,42 +27,6 @@ def _extract_first_http(text: str) -> str:
|
|
| 55 |
return match.group(0).rstrip(")],.;\"") if match else ""
|
| 56 |
|
| 57 |
|
| 58 |
-
def _first_url(row: Dict[str, str], *, _debug: bool = False) -> str:
|
| 59 |
-
"""Return the first HTTP(S) URL found in a CSV row.
|
| 60 |
-
|
| 61 |
-
Search order:
|
| 62 |
-
1. Exact match against OPENINGS_KEYS / CAREERS_KEYS (already lowercase).
|
| 63 |
-
2. Substring scan of every column key using _URL_COLUMN_TOKENS.
|
| 64 |
-
3. Last-resort: scan ALL column values for any https?:// URL.
|
| 65 |
-
"""
|
| 66 |
-
# Pass 1 — exact-key lookup (lists already lower-cased)
|
| 67 |
-
for key in OPENINGS_KEYS + CAREERS_KEYS:
|
| 68 |
-
url = _extract_first_http(row.get(key, ""))
|
| 69 |
-
if url:
|
| 70 |
-
if _debug:
|
| 71 |
-
print(f" [_first_url] hit (exact) key={key!r}")
|
| 72 |
-
return url
|
| 73 |
-
|
| 74 |
-
# Pass 2 — case-insensitive substring scan of every column name
|
| 75 |
-
for key, value in row.items():
|
| 76 |
-
if any(token in key for token in _URL_COLUMN_TOKENS):
|
| 77 |
-
url = _extract_first_http(value)
|
| 78 |
-
if url:
|
| 79 |
-
if _debug:
|
| 80 |
-
print(f" [_first_url] hit (token) key={key!r}")
|
| 81 |
-
return url
|
| 82 |
-
|
| 83 |
-
# Pass 3 — last resort: scan every column value for an HTTP URL
|
| 84 |
-
for key, value in row.items():
|
| 85 |
-
url = _extract_first_http(value)
|
| 86 |
-
if url:
|
| 87 |
-
if _debug:
|
| 88 |
-
print(f" [_first_url] hit (fallback) key={key!r}")
|
| 89 |
-
return url
|
| 90 |
-
|
| 91 |
-
return ""
|
| 92 |
-
|
| 93 |
-
|
| 94 |
def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
| 95 |
print(f"[company_loader] Reading CSV: {csv_path}")
|
| 96 |
companies: List[CompanyRecord] = []
|
|
@@ -107,14 +43,11 @@ def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
|
| 107 |
|
| 108 |
# Positional column mapping (0-indexed):
|
| 109 |
# col 0 — company name
|
| 110 |
-
# col 3 —
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
direct_url_key = normalized_cols[3] if len(normalized_cols) > 3 else None # col D
|
| 114 |
-
preconf_key = normalized_cols[1] if len(normalized_cols) > 1 else None # col B fallback
|
| 115 |
print(f"[company_loader] name col: index=0 key={name_key!r}")
|
| 116 |
-
print(f"[company_loader] url col: index=3 key={
|
| 117 |
-
print(f"[company_loader] url col: index=1 key={preconf_key!r} (fallback)")
|
| 118 |
|
| 119 |
total_rows = 0
|
| 120 |
skipped_no_name = 0
|
|
@@ -129,17 +62,13 @@ def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
|
| 129 |
skipped_no_name += 1
|
| 130 |
continue
|
| 131 |
|
| 132 |
-
# URL
|
| 133 |
-
careers_url = (
|
| 134 |
-
_extract_first_http(row.get(direct_url_key, ""))
|
| 135 |
-
or _extract_first_http(row.get(preconf_key, ""))
|
| 136 |
-
)
|
| 137 |
if not careers_url:
|
| 138 |
skipped_no_url += 1
|
| 139 |
if skipped_no_url <= 5:
|
| 140 |
-
d_val = row.get(
|
| 141 |
-
|
| 142 |
-
print(f"[company_loader] No URL for '{company}' — col4={d_val!r} col2={p_val!r}")
|
| 143 |
|
| 144 |
companies.append(
|
| 145 |
CompanyRecord(
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
COMPANY_KEYS = ["company", "company list", "name", "employer", "organization"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def _normalize_headers(row: Dict[str, str]) -> Dict[str, str]:
|
|
|
|
| 27 |
return match.group(0).rstrip(")],.;\"") if match else ""
|
| 28 |
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
|
| 31 |
print(f"[company_loader] Reading CSV: {csv_path}")
|
| 32 |
companies: List[CompanyRecord] = []
|
|
|
|
| 43 |
|
| 44 |
# Positional column mapping (0-indexed):
|
| 45 |
# col 0 — company name
|
| 46 |
+
# col 3 — Direct links to company career/job openings page (ONLY URL source)
|
| 47 |
+
name_key = normalized_cols[0] if len(normalized_cols) > 0 else None
|
| 48 |
+
opening_page_key = normalized_cols[3] if len(normalized_cols) > 3 else None
|
|
|
|
|
|
|
| 49 |
print(f"[company_loader] name col: index=0 key={name_key!r}")
|
| 50 |
+
print(f"[company_loader] url col: index=3 key={opening_page_key!r} (only source)")
|
|
|
|
| 51 |
|
| 52 |
total_rows = 0
|
| 53 |
skipped_no_name = 0
|
|
|
|
| 62 |
skipped_no_name += 1
|
| 63 |
continue
|
| 64 |
|
| 65 |
+
# URL comes ONLY from column 4 (index 3). Ignore all other columns.
|
| 66 |
+
careers_url = _extract_first_http(row.get(opening_page_key, "")) if opening_page_key else ""
|
|
|
|
|
|
|
|
|
|
| 67 |
if not careers_url:
|
| 68 |
skipped_no_url += 1
|
| 69 |
if skipped_no_url <= 5:
|
| 70 |
+
d_val = row.get(opening_page_key, "<missing>") if opening_page_key else "<no col4>"
|
| 71 |
+
print(f"[company_loader] No URL for '{company}' — col4={d_val!r}")
|
|
|
|
| 72 |
|
| 73 |
companies.append(
|
| 74 |
CompanyRecord(
|