acarey5 committed on
Commit
70f34e2
·
1 Parent(s): cde0f3c

small fix

Browse files
Files changed (2) hide show
  1. app.py +2 -3
  2. src/jobs/company_loader.py +8 -79
app.py CHANGED
@@ -759,9 +759,8 @@ def analyze_resume(
759
  msg = (
760
  f"Loaded {total_loaded} companies but none have a usable careers URL. "
761
  f"CSV columns detected: {col_sample}. "
762
- "The 'Direct links' column (col 4) appears to be blank in this CSV — "
763
- "add career page URLs there, or the pre-conference links column (col 2) "
764
- "will be used as a fallback when available."
765
  )
766
  print("[analyze] WARNING:", msg)
767
  return (
 
759
  msg = (
760
  f"Loaded {total_loaded} companies but none have a usable careers URL. "
761
  f"CSV columns detected: {col_sample}. "
762
+ "This app now reads only the opening page column (col 4 / 'Direct links to company career/job openings page'). "
763
+ "Add valid https URLs in that column."
 
764
  )
765
  print("[analyze] WARNING:", msg)
766
  return (
src/jobs/company_loader.py CHANGED
@@ -7,34 +7,6 @@ from src.models import CompanyRecord
7
 
8
 
9
  COMPANY_KEYS = ["company", "company list", "name", "employer", "organization"]
10
- CAREERS_KEYS = [
11
- "careers_url",
12
- "career url",
13
- "jobs_url",
14
- "job board",
15
- "direct links to company career/job openings page",
16
- "direct links to company career/job openings page ", # trailing-space variant
17
- ]
18
- OPENINGS_KEYS = [
19
- # Literal column 4 header used in NSBE CSVs
20
- "column 4",
21
- # Full openings header variants
22
- "direct links to company career/job openings page",
23
- "direct links to company career/job openings page ",
24
- # Common generic names
25
- "careers page",
26
- "careers link",
27
- "job postings",
28
- "job openings",
29
- "openings url",
30
- "open positions url",
31
- ]
32
-
33
- # Substring tokens checked against every column name when no exact match is found.
34
- _URL_COLUMN_TOKENS = [
35
- "career", "job", "opening", "openings", "position", "positions",
36
- "roles", "link", "url", "apply", "recruit",
37
- ]
38
 
39
 
40
  def _normalize_headers(row: Dict[str, str]) -> Dict[str, str]:
@@ -55,42 +27,6 @@ def _extract_first_http(text: str) -> str:
55
  return match.group(0).rstrip(")],.;\"") if match else ""
56
 
57
 
58
- def _first_url(row: Dict[str, str], *, _debug: bool = False) -> str:
59
- """Return the first HTTP(S) URL found in a CSV row.
60
-
61
- Search order:
62
- 1. Exact match against OPENINGS_KEYS / CAREERS_KEYS (already lowercase).
63
- 2. Substring scan of every column key using _URL_COLUMN_TOKENS.
64
- 3. Last-resort: scan ALL column values for any https?:// URL.
65
- """
66
- # Pass 1 β€” exact-key lookup (lists already lower-cased)
67
- for key in OPENINGS_KEYS + CAREERS_KEYS:
68
- url = _extract_first_http(row.get(key, ""))
69
- if url:
70
- if _debug:
71
- print(f" [_first_url] hit (exact) key={key!r}")
72
- return url
73
-
74
- # Pass 2 β€” case-insensitive substring scan of every column name
75
- for key, value in row.items():
76
- if any(token in key for token in _URL_COLUMN_TOKENS):
77
- url = _extract_first_http(value)
78
- if url:
79
- if _debug:
80
- print(f" [_first_url] hit (token) key={key!r}")
81
- return url
82
-
83
- # Pass 3 β€” last resort: scan every column value for an HTTP URL
84
- for key, value in row.items():
85
- url = _extract_first_http(value)
86
- if url:
87
- if _debug:
88
- print(f" [_first_url] hit (fallback) key={key!r}")
89
- return url
90
-
91
- return ""
92
-
93
-
94
  def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
95
  print(f"[company_loader] Reading CSV: {csv_path}")
96
  companies: List[CompanyRecord] = []
@@ -107,14 +43,11 @@ def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
107
 
108
  # Positional column mapping (0-indexed):
109
  # col 0 → company name
110
- # col 3 → "Direct links to company career/job openings page" (primary URL)
111
- # col 1 → "Pre-Conference interview or registration links" (fallback URL)
112
- name_key = normalized_cols[0] if len(normalized_cols) > 0 else None
113
- direct_url_key = normalized_cols[3] if len(normalized_cols) > 3 else None # col D
114
- preconf_key = normalized_cols[1] if len(normalized_cols) > 1 else None # col B fallback
115
  print(f"[company_loader] name col: index=0 key={name_key!r}")
116
- print(f"[company_loader] url col: index=3 key={direct_url_key!r} (primary)")
117
- print(f"[company_loader] url col: index=1 key={preconf_key!r} (fallback)")
118
 
119
  total_rows = 0
120
  skipped_no_name = 0
@@ -129,17 +62,13 @@ def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
129
  skipped_no_name += 1
130
  continue
131
 
132
- # URL: try column 4 (index 3) first, then column 2 (index 1) as fallback.
133
- careers_url = (
134
- _extract_first_http(row.get(direct_url_key, ""))
135
- or _extract_first_http(row.get(preconf_key, ""))
136
- )
137
  if not careers_url:
138
  skipped_no_url += 1
139
  if skipped_no_url <= 5:
140
- d_val = row.get(direct_url_key, "<missing>") if direct_url_key else "<no col4>"
141
- p_val = row.get(preconf_key, "<missing>") if preconf_key else "<no col2>"
142
- print(f"[company_loader] No URL for '{company}' — col4={d_val!r} col2={p_val!r}")
143
 
144
  companies.append(
145
  CompanyRecord(
 
7
 
8
 
9
  COMPANY_KEYS = ["company", "company list", "name", "employer", "organization"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  def _normalize_headers(row: Dict[str, str]) -> Dict[str, str]:
 
27
  return match.group(0).rstrip(")],.;\"") if match else ""
28
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def _read_companies(csv_path: Path, source: str) -> List[CompanyRecord]:
31
  print(f"[company_loader] Reading CSV: {csv_path}")
32
  companies: List[CompanyRecord] = []
 
43
 
44
  # Positional column mapping (0-indexed):
45
  # col 0 → company name
46
+ # col 3 → Direct links to company career/job openings page (ONLY URL source)
47
+ name_key = normalized_cols[0] if len(normalized_cols) > 0 else None
48
+ opening_page_key = normalized_cols[3] if len(normalized_cols) > 3 else None
 
 
49
  print(f"[company_loader] name col: index=0 key={name_key!r}")
50
+ print(f"[company_loader] url col: index=3 key={opening_page_key!r} (only source)")
 
51
 
52
  total_rows = 0
53
  skipped_no_name = 0
 
62
  skipped_no_name += 1
63
  continue
64
 
65
+ # URL comes ONLY from column 4 (index 3). Ignore all other columns.
66
+ careers_url = _extract_first_http(row.get(opening_page_key, "")) if opening_page_key else ""
 
 
 
67
  if not careers_url:
68
  skipped_no_url += 1
69
  if skipped_no_url <= 5:
70
+ d_val = row.get(opening_page_key, "<missing>") if opening_page_key else "<no col4>"
71
+ print(f"[company_loader] No URL for '{company}' — col4={d_val!r}")
 
72
 
73
  companies.append(
74
  CompanyRecord(