Spaces:

HipFil98
/

PhDScout

Sleeping

HipFil98 Claude Sonnet 4.6 committed on Mar 18

Commit

cd035f4

1 Parent(s): c58f98c

refactor: remove dead code, extract shared job helpers

agent/utils.py:
- Add job_institution(job) — eliminates the repeated job.get("institution", job.get("company", ...))
pattern; the "company" fallback was dead (no scraper ever sets it)
- Add job_description(job, max_chars) — eliminates repeated [:3000] truncation pattern

agent/job_searcher.py:
- Remove _TYPE_QUERY dict (defined but never referenced anywhere)
- Move _MLSCI_NON_COUNTRY from inside the _search_mlscientist function body to a module-level frozenset
- Fix module docstring (remove stale FindAPhD/jobs.ac.uk-only references)

agent/job_matcher.py, cover_letter.py, cv_tailor.py:
- Use job_institution() and job_description() from utils

app.py:
- Use job_institution() in all 7 display/logic locations

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (6) hide show

agent/cover_letter.py +4 -3
agent/cv_tailor.py +3 -3
agent/job_matcher.py +3 -3
agent/job_searcher.py +9 -17
agent/utils.py +10 -0
app.py +9 -7

agent/cover_letter.py CHANGED Viewed

@@ -6,6 +6,7 @@ import re
 from typing import Any
 from agent.llm_client import LLMClient
 _ITALIAN_KEYWORDS = {
@@ -78,7 +79,7 @@ class CoverLetterWriter:
         """
         language = self._detect_language(job)
         title = job.get("title", "Unknown Position")
-        institution = job.get("institution", job.get("company", "Unknown Institution"))
         prompt = _PROMPT.format(
             profile=profile_text,
@@ -86,7 +87,7 @@ class CoverLetterWriter:
             institution=institution,
             location=job.get("location", "Unknown"),
             pos_type=job.get("type", "research"),
-            description=(job.get("description") or "No description provided.")[:3000],
             language=language,
             regen_note=_REGEN_NOTE if regenerate else "",
         )
@@ -108,7 +109,7 @@ class CoverLetterWriter:
             job.get("title", ""),
             job.get("description", ""),
             job.get("location", ""),
-            job.get("institution", job.get("company", "")),
         ]).lower()
         hits = sum(1 for kw in _ITALIAN_KEYWORDS if kw.lower() in text)
         return "Italian" if hits >= 2 else "English"

 from typing import Any
 from agent.llm_client import LLMClient
+from agent.utils import job_institution, job_description
 _ITALIAN_KEYWORDS = {
         """
         language = self._detect_language(job)
         title = job.get("title", "Unknown Position")
+        institution = job_institution(job) or "Unknown Institution"
         prompt = _PROMPT.format(
             profile=profile_text,
             institution=institution,
             location=job.get("location", "Unknown"),
             pos_type=job.get("type", "research"),
+            description=job_description(job),
             language=language,
             regen_note=_REGEN_NOTE if regenerate else "",
         )
             job.get("title", ""),
             job.get("description", ""),
             job.get("location", ""),
+            job_institution(job),
         ]).lower()
         hits = sum(1 for kw in _ITALIAN_KEYWORDS if kw.lower() in text)
         return "Italian" if hits >= 2 else "English"

agent/cv_tailor.py CHANGED Viewed

@@ -5,7 +5,7 @@ from __future__ import annotations
 from typing import Any, TypedDict
 from agent.llm_client import LLMClient
-from agent.utils import parse_json
 class TailoringHints(TypedDict, total=False):
@@ -95,9 +95,9 @@ class CVTailor:
         """Generate actionable tailoring hints for a specific position."""
         prompt = _PROMPT.format(
             title=job.get("title", "Unknown"),
-            institution=job.get("institution", job.get("company", "Unknown")),
             pos_type=job.get("type", "unknown"),
-            description=(job.get("description") or "No description provided.")[:3000],
             profile=profile_text,
         )

 from typing import Any, TypedDict
 from agent.llm_client import LLMClient
+from agent.utils import parse_json, job_institution, job_description
 class TailoringHints(TypedDict, total=False):
         """Generate actionable tailoring hints for a specific position."""
         prompt = _PROMPT.format(
             title=job.get("title", "Unknown"),
+            institution=job_institution(job) or "Unknown",
             pos_type=job.get("type", "unknown"),
+            description=job_description(job),
             profile=profile_text,
         )

agent/job_matcher.py CHANGED Viewed

@@ -5,7 +5,7 @@ from __future__ import annotations
 from typing import Any, TypedDict
 from agent.llm_client import LLMClient
-from agent.utils import parse_json
 class MatchResult(TypedDict, total=False):
@@ -83,10 +83,10 @@ class JobMatcher:
         prompt = _PROMPT.format(
             profile=profile_text,
             title=job.get("title", "Unknown"),
-            institution=job.get("institution", job.get("company", "Unknown")),
             location=job.get("location", "Unknown"),
             pos_type=job.get("type", "unknown"),
-            description=(job.get("description") or "No description provided.")[:3000],
         )
         try:

 from typing import Any, TypedDict
 from agent.llm_client import LLMClient
+from agent.utils import parse_json, job_institution, job_description
 class MatchResult(TypedDict, total=False):
         prompt = _PROMPT.format(
             profile=profile_text,
             title=job.get("title", "Unknown"),
+            institution=job_institution(job) or "Unknown",
             location=job.get("location", "Unknown"),
             pos_type=job.get("type", "unknown"),
+            description=job_description(job),
         )
         try:

agent/job_searcher.py CHANGED Viewed

@@ -1,8 +1,10 @@
 """Job searcher: finds PhD / postdoc / research positions from free public sources.
 Sources:
-- jobs.ac.uk (UK academic jobs) — HTML scraping with facet filters; only queried for UK/worldwide
-- FindAPhD (worldwide PhD board) — HTML scraping; handles location filtering globally
 All scrapers are wrapped in try/except — if one source is down the rest continue.
 """
@@ -57,15 +59,6 @@ _TYPE_KEYWORDS: dict[str, list[str]] = {
     ],
 }
-# Position type → keywords appended to search query for sites without native facets
-_TYPE_QUERY: dict[str, str] = {
-    "predoctoral": "predoctoral OR \"early-stage researcher\" OR \"research trainee\"",
-    "phd": "PhD",
-    "postdoc": "postdoc OR \"research associate\" OR \"research fellow\"",
-    "fellowship": "fellowship OR scholarship",
-    "research_staff": "researcher OR lecturer OR professor",
-}
 _HEADERS = {
     "User-Agent": (
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
@@ -119,6 +112,11 @@ _MLSCI_TYPE_SLUG: dict[str, str] = {
     "any":            "jobs",
 }
 # mlscientist.com country slug mapping (lowercase location → slug)
 _MLSCI_COUNTRY_SLUG: dict[str, str] = {
     "uk": "united-kingdom",
@@ -166,12 +164,6 @@ def _search_mlscientist(field: str, location: str, position_type: str) -> list[d
     listings: list[dict] = []
     seen_urls: set[str] = set()
-    # Non-type category slugs to ignore when extracting country from CSS classes
-    _MLSCI_NON_COUNTRY = {
-        "jobs", "phd-positions", "postdoc-positions", "featured",
-        "conference-calls", "mlnews",
-    }
     for url in urls_to_try:
         try:
             resp = requests.get(url, headers=_HEADERS, timeout=15)

 """Job searcher: finds PhD / postdoc / research positions from free public sources.
 Sources:
+- Euraxess (euraxess.ec.europa.eu) — EU/worldwide research portal, country-filtered
+- mlscientist.com — ML/AI academic positions, WordPress category + search
+- jobs.ac.uk — UK academic jobs (queried only for UK/worldwide locations)
+- DuckDuckGo web search — targeted queries for open calls
 All scrapers are wrapped in try/except — if one source is down the rest continue.
 """
     ],
 }
 _HEADERS = {
     "User-Agent": (
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
     "any":            "jobs",
 }
+_MLSCI_NON_COUNTRY: frozenset[str] = frozenset({
+    "jobs", "phd-positions", "postdoc-positions", "featured",
+    "conference-calls", "mlnews",
+})
 # mlscientist.com country slug mapping (lowercase location → slug)
 _MLSCI_COUNTRY_SLUG: dict[str, str] = {
     "uk": "united-kingdom",
     listings: list[dict] = []
     seen_urls: set[str] = set()
     for url in urls_to_try:
         try:
             resp = requests.get(url, headers=_HEADERS, timeout=15)

agent/utils.py CHANGED Viewed

@@ -30,3 +30,13 @@ def parse_json(raw: str) -> dict[str, Any] | None:
             except json.JSONDecodeError:
                 pass
     return None

             except json.JSONDecodeError:
                 pass
     return None
+def job_institution(job: dict) -> str:
+    """Return the job's institution name, empty string if absent."""
+    return job.get("institution") or ""
+def job_description(job: dict, max_chars: int = 3000) -> str:
+    """Return the job description truncated to max_chars."""
+    return (job.get("description") or "No description provided.")[:max_chars]

app.py CHANGED Viewed

@@ -22,6 +22,8 @@ from typing import Any
 import gradio as gr
 # ---------------------------------------------------------------------------
 # Formatting helpers  (pure functions — no LLM dependency)
@@ -59,7 +61,7 @@ def _fmt_profile(profile: dict) -> str:
 def _fmt_jobs_table(jobs: list) -> list[list]:
     return [
-        [i, j.get("title", ""), j.get("institution", j.get("company", "")),
          j.get("location", ""), j.get("type", ""), j.get("source", ""),
          j.get("deadline") or "—"]
         for i, j in enumerate(jobs, 1)
@@ -74,7 +76,7 @@ def _fmt_scored_table(jobs: list) -> list[list]:
         why = m.get("why_good_fit") or ""
         rows.append([
             i, m.get("match_score", 0), job.get("title", ""),
-            job.get("institution", job.get("company", "")), job.get("type", ""),
             icons.get(m.get("recommendation", ""), ""),
             why[:60] + "..." if len(why) > 60 else why,
         ])
@@ -89,7 +91,7 @@ def _fmt_job_details(job: dict, match: dict) -> str:
     url = job.get("url", "")
     lines = [
         f"## {job.get('title', 'Unknown')}",
-        f"**{job.get('institution', job.get('company', 'Unknown'))}** — {job.get('location', '')}",
         "",
         f"**Type:** {job.get('type', '')}  |  **Deadline:** {job.get('deadline') or 'N/A'}",
     ]
@@ -248,7 +250,7 @@ def load_position(
         hints, cover_letter = agent.prepare_application(job, profile_text)
         progress(1.0, desc="Done!")
-        status = f"✅ Loaded: **{job.get('title', '')}** @ {job.get('institution', job.get('company', ''))}"
         return _fmt_job_details(job, match), _fmt_hints(hints), cover_letter, status, idx
     except Exception as exc:
@@ -285,7 +287,7 @@ def approve_position(
     if current_idx < 0 or not scored_jobs or current_idx >= len(scored_jobs):
         return approved, "❌ No position loaded."
     job = scored_jobs[current_idx]
-    title, institution = job.get("title", "Unknown"), job.get("institution", job.get("company", "Unknown"))
     if any(a["job"].get("title") == title and a["job"].get("institution") == institution for a in approved):
         return approved, f"⚠️ **{title}** @ {institution} already approved."
     new_approved = list(approved) + [{
@@ -299,7 +301,7 @@ def skip_position(current_idx: int, scored_jobs: list) -> str:
     if current_idx < 0 or not scored_jobs or current_idx >= len(scored_jobs):
         return "⏭ Skipped."
     job = scored_jobs[current_idx]
-    return f"⏭ Skipped: **{job.get('title', '')}** @ {job.get('institution', job.get('company', ''))}"
 def approved_display(approved: list) -> str:
@@ -330,7 +332,7 @@ def export_zip(approved: list) -> tuple:
             for entry in approved:
                 job = entry.get("job") or {}
                 title = job.get("title", "Unknown")
-                institution = job.get("institution", job.get("company", "Unknown"))
                 safe = (
                     f"{institution}_{title}"
                     .replace(" ", "_").replace("/", "-").replace("\\", "-")

 import gradio as gr
+from agent.utils import job_institution
 # ---------------------------------------------------------------------------
 # Formatting helpers  (pure functions — no LLM dependency)
 def _fmt_jobs_table(jobs: list) -> list[list]:
     return [
+        [i, j.get("title", ""), job_institution(j),
          j.get("location", ""), j.get("type", ""), j.get("source", ""),
          j.get("deadline") or "—"]
         for i, j in enumerate(jobs, 1)
         why = m.get("why_good_fit") or ""
         rows.append([
             i, m.get("match_score", 0), job.get("title", ""),
+            job_institution(job), job.get("type", ""),
             icons.get(m.get("recommendation", ""), ""),
             why[:60] + "..." if len(why) > 60 else why,
         ])
     url = job.get("url", "")
     lines = [
         f"## {job.get('title', 'Unknown')}",
+        f"**{job_institution(job) or 'Unknown'}** — {job.get('location', '')}",
         "",
         f"**Type:** {job.get('type', '')}  |  **Deadline:** {job.get('deadline') or 'N/A'}",
     ]
         hints, cover_letter = agent.prepare_application(job, profile_text)
         progress(1.0, desc="Done!")
+        status = f"✅ Loaded: **{job.get('title', '')}** @ {job_institution(job)}"
         return _fmt_job_details(job, match), _fmt_hints(hints), cover_letter, status, idx
     except Exception as exc:
     if current_idx < 0 or not scored_jobs or current_idx >= len(scored_jobs):
         return approved, "❌ No position loaded."
     job = scored_jobs[current_idx]
+    title, institution = job.get("title", "Unknown"), job_institution(job) or "Unknown"
     if any(a["job"].get("title") == title and a["job"].get("institution") == institution for a in approved):
         return approved, f"⚠️ **{title}** @ {institution} already approved."
     new_approved = list(approved) + [{
     if current_idx < 0 or not scored_jobs or current_idx >= len(scored_jobs):
         return "⏭ Skipped."
     job = scored_jobs[current_idx]
+    return f"⏭ Skipped: **{job.get('title', '')}** @ {job_institution(job)}"
 def approved_display(approved: list) -> str:
             for entry in approved:
                 job = entry.get("job") or {}
                 title = job.get("title", "Unknown")
+                institution = job_institution(job) or "Unknown"
                 safe = (
                     f"{institution}_{title}"
                     .replace(" ", "_").replace("/", "-").replace("\\", "-")