# PhDScout/tests/test_scraper_base.py
# HipFil98's picture
# feat: fix location filtering bugs + add scholarshipdb and nature.com scrapers
# 1579508
"""Tests for BaseScraper static helpers — no HTTP calls needed."""
from datetime import datetime
import pytest
from agent.search.scrapers.base import BaseScraper
class TestParseDate:
    """Checks for ``BaseScraper._parse_date`` on absolute and relative date text."""

    def _p(self, text):
        """Feed *text* to the parser under test and return whatever it yields."""
        parsed = BaseScraper._parse_date(text)
        return parsed

    def _elapsed_since(self, moment):
        """Return the timedelta between the current local time and *moment*."""
        return datetime.now() - moment

    def test_none_returns_none(self):
        """``None`` input is expected to produce ``None``."""
        parsed = self._p(None)
        assert parsed is None

    def test_empty_returns_none(self):
        """An empty string is expected to produce ``None``."""
        parsed = self._p("")
        assert parsed is None

    def test_garbage_returns_none(self):
        """Text that contains no date is expected to produce ``None``."""
        parsed = self._p("no date here")
        assert parsed is None

    def test_iso_date(self):
        """``YYYY-MM-DD`` is expected to map onto that calendar day."""
        expected = datetime(2025, 3, 15)
        parsed = self._p("2025-03-15")
        assert parsed == expected

    def test_iso_datetime(self):
        """An ISO timestamp is expected to compare equal to midnight of its date."""
        expected = datetime(2025, 11, 1)
        parsed = self._p("2025-11-01T10:30:00")
        assert parsed == expected

    def test_dd_month_yyyy_full(self):
        """Day, full month name, year."""
        expected = datetime(2025, 3, 15)
        parsed = self._p("15 March 2025")
        assert parsed == expected

    def test_dd_month_yyyy_abbrev(self):
        """Day, abbreviated month name, year."""
        expected = datetime(2026, 4, 3)
        parsed = self._p("3 Apr 2026")
        assert parsed == expected

    def test_month_dd_yyyy(self):
        """Month name, day, comma, year."""
        expected = datetime(2025, 3, 15)
        parsed = self._p("March 15, 2025")
        assert parsed == expected

    def test_month_dd_yyyy_no_comma(self):
        """Month name, day, year without the separating comma."""
        expected = datetime(2026, 1, 5)
        parsed = self._p("January 5 2026")
        assert parsed == expected

    def test_dd_mm_yyyy_slash(self):
        """Slash-separated numeric date is expected to be read day-first."""
        expected = datetime(2025, 3, 15)
        parsed = self._p("15/03/2025")
        assert parsed == expected

    def test_strips_posted_on_prefix(self):
        """A leading ``Posted on:`` label must not prevent parsing."""
        expected = datetime(2025, 3, 15)
        parsed = self._p("Posted on: 15 March 2025")
        assert parsed == expected

    def test_strips_closes_prefix(self):
        """A leading ``Closes`` label must not prevent parsing."""
        expected = datetime(2026, 4, 3)
        parsed = self._p("Closes 3 Apr 2026")
        assert parsed == expected

    def test_strips_deadline_prefix(self):
        """A leading ``Deadline:`` label must not prevent parsing."""
        expected = datetime(2026, 6, 1)
        parsed = self._p("Deadline: 01/06/2026")
        assert parsed == expected

    def test_case_insensitive_prefix(self):
        """An upper-cased label is expected to be handled like the mixed-case one."""
        expected = datetime(2026, 1, 10)
        parsed = self._p("POSTED ON: 10 January 2026")
        assert parsed == expected

    # -- relative dates (scholarshipdb / nature.com format) ------------------

    def test_relative_hours_ago(self):
        """``about 3 hours ago`` should land no more than four hours in the past."""
        parsed = self._p("about 3 hours ago")
        assert parsed is not None
        age_seconds = self._elapsed_since(parsed).total_seconds()
        assert 0 <= age_seconds < 4 * 3600  # within 4 h of now

    def test_relative_days_ago(self):
        """``5 days ago`` should land 4 to 6 whole days in the past."""
        parsed = self._p("5 days ago")
        assert parsed is not None
        age_days = self._elapsed_since(parsed).days
        assert 4 <= age_days <= 6

    def test_relative_weeks_ago(self):
        """``2 weeks ago`` should land 13 to 15 whole days in the past."""
        parsed = self._p("2 weeks ago")
        assert parsed is not None
        age_days = self._elapsed_since(parsed).days
        assert 13 <= age_days <= 15

    def test_relative_months_ago(self):
        """``about 1 month ago`` should land 28 to 32 whole days in the past."""
        parsed = self._p("about 1 month ago")
        assert parsed is not None
        age_days = self._elapsed_since(parsed).days
        assert 28 <= age_days <= 32

    def test_relative_years_ago(self):
        """``1 year ago`` should land 363 to 367 whole days in the past."""
        parsed = self._p("1 year ago")
        assert parsed is not None
        age_days = self._elapsed_since(parsed).days
        assert 363 <= age_days <= 367

    def test_relative_minutes_ago(self):
        """``10 minutes ago`` should land less than eleven minutes in the past."""
        parsed = self._p("10 minutes ago")
        assert parsed is not None
        age_seconds = self._elapsed_since(parsed).total_seconds()
        assert 0 <= age_seconds < 11 * 60
class TestDetectType:
    """Checks for the label ``BaseScraper._detect_type`` assigns to a posting."""

    def _d(self, title, desc=""):
        """Classify a posting from its *title* and optional *desc* text."""
        label = BaseScraper._detect_type(title, desc)
        return label

    def test_phd_in_title(self):
        """A title mentioning PhD is expected to be labelled ``phd``."""
        label = self._d("PhD Position in Machine Learning")
        assert label == "phd"

    def test_postdoc_in_title(self):
        """A title mentioning Postdoctoral is expected to be labelled ``postdoc``."""
        label = self._d("Postdoctoral Researcher in NLP")
        assert label == "postdoc"

    def test_fellowship_in_desc(self):
        """The description alone is expected to be enough to yield ``fellowship``."""
        title = "Research Position"
        description = "Marie Curie fellowship available"
        label = self._d(title, description)
        assert label == "fellowship"

    def test_predoctoral_in_title(self):
        """A title mentioning Predoctoral is expected to be labelled ``predoctoral``."""
        label = self._d("Predoctoral Researcher")
        assert label == "predoctoral"

    def test_research_staff(self):
        """A Research Scientist title is expected to be labelled ``research_staff``."""
        label = self._d("Research Scientist at DeepMind")
        assert label == "research_staff"

    def test_unknown_returns_other(self):
        """Text matching none of the cases above is expected to yield ``other``."""
        title = "Open Position"
        description = "Some vague description"
        label = self._d(title, description)
        assert label == "other"
class TestExtractEmail:
    """Checks for ``BaseScraper._extract_email`` on free-form text."""

    def _e(self, text):
        """Run *text* through the e-mail extractor and return its result."""
        found = BaseScraper._extract_email(text)
        return found

    def test_extracts_simple_email(self):
        """An address embedded in a sentence is expected to be returned as-is."""
        found = self._e("Contact us at jobs@mit.edu for details")
        assert found == "jobs@mit.edu"

    def test_no_email_returns_none(self):
        """Text without any address is expected to yield ``None``."""
        found = self._e("No contact information provided")
        assert found is None

    def test_extracts_first_of_multiple(self):
        """With two addresses present, the earlier one is the expected result."""
        found = self._e("Email a@x.com or b@y.org")
        assert found == "a@x.com"