Spaces:
Running
Running
"""Unit tests for the DB connection and upsert helpers (using in-memory SQLite)."""
| import sys | |
| import sqlite3 | |
| from pathlib import Path | |
| from datetime import date | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| import pytest | |
| from unittest.mock import MagicMock | |
| from db.connection import ( | |
| init_db, get_conn, upsert_job, insert_skills, | |
| _is_cross_site_duplicate, _titles_are_duplicate, | |
| deactivate_unseen_jobs, deactivate_expired_jobs, | |
| ) | |
| from crawler.run import validate_job_links | |
# In-memory DB path for tests (":memory:" is sqlite3's name for a RAM-only database).
_TEST_DB = Path(":memory:")
def _make_in_memory_conn() -> sqlite3.Connection:
    """Create an in-memory SQLite connection and initialise the schema.

    The schema script is read from ``db/schema.sql``, resolved relative to
    the parent of this test file's directory.

    Returns:
        A fresh ``sqlite3.Connection`` to ``":memory:"`` with
        ``sqlite3.Row`` as row factory, ``PRAGMA foreign_keys = ON``
        executed, and the schema script applied.
    """
    schema_path = Path(__file__).parent.parent / "db" / "schema.sql"
    schema = schema_path.read_text(encoding="utf-8")

    conn = sqlite3.connect(":memory:")
    conn.row_factory = sqlite3.Row  # tests read columns by name, e.g. row["is_active"]
    conn.execute("PRAGMA foreign_keys = ON")
    conn.executescript(schema)
    return conn
def _sample_job(**overrides) -> dict:
    """Build a job-posting dict for tests.

    Args:
        **overrides: Field values that replace (or extend) the defaults
            below, e.g. ``_sample_job(source_id="1", deadline_date=None)``.

    Returns:
        A new dict on every call, so callers may mutate it freely.
    """
    defaults = {
        "source_site": "wanted",
        "source_id": "12345",
        "url": "https://wanted.co.kr/wd/12345",
        "title": "데이터 엔지니어",  # "Data Engineer"
        "company_name": "테스트컴퍼니",  # "Test Company"
        "job_category": "데이터 엔지니어",
        "industry": "IT",
        "employment_type": "정규직",  # "full-time / permanent"
        "location": "서울",  # "Seoul"
        "experience_min": 3,
        "experience_max": 7,
        "salary_min": 5000,
        "salary_max": 8000,
        "posted_date": "2025-01-01",
        "deadline_date": "2025-03-31",
    }
    # Later keys win, so overrides take precedence over the defaults.
    return {**defaults, **overrides}
# ── upsert_job ────────────────────────────────────────────────────


class TestUpsertJob:
    """upsert_job: insert vs. update, uniqueness key and reactivation."""

    def test_insert_new_job(self):
        """A previously unseen posting is reported as "inserted" with a positive id."""
        conn = _make_in_memory_conn()
        job_id, action = upsert_job(conn, _sample_job())
        assert action == "inserted"
        assert job_id > 0

    def test_update_existing_job(self):
        """Upserting the same source_site/source_id again is reported as "updated"."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job())
        _, action = upsert_job(conn, _sample_job(title="변경된 제목"))  # "changed title"
        assert action == "updated"

    def test_unique_key_is_source_and_id(self):
        """The same source_id on two different sites yields two rows."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
        upsert_job(conn, _sample_job(source_site="saramin", source_id="1"))
        count = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
        assert count == 2  # different sites mean separate postings

    def test_updated_job_is_active(self):
        """An upsert on a deactivated posting sets is_active back to 1."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job())
        # Manually set is_active=0, then upsert -> expected to be restored to 1.
        conn.execute("UPDATE jobs SET is_active=0 WHERE source_site='wanted' AND source_id='12345'")
        upsert_job(conn, _sample_job(title="업데이트됨"))  # "updated"
        row = conn.execute("SELECT is_active FROM jobs WHERE source_site='wanted'").fetchone()
        assert row["is_active"] == 1
# ── insert_skills ─────────────────────────────────────────────────


class TestInsertSkills:
    """insert_skills: rows written to job_skills for a given job."""

    def test_insert_skills(self):
        """Every skill passed in is stored for the job."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job())
        insert_skills(conn, job_id, ["Python", "SQL", "Apache Spark"])
        conn.commit()
        rows = conn.execute("SELECT skill_name FROM job_skills WHERE job_id=?", (job_id,)).fetchall()
        names = {r["skill_name"] for r in rows}
        assert names == {"Python", "SQL", "Apache Spark"}

    def test_preserves_canonical_case(self):
        """Skills keep their canonical spelling instead of being lower-cased."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job())
        insert_skills(conn, job_id, ["Python", "PostgreSQL", "Apache Spark"])
        conn.commit()
        rows = conn.execute("SELECT skill_name FROM job_skills WHERE job_id=?", (job_id,)).fetchall()
        names = {r["skill_name"] for r in rows}
        # Must be stored as the canonical value, not a lower-cased variant.
        assert "Python" in names
        assert "python" not in names
        assert "PostgreSQL" in names
        assert "Apache Spark" in names

    def test_no_duplicate_skills(self):
        """Repeating a skill in the input produces a single row."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job())
        insert_skills(conn, job_id, ["Python", "Python", "Python"])
        conn.commit()
        count = conn.execute(
            "SELECT COUNT(*) FROM job_skills WHERE job_id=? AND skill_name='Python'",
            (job_id,),
        ).fetchone()[0]
        assert count == 1

    def test_empty_skills_no_error(self):
        """An empty skill list is accepted and writes nothing."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job())
        insert_skills(conn, job_id, [])
        conn.commit()
        count = conn.execute("SELECT COUNT(*) FROM job_skills WHERE job_id=?", (job_id,)).fetchone()[0]
        assert count == 0
# ── _is_cross_site_duplicate ──────────────────────────────────────


class TestCrossSiteDuplicate:
    """_is_cross_site_duplicate: same posting appearing on another site."""

    def test_detects_duplicate(self):
        """Same company and title already stored from another site -> duplicate."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
        # Same company + title on a different site counts as a duplicate.
        is_dup = _is_cross_site_duplicate(
            conn,
            _sample_job(source_site="saramin", source_id="99"),
        )
        assert is_dup is True

    def test_not_duplicate_different_title(self):
        """A different title on another site is not a duplicate."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
        is_dup = _is_cross_site_duplicate(
            conn,
            # "a completely different posting"
            _sample_job(source_site="saramin", source_id="99", title="전혀 다른 공고"),
        )
        assert is_dup is False

    def test_not_duplicate_same_site(self):
        """Postings from the same site are never cross-site duplicates."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
        is_dup = _is_cross_site_duplicate(
            conn,
            _sample_job(source_site="wanted", source_id="2"),
        )
        assert is_dup is False
# ── _titles_are_duplicate (fuzzy matching) ────────────────────────


class TestTitlesAreDuplicate:
    """_titles_are_duplicate: fuzzy comparison of two posting titles.

    Literals: "데이터 엔지니어" = data engineer, "데이터 분석가" = data analyst,
    "시니어" = senior, "경력 3년↑" = 3+ years of experience, "신입" = new graduate.
    """

    def test_exact_same(self):
        """Identical titles are duplicates."""
        assert _titles_are_duplicate("데이터 엔지니어", "데이터 엔지니어") is True

    def test_space_difference(self):
        """Spacing differences are treated as the same posting."""
        assert _titles_are_duplicate("데이터 엔지니어", "데이터엔지니어") is True

    def test_bracket_suffix_ignored(self):
        """Parenthesised or bracketed annotations are ignored when comparing."""
        assert _titles_are_duplicate("데이터 엔지니어 (경력 3년↑)", "데이터 엔지니어") is True
        assert _titles_are_duplicate("데이터 분석가 [신입]", "데이터 분석가") is True

    def test_senior_prefix(self):
        """Seniority prefix: the shorter title is contained in the longer one."""
        assert _titles_are_duplicate("시니어 데이터 엔지니어", "데이터 엔지니어") is True
        assert _titles_are_duplicate("Senior 데이터 엔지니어", "데이터 엔지니어") is True

    def test_different_job_type(self):
        """Completely different job families are not duplicates."""
        assert _titles_are_duplicate("데이터 엔지니어", "데이터 분석가") is False

    def test_ml_vs_data_engineer(self):
        """ML engineer vs. data engineer are distinct roles."""
        assert _titles_are_duplicate("ML 엔지니어", "데이터 엔지니어") is False

    def test_empty_strings(self):
        """An empty title never matches, whichever side it is on."""
        assert _titles_are_duplicate("", "데이터 엔지니어") is False
        assert _titles_are_duplicate("데이터 엔지니어", "") is False
class TestFuzzyCrossSiteDuplicate:
    """_is_cross_site_duplicate combined with fuzzy title matching."""

    def test_fuzzy_detects_space_diff(self):
        """Detected as a duplicate even when sites space the title differently."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1", title="데이터 엔지니어"))
        is_dup = _is_cross_site_duplicate(
            conn,
            _sample_job(source_site="saramin", source_id="99", title="데이터엔지니어"),
        )
        assert is_dup is True

    def test_fuzzy_detects_bracket_diff(self):
        """An experience note in parentheses does not hide the duplicate."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1", title="데이터 엔지니어"))
        is_dup = _is_cross_site_duplicate(
            conn,
            _sample_job(source_site="jobkorea", source_id="99", title="데이터 엔지니어 (경력 5년↑)"),
        )
        assert is_dup is True

    def test_fuzzy_does_not_false_positive(self):
        """A different job family is not reported as a duplicate."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1", title="데이터 엔지니어"))
        is_dup = _is_cross_site_duplicate(
            conn,
            _sample_job(source_site="saramin", source_id="99", title="데이터 분석가"),
        )
        assert is_dup is False
# ── validate_job_links ────────────────────────────────────────────


def _mock_head(status_code: int) -> MagicMock:
    """Return a mock session imitating ``requests.Session.head()``.

    Args:
        status_code: Value exposed as ``status_code`` on the mocked response.

    Returns:
        A ``MagicMock`` whose ``head(...)`` returns, for any arguments, one
        response mock carrying the given ``status_code``.
    """
    response = MagicMock()
    response.status_code = status_code

    session = MagicMock()
    session.head.return_value = response
    return session
class TestValidateJobLinks:
    """validate_job_links: deactivating postings based on a HEAD request."""

    def test_deactivates_404_link(self):
        """HTTP 404 response -> is_active=0."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job(source_id="10", deadline_date=None))
        conn.commit()
        result = validate_job_links(conn, session=_mock_head(404), delay=0)
        conn.commit()
        assert result["deactivated"] == 1
        row = conn.execute("SELECT is_active FROM jobs WHERE id=?", (job_id,)).fetchone()
        assert row["is_active"] == 0

    def test_keeps_200_active(self):
        """HTTP 200 response -> is_active is kept."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job(source_id="11", deadline_date=None))
        conn.commit()
        result = validate_job_links(conn, session=_mock_head(200), delay=0)
        conn.commit()
        assert result["deactivated"] == 0
        row = conn.execute("SELECT is_active FROM jobs WHERE id=?", (job_id,)).fetchone()
        assert row["is_active"] == 1

    def test_skips_jobs_with_deadline(self):
        """Postings with a deadline are excluded from URL checks.

        Deadline-based deactivation is a separate responsibility.
        """
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_id="12", deadline_date="2099-12-31"))
        conn.commit()
        result = validate_job_links(conn, session=_mock_head(404), delay=0)
        assert result["checked"] == 0

    def test_network_error_ignored(self):
        """On a network error the posting is skipped (conservative approach)."""
        conn = _make_in_memory_conn()
        job_id, _ = upsert_job(conn, _sample_job(source_id="13", deadline_date=None))
        conn.commit()
        session = MagicMock()
        # Every HEAD call raises, simulating an unreachable host.
        session.head.side_effect = Exception("connection timeout")
        result = validate_job_links(conn, session=session, delay=0)
        conn.commit()
        assert result["deactivated"] == 0
        row = conn.execute("SELECT is_active FROM jobs WHERE id=?", (job_id,)).fetchone()
        assert row["is_active"] == 1

    def test_returns_correct_keys(self):
        """The result exposes "checked" and "deactivated", even on an empty table."""
        conn = _make_in_memory_conn()
        result = validate_job_links(conn, session=_mock_head(200), delay=0)
        assert "checked" in result
        assert "deactivated" in result
# ── deactivate_unseen_jobs ────────────────────────────────────────


class TestDeactivateUnseenJobs:
    """deactivate_unseen_jobs: postings not touched by the current crawl."""

    def test_deactivates_job_not_seen_in_crawl(self):
        """updated_at before crawl start -> posting not seen this crawl is deactivated."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
        conn.execute("UPDATE jobs SET updated_at='2020-01-01 00:00:00' WHERE source_id='1'")
        conn.commit()
        count = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
        conn.commit()
        assert count == 1
        row = conn.execute("SELECT is_active FROM jobs WHERE source_id='1'").fetchone()
        assert row["is_active"] == 0

    def test_keeps_job_seen_in_crawl(self):
        """updated_at after crawl start -> posting stays active."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="2"))
        conn.commit()
        # crawl_start lies in the past, so the current updated_at (~now) is later.
        count = deactivate_unseen_jobs(conn, "wanted", "2020-01-01 00:00:00")
        conn.commit()
        assert count == 0
        row = conn.execute("SELECT is_active FROM jobs WHERE source_id='2'").fetchone()
        assert row["is_active"] == 1

    def test_only_affects_matching_source(self):
        """Postings from another source_site are left untouched."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="3"))
        upsert_job(conn, _sample_job(source_site="saramin", source_id="4"))
        conn.execute("UPDATE jobs SET updated_at='2020-01-01 00:00:00'")
        conn.commit()
        count = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
        conn.commit()
        assert count == 1
        assert conn.execute("SELECT is_active FROM jobs WHERE source_id='3'").fetchone()["is_active"] == 0
        assert conn.execute("SELECT is_active FROM jobs WHERE source_id='4'").fetchone()["is_active"] == 1

    def test_already_inactive_not_counted(self):
        """Postings that are already inactive are excluded from the count."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_site="wanted", source_id="5"))
        conn.execute("UPDATE jobs SET is_active=0, updated_at='2020-01-01' WHERE source_id='5'")
        conn.commit()
        count = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
        assert count == 0

    def test_returns_int(self):
        """The return value is an int, even when the table is empty."""
        conn = _make_in_memory_conn()
        result = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
        assert isinstance(result, int)
# ── deactivate_expired_jobs ───────────────────────────────────────


class TestDeactivateExpiredJobs:
    """deactivate_expired_jobs: postings whose deadline has passed."""

    def test_deactivates_past_deadline(self):
        """A posting whose deadline has passed is deactivated."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_id="10", deadline_date="2020-01-01"))
        conn.commit()
        count = deactivate_expired_jobs(conn)
        conn.commit()
        # Exactly one active, expired row exists in this fresh database, so the
        # count is pinned (the sibling tests also assert exact counts).
        assert count == 1
        row = conn.execute("SELECT is_active FROM jobs WHERE source_id='10'").fetchone()
        assert row["is_active"] == 0

    def test_keeps_future_deadline_active(self):
        """A posting whose deadline is still ahead stays active."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_id="11", deadline_date="2099-12-31"))
        conn.commit()
        deactivate_expired_jobs(conn)
        conn.commit()
        row = conn.execute("SELECT is_active FROM jobs WHERE source_id='11'").fetchone()
        assert row["is_active"] == 1

    def test_already_inactive_not_counted(self):
        """Postings that are already inactive are not included in the count."""
        conn = _make_in_memory_conn()
        upsert_job(conn, _sample_job(source_id="12", deadline_date="2020-01-01"))
        conn.execute("UPDATE jobs SET is_active=0 WHERE source_id='12'")
        conn.commit()
        count = deactivate_expired_jobs(conn)
        assert count == 0

    def test_returns_int(self):
        """The return value is an int, even when the table is empty."""
        conn = _make_in_memory_conn()
        result = deactivate_expired_jobs(conn)
        assert isinstance(result, int)