jobsonar / tests /test_db.py
MiniMing
feat: 당일 크둀 미발견 곡고 μ¦‰μ‹œ λΉ„ν™œμ„±ν™” (mark-and-sweep)
cf3fcde
"""DB μ—°κ²° 및 upsert λ‹¨μœ„ ν…ŒμŠ€νŠΈ (인메λͺ¨λ¦¬ SQLite μ‚¬μš©)."""
import sys
import sqlite3
from pathlib import Path
from datetime import date
sys.path.insert(0, str(Path(__file__).parent.parent))
import pytest
from unittest.mock import MagicMock
from db.connection import (
init_db, get_conn, upsert_job, insert_skills,
_is_cross_site_duplicate, _titles_are_duplicate,
deactivate_unseen_jobs, deactivate_expired_jobs,
)
from crawler.run import validate_job_links
# ν…ŒμŠ€νŠΈμš© 인메λͺ¨λ¦¬ DB 경둜
_TEST_DB = Path(":memory:")
def _make_in_memory_conn():
"""인메λͺ¨λ¦¬ SQLite 컀λ„₯μ…˜ 생성 ν›„ μŠ€ν‚€λ§ˆ μ΄ˆκΈ°ν™”."""
schema = (Path(__file__).parent.parent / "db" / "schema.sql").read_text(encoding="utf-8")
conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys = ON")
conn.executescript(schema)
return conn
def _sample_job(**overrides) -> dict:
base = {
"source_site": "wanted",
"source_id": "12345",
"url": "https://wanted.co.kr/wd/12345",
"title": "데이터 μ—”μ§€λ‹ˆμ–΄",
"company_name": "ν…ŒμŠ€νŠΈμ»΄νΌλ‹ˆ",
"job_category": "데이터 μ—”μ§€λ‹ˆμ–΄",
"industry": "IT",
"employment_type": "μ •κ·œμ§",
"location": "μ„œμšΈ",
"experience_min": 3,
"experience_max": 7,
"salary_min": 5000,
"salary_max": 8000,
"posted_date": "2025-01-01",
"deadline_date": "2025-03-31",
}
base.update(overrides)
return base
# ── upsert_job ────────────────────────────────────────────────────
class TestUpsertJob:
def test_insert_new_job(self):
conn = _make_in_memory_conn()
job_id, action = upsert_job(conn, _sample_job())
assert action == "inserted"
assert job_id > 0
def test_update_existing_job(self):
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job())
_, action = upsert_job(conn, _sample_job(title="λ³€κ²½λœ 제λͺ©"))
assert action == "updated"
def test_unique_key_is_source_and_id(self):
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
upsert_job(conn, _sample_job(source_site="saramin", source_id="1"))
count = conn.execute("SELECT COUNT(*) FROM jobs").fetchone()[0]
assert count == 2 # μ‚¬μ΄νŠΈκ°€ λ‹€λ₯΄λ©΄ λ³„κ°œ 곡고
def test_updated_job_is_active(self):
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job())
# is_active=0으둜 μˆ˜λ™ μ„€μ • ν›„ upsert β†’ λ‹€μ‹œ 1둜 볡원
conn.execute("UPDATE jobs SET is_active=0 WHERE source_site='wanted' AND source_id='12345'")
upsert_job(conn, _sample_job(title="μ—…λ°μ΄νŠΈλ¨"))
row = conn.execute("SELECT is_active FROM jobs WHERE source_site='wanted'").fetchone()
assert row["is_active"] == 1
# ── insert_skills ─────────────────────────────────────────────────
class TestInsertSkills:
def test_insert_skills(self):
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job())
insert_skills(conn, job_id, ["Python", "SQL", "Apache Spark"])
conn.commit()
rows = conn.execute("SELECT skill_name FROM job_skills WHERE job_id=?", (job_id,)).fetchall()
names = {r["skill_name"] for r in rows}
assert names == {"Python", "SQL", "Apache Spark"}
def test_preserves_canonical_case(self):
"""μ†Œλ¬Έμžλ‘œ μ €μž₯λ˜μ§€ μ•Šκ³  canonical ν‘œκΈ° κ·ΈλŒ€λ‘œ μ €μž₯."""
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job())
insert_skills(conn, job_id, ["Python", "PostgreSQL", "Apache Spark"])
conn.commit()
rows = conn.execute("SELECT skill_name FROM job_skills WHERE job_id=?", (job_id,)).fetchall()
names = {r["skill_name"] for r in rows}
# μ†Œλ¬Έμžκ°€ μ•„λ‹Œ canonical κ°’μœΌλ‘œ μ €μž₯λ˜μ–΄μ•Ό 함
assert "Python" in names
assert "python" not in names
assert "PostgreSQL" in names
assert "Apache Spark" in names
def test_no_duplicate_skills(self):
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job())
insert_skills(conn, job_id, ["Python", "Python", "Python"])
conn.commit()
count = conn.execute(
"SELECT COUNT(*) FROM job_skills WHERE job_id=? AND skill_name='Python'",
(job_id,),
).fetchone()[0]
assert count == 1
def test_empty_skills_no_error(self):
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job())
insert_skills(conn, job_id, [])
conn.commit()
count = conn.execute("SELECT COUNT(*) FROM job_skills WHERE job_id=?", (job_id,)).fetchone()[0]
assert count == 0
# ── _is_cross_site_duplicate ──────────────────────────────────────
class TestCrossSiteDuplicate:
def test_detects_duplicate(self):
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
# 같은 νšŒμ‚¬+제λͺ©μ΄ λ‹€λ₯Έ μ‚¬μ΄νŠΈμ— 있으면 쀑볡
is_dup = _is_cross_site_duplicate(
conn,
_sample_job(source_site="saramin", source_id="99"),
)
assert is_dup is True
def test_not_duplicate_different_title(self):
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
is_dup = _is_cross_site_duplicate(
conn,
_sample_job(source_site="saramin", source_id="99", title="μ „ν˜€ λ‹€λ₯Έ 곡고"),
)
assert is_dup is False
def test_not_duplicate_same_site(self):
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
is_dup = _is_cross_site_duplicate(
conn,
_sample_job(source_site="wanted", source_id="2"),
)
assert is_dup is False
# ── _titles_are_duplicate (퍼지 λ§€μΉ­) ─────────────────────────────
class TestTitlesAreDuplicate:
def test_exact_same(self):
assert _titles_are_duplicate("데이터 μ—”μ§€λ‹ˆμ–΄", "데이터 μ—”μ§€λ‹ˆμ–΄") is True
def test_space_difference(self):
"""띄어쓰기 μ°¨μ΄λŠ” 동일 곡고둜 처리."""
assert _titles_are_duplicate("데이터 μ—”μ§€λ‹ˆμ–΄", "λ°μ΄ν„°μ—”μ§€λ‹ˆμ–΄") is True
def test_bracket_suffix_ignored(self):
"""κ΄„ν˜Έ μ•ˆ λ‚΄μš© 제거 ν›„ 비ꡐ: (κ²½λ ₯), [μ‹ μž…] 등은 λ¬΄μ‹œ."""
assert _titles_are_duplicate("데이터 μ—”μ§€λ‹ˆμ–΄ (κ²½λ ₯ 3년↑)", "데이터 μ—”μ§€λ‹ˆμ–΄") is True
assert _titles_are_duplicate("데이터 뢄석가 [μ‹ μž…]", "데이터 뢄석가") is True
def test_senior_prefix(self):
"""직급 접두어 포함 β†’ 짧은 μͺ½μ΄ κΈ΄ μͺ½μ— 포함."""
assert _titles_are_duplicate("μ‹œλ‹ˆμ–΄ 데이터 μ—”μ§€λ‹ˆμ–΄", "데이터 μ—”μ§€λ‹ˆμ–΄") is True
assert _titles_are_duplicate("Senior 데이터 μ—”μ§€λ‹ˆμ–΄", "데이터 μ—”μ§€λ‹ˆμ–΄") is True
def test_different_job_type(self):
"""μ™„μ „νžˆ λ‹€λ₯Έ 직ꡰ은 쀑볡 μ•„λ‹˜."""
assert _titles_are_duplicate("데이터 μ—”μ§€λ‹ˆμ–΄", "데이터 뢄석가") is False
def test_ml_vs_data_engineer(self):
assert _titles_are_duplicate("ML μ—”μ§€λ‹ˆμ–΄", "데이터 μ—”μ§€λ‹ˆμ–΄") is False
def test_empty_strings(self):
assert _titles_are_duplicate("", "데이터 μ—”μ§€λ‹ˆμ–΄") is False
assert _titles_are_duplicate("데이터 μ—”μ§€λ‹ˆμ–΄", "") is False
class TestFuzzyCrossSiteDuplicate:
def test_fuzzy_detects_space_diff(self):
"""μ‚¬μ΄νŠΈλ§ˆλ‹€ 띄어쓰기가 달라도 μ€‘λ³΅μœΌλ‘œ 식별."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1", title="데이터 μ—”μ§€λ‹ˆμ–΄"))
is_dup = _is_cross_site_duplicate(
conn,
_sample_job(source_site="saramin", source_id="99", title="λ°μ΄ν„°μ—”μ§€λ‹ˆμ–΄"),
)
assert is_dup is True
def test_fuzzy_detects_bracket_diff(self):
"""κ΄„ν˜Έλ‘œ κ²½λ ₯ ν‘œκΈ° 차이도 μ€‘λ³΅μœΌλ‘œ 식별."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1", title="데이터 μ—”μ§€λ‹ˆμ–΄"))
is_dup = _is_cross_site_duplicate(
conn,
_sample_job(source_site="jobkorea", source_id="99", title="데이터 μ—”μ§€λ‹ˆμ–΄ (κ²½λ ₯ 5년↑)"),
)
assert is_dup is True
def test_fuzzy_does_not_false_positive(self):
"""λ‹€λ₯Έ 직ꡰ은 μ€‘λ³΅μœΌλ‘œ μ²˜λ¦¬ν•˜μ§€ μ•ŠμŒ."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1", title="데이터 μ—”μ§€λ‹ˆμ–΄"))
is_dup = _is_cross_site_duplicate(
conn,
_sample_job(source_site="saramin", source_id="99", title="데이터 뢄석가"),
)
assert is_dup is False
# ── validate_job_links ────────────────────────────────────────────
def _mock_head(status_code: int):
"""requests.Session.head() λ₯Ό ν‰λ‚΄λ‚΄λŠ” mock λ°˜ν™˜."""
resp = MagicMock()
resp.status_code = status_code
session = MagicMock()
session.head.return_value = resp
return session
class TestValidateJobLinks:
def test_deactivates_404_link(self):
"""HTTP 404 응닡 β†’ is_active=0."""
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job(source_id="10", deadline_date=None))
conn.commit()
result = validate_job_links(conn, session=_mock_head(404), delay=0)
conn.commit()
assert result["deactivated"] == 1
row = conn.execute("SELECT is_active FROM jobs WHERE id=?", (job_id,)).fetchone()
assert row["is_active"] == 0
def test_keeps_200_active(self):
"""HTTP 200 응닡 β†’ is_active μœ μ§€."""
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job(source_id="11", deadline_date=None))
conn.commit()
result = validate_job_links(conn, session=_mock_head(200), delay=0)
conn.commit()
assert result["deactivated"] == 0
row = conn.execute("SELECT is_active FROM jobs WHERE id=?", (job_id,)).fetchone()
assert row["is_active"] == 1
def test_skips_jobs_with_deadline(self):
"""마감일이 μ„€μ •λœ κ³΅κ³ λŠ” URL 검사 λŒ€μƒ μ œμ™Έ (deadline 기반 λΉ„ν™œμ„±ν™”μ™€ μ—­ν•  뢄리)."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_id="12", deadline_date="2099-12-31"))
conn.commit()
result = validate_job_links(conn, session=_mock_head(404), delay=0)
assert result["checked"] == 0
def test_network_error_ignored(self):
"""λ„€νŠΈμ›Œν¬ 였λ₯˜ λ°œμƒ μ‹œ ν•΄λ‹Ή κ³΅κ³ λŠ” κ±΄λ„ˆλœ€ (보수적 μ ‘κ·Ό)."""
conn = _make_in_memory_conn()
job_id, _ = upsert_job(conn, _sample_job(source_id="13", deadline_date=None))
conn.commit()
session = MagicMock()
session.head.side_effect = Exception("connection timeout")
result = validate_job_links(conn, session=session, delay=0)
conn.commit()
assert result["deactivated"] == 0
row = conn.execute("SELECT is_active FROM jobs WHERE id=?", (job_id,)).fetchone()
assert row["is_active"] == 1
def test_returns_correct_keys(self):
conn = _make_in_memory_conn()
result = validate_job_links(conn, session=_mock_head(200), delay=0)
assert "checked" in result
assert "deactivated" in result
# ── deactivate_unseen_jobs ────────────────────────────────────────
class TestDeactivateUnseenJobs:
def test_deactivates_job_not_seen_in_crawl(self):
"""크둀 μ‹œμž‘ μ „ updated_at β†’ 였늘 미발견 곡고 λΉ„ν™œμ„±ν™”."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="1"))
conn.execute("UPDATE jobs SET updated_at='2020-01-01 00:00:00' WHERE source_id='1'")
conn.commit()
count = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
conn.commit()
assert count == 1
row = conn.execute("SELECT is_active FROM jobs WHERE source_id='1'").fetchone()
assert row["is_active"] == 0
def test_keeps_job_seen_in_crawl(self):
"""크둀 μ‹œμž‘ 이후 updated_at β†’ ν™œμ„± μœ μ§€."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="2"))
conn.commit()
# crawl_startλ₯Ό 과거둜 β†’ ν˜„μž¬ updated_at(β‰ˆnow)이 μ΄ν›„μž„
count = deactivate_unseen_jobs(conn, "wanted", "2020-01-01 00:00:00")
conn.commit()
assert count == 0
row = conn.execute("SELECT is_active FROM jobs WHERE source_id='2'").fetchone()
assert row["is_active"] == 1
def test_only_affects_matching_source(self):
"""λ‹€λ₯Έ source_site κ³΅κ³ λŠ” 영ν–₯ μ—†μŒ."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="3"))
upsert_job(conn, _sample_job(source_site="saramin", source_id="4"))
conn.execute("UPDATE jobs SET updated_at='2020-01-01 00:00:00'")
conn.commit()
count = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
conn.commit()
assert count == 1
assert conn.execute("SELECT is_active FROM jobs WHERE source_id='3'").fetchone()["is_active"] == 0
assert conn.execute("SELECT is_active FROM jobs WHERE source_id='4'").fetchone()["is_active"] == 1
def test_already_inactive_not_counted(self):
"""이미 λΉ„ν™œμ„±μΈ κ³΅κ³ λŠ” 카운트 μ œμ™Έ."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_site="wanted", source_id="5"))
conn.execute("UPDATE jobs SET is_active=0, updated_at='2020-01-01' WHERE source_id='5'")
conn.commit()
count = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
assert count == 0
def test_returns_int(self):
conn = _make_in_memory_conn()
result = deactivate_unseen_jobs(conn, "wanted", "2025-01-01 09:00:00")
assert isinstance(result, int)
# ── deactivate_expired_jobs ───────────────────────────────────────
class TestDeactivateExpiredJobs:
def test_deactivates_past_deadline(self):
"""마감일이 μ§€λ‚œ κ³΅κ³ λŠ” λΉ„ν™œμ„±ν™”."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_id="10", deadline_date="2020-01-01"))
conn.commit()
count = deactivate_expired_jobs(conn)
conn.commit()
assert count >= 1
row = conn.execute("SELECT is_active FROM jobs WHERE source_id='10'").fetchone()
assert row["is_active"] == 0
def test_keeps_future_deadline_active(self):
"""마감일이 아직 남은 κ³΅κ³ λŠ” μœ μ§€."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_id="11", deadline_date="2099-12-31"))
conn.commit()
deactivate_expired_jobs(conn)
conn.commit()
row = conn.execute("SELECT is_active FROM jobs WHERE source_id='11'").fetchone()
assert row["is_active"] == 1
def test_already_inactive_not_counted(self):
"""이미 λΉ„ν™œμ„±μΈ κ³΅κ³ λŠ” μΉ΄μš΄νŠΈμ— ν¬ν•¨λ˜μ§€ μ•ŠμŒ."""
conn = _make_in_memory_conn()
upsert_job(conn, _sample_job(source_id="12", deadline_date="2020-01-01"))
conn.execute("UPDATE jobs SET is_active=0 WHERE source_id='12'")
conn.commit()
count = deactivate_expired_jobs(conn)
assert count == 0
def test_returns_int(self):
conn = _make_in_memory_conn()
result = deactivate_expired_jobs(conn)
assert isinstance(result, int)