jobsonar / tests /test_analysis.py
MiniMing
refactor: μ „λ©΄ μ½”λ“œ κ°œμ„  β€” 버그 μˆ˜μ •, 뢄석 정확도, ꡬ쑰화, ν…ŒμŠ€νŠΈ μΆ”κ°€
0ade07c
"""뢄석 ν•¨μˆ˜ λ‹¨μœ„ ν…ŒμŠ€νŠΈ."""
import sys
from pathlib import Path
from datetime import date, timedelta
sys.path.insert(0, str(Path(__file__).parent.parent))
import pandas as pd
import pytest
from analysis.trends import (
normalize_location,
weekly_job_counts,
top_skills_by_category,
skill_trend_weekly,
salary_by_category,
experience_distribution,
skill_growth_rate,
new_jobs_count,
_trend_date,
)
# ── normalize_location ────────────────────────────────────────────
class TestNormalizeLocation:
def test_seoul(self):
assert normalize_location("μ„œμšΈ 강남ꡬ") == "μ„œμšΈ"
def test_gyeonggi(self):
assert normalize_location("κ²½κΈ° μ„±λ‚¨μ‹œ 뢄당ꡬ") == "κ²½κΈ°"
def test_exact_match(self):
assert normalize_location("λΆ€μ‚°") == "λΆ€μ‚°"
def test_overseas(self):
assert normalize_location("San Francisco") == "ν•΄μ™Έ"
def test_none(self):
assert normalize_location(None) == ""
def test_nan(self):
assert normalize_location(float("nan")) == ""
# ── _trend_date ───────────────────────────────────────────────────
class TestTrendDate:
def _make_df(self, posted, collected):
return pd.DataFrame({
"posted_date": pd.to_datetime(posted),
"collected_at": pd.to_datetime(collected),
})
def test_prefers_posted_date(self):
df = self._make_df(["2025-01-10"], ["2025-01-15"])
result = _trend_date(df)
assert str(result.iloc[0].date()) == "2025-01-10"
def test_fallback_to_collected_at(self):
df = self._make_df([None], ["2025-01-15"])
result = _trend_date(df)
assert str(result.iloc[0].date()) == "2025-01-15"
# ── weekly_job_counts ─────────────────────────────────────────────
class TestWeeklyJobCounts:
def _make_jobs(self):
today = pd.Timestamp.now()
return pd.DataFrame({
"job_category": ["데이터 μ—”μ§€λ‹ˆμ–΄", "데이터 뢄석가", "데이터 μ—”μ§€λ‹ˆμ–΄"],
"posted_date": [today - timedelta(days=3), today - timedelta(days=10), None],
"collected_at": [today, today, today],
})
def test_returns_dataframe(self):
df = self._make_jobs()
result = weekly_job_counts(df)
assert isinstance(result, pd.DataFrame)
assert "week" in result.columns
assert "count" in result.columns
def test_count_correct(self):
df = self._make_jobs()
result = weekly_job_counts(df)
assert result["count"].sum() == 3
def test_empty_input(self):
empty = pd.DataFrame(columns=["job_category", "posted_date", "collected_at"])
result = weekly_job_counts(empty)
assert result.empty
# ── top_skills_by_category ────────────────────────────────────────
class TestTopSkillsByCategory:
def _make_skills(self):
return pd.DataFrame({
"job_id": [1, 1, 2, 2, 3],
"skill_name": ["Python", "SQL", "Python", "AWS", "Python"],
"job_category": ["DE", "DE", "DE", "DE", "DA"],
"source_site": ["wanted"] * 5,
"posted_date": [None] * 5,
"collected_at": [pd.Timestamp.now()] * 5,
})
def _make_jobs(self):
return pd.DataFrame({
"id": [1, 2, 3],
"job_category": ["DE", "DE", "DA"],
})
def test_basic(self):
sf = self._make_skills()
result = top_skills_by_category(sf, top_n=10)
assert "skill_name" in result.columns
assert "count" in result.columns
assert "pct" in result.columns
def test_pct_with_jobs_df(self):
sf = self._make_skills()
jf = self._make_jobs()
result = top_skills_by_category(sf, jobs_df=jf, top_n=10)
de = result[result["job_category"] == "DE"]
python_row = de[de["skill_name"] == "Python"].iloc[0]
# DE 곡고 2개 쀑 Python은 2개 β†’ 100%
assert python_row["pct"] == 100.0
def test_top_n_limit(self):
sf = self._make_skills()
result = top_skills_by_category(sf, top_n=1)
# 각 μ§κ΅°μ—μ„œ 1개만
assert result.groupby("job_category").size().max() == 1
def test_empty_input(self):
empty = pd.DataFrame(columns=["job_id", "skill_name", "job_category",
"source_site", "posted_date", "collected_at"])
result = top_skills_by_category(empty)
assert result.empty
# ── salary_by_category ────────────────────────────────────────────
class TestSalaryByCategory:
def _make_jobs(self):
return pd.DataFrame({
"job_category": ["DE", "DA", "DE"],
"salary_min": [4000, 3000, None],
"salary_max": [6000, 5000, None],
"company_name": ["A", "B", "C"],
})
def test_drops_null_salary(self):
df = self._make_jobs()
result = salary_by_category(df)
assert len(result) == 2 # None ν–‰ 제거
def test_salary_mid_calculation(self):
df = self._make_jobs()
result = salary_by_category(df)
de_row = result[result["job_category"] == "DE"].iloc[0]
assert de_row["salary_mid"] == 5000.0 # (4000+6000)/2
# ── experience_distribution ───────────────────────────────────────
class TestExperienceDistribution:
def _make_jobs(self):
return pd.DataFrame({
"job_category": ["DE"] * 5,
"experience_min": [0, 1, 3, 7, None],
"collected_at": [pd.Timestamp.now()] * 5,
})
def test_basic_groups(self):
df = self._make_jobs()
result = experience_distribution(df)
labels = result["exp_group"].astype(str).tolist()
assert "μ‹ μž…" in labels
assert "1-2λ…„" in labels
def test_null_excluded(self):
df = self._make_jobs()
result = experience_distribution(df)
assert result["count"].sum() == 4 # None μ œμ™Έ
# ── skill_growth_rate ─────────────────────────────────────────────
class TestSkillGrowthRate:
def _make_skills(self, recent_count=5, prev_count=2):
now = pd.Timestamp.now()
rows = []
for _ in range(recent_count):
rows.append({"skill_name": "Python", "job_id": 1,
"job_category": "DE", "source_site": "wanted",
"posted_date": None, "collected_at": now - timedelta(days=3)})
for _ in range(prev_count):
rows.append({"skill_name": "Python", "job_id": 2,
"job_category": "DE", "source_site": "wanted",
"posted_date": None, "collected_at": now - timedelta(days=20)})
return pd.DataFrame(rows)
def test_returns_dataframe(self):
df = self._make_skills()
result = skill_growth_rate(df)
assert isinstance(result, pd.DataFrame)
def test_growth_positive(self):
df = self._make_skills(recent_count=5, prev_count=2)
result = skill_growth_rate(df)
assert not result.empty
assert result.iloc[0]["growth_pct"] > 0
def test_prev_filter_removes_noise(self):
"""prev < 2인 μŠ€ν‚¬μ€ ν¬ν•¨ν•˜μ§€ μ•Šμ•„μ•Ό 함."""
df = self._make_skills(recent_count=5, prev_count=1)
result = skill_growth_rate(df)
assert result.empty # prev=1 μ΄λ―€λ‘œ 필터링
def test_empty_input(self):
empty = pd.DataFrame(columns=["skill_name", "job_id", "job_category",
"source_site", "posted_date", "collected_at"])
result = skill_growth_rate(empty)
assert result.empty
# ── new_jobs_count ────────────────────────────────────────────────
class TestNewJobsCount:
def test_recent_jobs(self):
now = pd.Timestamp.now()
df = pd.DataFrame({
"collected_at": [now - timedelta(days=i) for i in range(10)],
})
assert new_jobs_count(df, days=7) == 8 # 0~7일 포함
def test_empty(self):
assert new_jobs_count(pd.DataFrame(columns=["collected_at"]), days=7) == 0