# jobsonar/crawler/base.py
"""๊ณตํ†ต ํฌ๋กค๋Ÿฌ ๋ฒ ์ด์Šค ํด๋ž˜์Šค."""
import logging
import random
import re
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import date
import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Job-category keywords to crawl (used as URL query parameters).
TARGET_CATEGORIES = {
"๋ฐ์ดํ„ฐ ์—”์ง€๋‹ˆ์–ด": ["๋ฐ์ดํ„ฐ ์—”์ง€๋‹ˆ์–ด", "data engineer"],
"๋ฐ์ดํ„ฐ ๋ถ„์„๊ฐ€": ["๋ฐ์ดํ„ฐ ๋ถ„์„", "data analyst", "๋ฐ์ดํ„ฐ ๋ถ„์„๊ฐ€"],
"ML ์—”์ง€๋‹ˆ์–ด": ["๋จธ์‹ ๋Ÿฌ๋‹", "machine learning", "ML engineer"],
"๋ฐ์ดํ„ฐ ์‚ฌ์ด์–ธํ‹ฐ์ŠคํŠธ": ["๋ฐ์ดํ„ฐ ์‚ฌ์ด์–ธํ‹ฐ์ŠคํŠธ", "data scientist"],
}
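
# Each concrete crawler is expected to turn these keyword lists into its own
# site-specific search parameters inside `crawl()`.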
# ์ •๊ทœํ™”๋œ ์Šคํ‚ฌ ์‚ฌ์ „ (์›๋ฌธ โ†’ ํ‘œ์ค€๋ช…)
SKILL_ALIASES: dict[str, str] = {
# ์–ธ์–ด
"python": "Python", "ํŒŒ์ด์ฌ": "Python",
"r": "R",
"scala": "Scala",
"java": "Java", "์ž๋ฐ”": "Java",
"javascript": "JavaScript", "js": "JavaScript",
"typescript": "TypeScript", "ts": "TypeScript",
"go": "Go", "golang": "Go",
"c++": "C++", "cpp": "C++",
"c#": "C#",
"rust": "Rust",
"julia": "Julia",
"matlab": "MATLAB",
# DB / SQL
"sql": "SQL",
"mysql": "MySQL",
"postgresql": "PostgreSQL", "postgres": "PostgreSQL",
"oracle": "Oracle",
"mssql": "MSSQL", "sql server": "MSSQL",
"sqlite": "SQLite",
"mongodb": "MongoDB", "mongo": "MongoDB",
"cassandra": "Cassandra",
"hbase": "HBase",
"neo4j": "Neo4j",
"dynamodb": "DynamoDB",
"redis": "Redis",
"hive": "Hive",
"presto": "Presto", "trino": "Trino",
"athena": "AWS Athena",
# ํด๋ผ์šฐ๋“œ
"aws": "AWS", "amazon web services": "AWS",
"gcp": "GCP", "google cloud": "GCP", "google cloud platform": "GCP",
"azure": "Azure", "microsoft azure": "Azure",
# ๋น…๋ฐ์ดํ„ฐ / ํŒŒ์ดํ”„๋ผ์ธ
"spark": "Apache Spark", "apache spark": "Apache Spark", "pyspark": "Apache Spark",
"hadoop": "Hadoop",
"airflow": "Apache Airflow", "apache airflow": "Apache Airflow",
"kafka": "Apache Kafka", "apache kafka": "Apache Kafka",
"flink": "Apache Flink", "apache flink": "Apache Flink",
"nifi": "Apache NiFi",
"databricks": "Databricks",
"dbt": "dbt", "data build tool": "dbt",
"luigi": "Luigi",
"celery": "Celery",
# ์ปจํ…Œ์ด๋„ˆ / ์ธํ”„๋ผ
"docker": "Docker",
"kubernetes": "Kubernetes", "k8s": "Kubernetes",
"terraform": "Terraform",
"ansible": "Ansible",
"jenkins": "Jenkins",
"github actions": "GitHub Actions",
"gitlab ci": "GitLab CI",
"git": "Git", "github": "Git", "gitlab": "Git",
"linux": "Linux", "ubuntu": "Linux",
    # ML / DL frameworks
"tensorflow": "TensorFlow", "tf": "TensorFlow",
"pytorch": "PyTorch", "torch": "PyTorch",
"keras": "Keras",
"scikit-learn": "scikit-learn", "sklearn": "scikit-learn",
"xgboost": "XGBoost",
"lightgbm": "LightGBM",
"catboost": "CatBoost",
"hugging face": "Hugging Face", "huggingface": "Hugging Face",
"langchain": "LangChain",
"openai": "OpenAI API",
"llm": "LLM",
"rag": "RAG",
"mlflow": "MLflow",
"kubeflow": "Kubeflow",
"sagemaker": "SageMaker", "aws sagemaker": "SageMaker",
"vertex ai": "Vertex AI",
    # Analytics / visualization
"pandas": "pandas",
"numpy": "NumPy",
"scipy": "SciPy",
"matplotlib": "Matplotlib",
"seaborn": "Seaborn",
"plotly": "Plotly",
"tableau": "Tableau", "ํƒ€๋ธ”๋กœ": "Tableau",
"power bi": "Power BI", "powerbi": "Power BI",
"looker": "Looker", "looker studio": "Looker",
"redash": "Redash",
"superset": "Apache Superset", "apache superset": "Apache Superset",
"metabase": "Metabase",
"grafana": "Grafana",
"excel": "Excel", "์—‘์…€": "Excel",
# ๋ฐ์ดํ„ฐ ์›จ์–ดํ•˜์šฐ์Šค
"bigquery": "BigQuery", "bq": "BigQuery",
"redshift": "Redshift", "aws redshift": "Redshift",
"snowflake": "Snowflake",
"clickhouse": "ClickHouse",
    # Search / logging
"elasticsearch": "Elasticsearch", "elastic": "Elasticsearch",
"kibana": "Kibana",
"logstash": "Logstash",
    # Collaboration / misc
"jira": "Jira",
"confluence": "Confluence",
"notion": "Notion",
"slack": "Slack",
"figma": "Figma",
}


def normalize_skill(raw: str) -> str:
    """Map a raw skill string to its canonical name."""
    key = raw.lower().strip()
    return SKILL_ALIASES.get(key, raw.strip())
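
# For example, normalize_skill("PySpark") -> "Apache Spark" and
# normalize_skill("์—‘์…€") -> "Excel"; unknown skills pass through unchanged
# apart from stripping, e.g. normalize_skill(" Dagster ") -> "Dagster".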


def extract_skills_from_text(text: str) -> list[str]:
    """Extract known skill keywords from free text.

    ASCII aliases are matched on word boundaries so that, e.g., "java" does
    not match inside "javascript" and "excel" does not match inside
    "excellent". Korean aliases keep plain substring matching, since
    particles attach directly to the noun ("ํŒŒ์ด์ฌ์„" still matches "ํŒŒ์ด์ฌ").
    """
    text_lower = text.lower()
    found = set()
    for alias, canonical in SKILL_ALIASES.items():
        # Anchor on \b only where the alias edge is an ASCII word character;
        # "c++" and "c#" end in symbols, next to which \b misfires.
        prefix = r"\b" if alias[0].isascii() and alias[0].isalnum() else ""
        suffix = r"\b" if alias[-1].isascii() and alias[-1].isalnum() else ""
        if re.search(prefix + re.escape(alias) + suffix, text_lower):
            found.add(canonical)
    return sorted(found)
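
# For example, extract_skills_from_text("Experience with Python, k8s and C#")
# returns ["C#", "Kubernetes", "Python"], while "excellent" no longer yields
# Excel and "javascript" no longer yields Java.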


@dataclass
class JobItem:
    """One job posting as returned by a crawler."""
source_site: str
source_id: str
url: str
title: str
company_name: str
job_category: str
skills: list[str] = field(default_factory=list)
    industry: str | None = None          # company industry sector
    employment_type: str | None = None   # e.g. "์ •๊ทœ์ง" (full-time), "๊ณ„์•ฝ์ง" (contract), "์ธํ„ด" (intern)
location: str | None = None
experience_min: int | None = None
experience_max: int | None = None
salary_min: int | None = None
salary_max: int | None = None
posted_date: date | None = None
deadline_date: date | None = None

    def to_db_dict(self) -> dict:
        """Flatten the item into a DB-ready dict (dates as ISO strings)."""
return {
"source_site": self.source_site,
"source_id": self.source_id,
"url": self.url,
"title": self.title,
"company_name": self.company_name,
"job_category": self.job_category,
"industry": self.industry,
"employment_type": self.employment_type,
"location": self.location,
"experience_min": self.experience_min,
"experience_max": self.experience_max,
"salary_min": self.salary_min,
"salary_max": self.salary_max,
"posted_date": self.posted_date.isoformat() if self.posted_date else None,
"deadline_date": self.deadline_date.isoformat() if self.deadline_date else None,
}
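
# A populated JobItem flattens cleanly: for example (illustrative values),
# JobItem(source_site="example", source_id="12345",
#         url="https://example.com/job/12345", title="Data Engineer",
#         company_name="Acme", job_category="๋ฐ์ดํ„ฐ ์—”์ง€๋‹ˆ์–ด").to_db_dict()
# returns a flat dict with ISO-formatted dates. Note that `skills` is not part
# of the dict, so callers that need it must persist it separately.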


class BaseCrawler(ABC):
    """Common interface for all crawlers."""

    SITE_NAME: str = ""
    MIN_DELAY: float = 1.5  # minimum delay between requests (seconds)
    MAX_DELAY: float = 3.5  # maximum delay between requests (seconds)

    def __init__(self):
self.session = requests.Session()
self.session.headers.update({
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/122.0.0.0 Safari/537.36"
),
"Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8",
})

    def _sleep(self) -> None:
        """Wait a random interval between MIN_DELAY and MAX_DELAY seconds."""
        delay = random.uniform(self.MIN_DELAY, self.MAX_DELAY)
        logger.debug(f"[{self.SITE_NAME}] sleeping {delay:.1f}s")
        time.sleep(delay)

    def _get(self, url: str, **kwargs) -> requests.Response:
        """Rate-limited GET; raises requests.HTTPError on non-2xx responses."""
        self._sleep()
        resp = self.session.get(url, timeout=15, **kwargs)
        resp.raise_for_status()
        return resp

    def _soup(self, url: str, **kwargs) -> BeautifulSoup:
        """Fetch a URL and return its parsed HTML."""
        resp = self._get(url, **kwargs)
        return BeautifulSoup(resp.text, "lxml")
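
    # Note: the "lxml" parser requires the lxml package to be installed;
    # BeautifulSoup's built-in "html.parser" works as a slower fallback.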

    @abstractmethod
    def crawl(self, category: str, max_pages: int = 10) -> list[JobItem]:
        """Collect and return postings for the given job category."""
        ...
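

if __name__ == "__main__":
    # Minimal manual check of the module-level helpers; run as
    # `python -m jobsonar.crawler.base`. The sample text is illustrative only.
    sample = "Looking for a data engineer: Python, Airflow, k8s, and SQL."
    print(normalize_skill("PySpark"))        # Apache Spark
    print(extract_skills_from_text(sample))  # ['Apache Airflow', 'Kubernetes', 'Python', 'SQL']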