Spaces:
Running
Running
"""Common crawler base class."""
| import time | |
| import random | |
| import logging | |
| from abc import ABC, abstractmethod | |
| from dataclasses import dataclass, field | |
| from datetime import date | |
| import requests | |
| from bs4 import BeautifulSoup | |
| logger = logging.getLogger(__name__) | |
# Crawl-target job-category keywords (used as URL query parameters).
# NOTE(review): the Korean keys/values below arrived mojibake-garbled by a
# broken encoding pass; they are runtime data, so they are kept byte-identical
# here — fix the encoding at the data source, not in this file.
TARGET_CATEGORIES = {
    "๋ฐ์ดํฐ ์์ง๋์ด": ["๋ฐ์ดํฐ ์์ง๋์ด", "data engineer"],
    "๋ฐ์ดํฐ ๋ถ์๊ฐ": ["๋ฐ์ดํฐ ๋ถ์", "data analyst", "๋ฐ์ดํฐ ๋ถ์๊ฐ"],
    "ML ์์ง๋์ด": ["๋จธ์ ๋ฌ๋", "machine learning", "ML engineer"],
    "๋ฐ์ดํฐ ์ฌ์ด์ธํฐ์คํธ": ["๋ฐ์ดํฐ ์ฌ์ด์ธํฐ์คํธ", "data scientist"],
}
# Normalized skill dictionary: lowercase alias -> canonical display name.
# Lookups are done against lowercased text, so all alias keys must be lowercase
# (the mojibake keys are Korean aliases kept byte-identical as runtime data).
SKILL_ALIASES: dict[str, str] = {
    # Languages
    "python": "Python", "ํ์ด์ฌ": "Python",
    "r": "R",
    "scala": "Scala",
    "java": "Java", "์๋ฐ": "Java",
    "javascript": "JavaScript", "js": "JavaScript",
    "typescript": "TypeScript", "ts": "TypeScript",
    "go": "Go", "golang": "Go",
    "c++": "C++", "cpp": "C++",
    "c#": "C#",
    "rust": "Rust",
    "julia": "Julia",
    "matlab": "MATLAB",
    # DB / SQL
    "sql": "SQL",
    "mysql": "MySQL",
    "postgresql": "PostgreSQL", "postgres": "PostgreSQL",
    "oracle": "Oracle",
    "mssql": "MSSQL", "sql server": "MSSQL",
    "sqlite": "SQLite",
    "mongodb": "MongoDB", "mongo": "MongoDB",
    "cassandra": "Cassandra",
    "hbase": "HBase",
    "neo4j": "Neo4j",
    "dynamodb": "DynamoDB",
    "redis": "Redis",
    "hive": "Hive",
    "presto": "Presto", "trino": "Trino",
    "athena": "AWS Athena",
    # Cloud
    "aws": "AWS", "amazon web services": "AWS",
    "gcp": "GCP", "google cloud": "GCP", "google cloud platform": "GCP",
    "azure": "Azure", "microsoft azure": "Azure",
    # Big data / pipelines
    "spark": "Apache Spark", "apache spark": "Apache Spark", "pyspark": "Apache Spark",
    "hadoop": "Hadoop",
    "airflow": "Apache Airflow", "apache airflow": "Apache Airflow",
    "kafka": "Apache Kafka", "apache kafka": "Apache Kafka",
    "flink": "Apache Flink", "apache flink": "Apache Flink",
    "nifi": "Apache NiFi",
    "databricks": "Databricks",
    "dbt": "dbt", "data build tool": "dbt",
    "luigi": "Luigi",
    "celery": "Celery",
    # Containers / infrastructure
    "docker": "Docker",
    "kubernetes": "Kubernetes", "k8s": "Kubernetes",
    "terraform": "Terraform",
    "ansible": "Ansible",
    "jenkins": "Jenkins",
    "github actions": "GitHub Actions",
    "gitlab ci": "GitLab CI",
    "git": "Git", "github": "Git", "gitlab": "Git",
    "linux": "Linux", "ubuntu": "Linux",
    # ML / DL frameworks
    "tensorflow": "TensorFlow", "tf": "TensorFlow",
    "pytorch": "PyTorch", "torch": "PyTorch",
    "keras": "Keras",
    "scikit-learn": "scikit-learn", "sklearn": "scikit-learn",
    "xgboost": "XGBoost",
    "lightgbm": "LightGBM",
    "catboost": "CatBoost",
    "hugging face": "Hugging Face", "huggingface": "Hugging Face",
    "langchain": "LangChain",
    "openai": "OpenAI API",
    "llm": "LLM",
    "rag": "RAG",
    "mlflow": "MLflow",
    "kubeflow": "Kubeflow",
    "sagemaker": "SageMaker", "aws sagemaker": "SageMaker",
    "vertex ai": "Vertex AI",
    # Analytics / visualization
    "pandas": "pandas",
    "numpy": "NumPy",
    "scipy": "SciPy",
    "matplotlib": "Matplotlib",
    "seaborn": "Seaborn",
    "plotly": "Plotly",
    "tableau": "Tableau", "ํ๋ธ๋ก": "Tableau",
    "power bi": "Power BI", "powerbi": "Power BI",
    "looker": "Looker", "looker studio": "Looker",
    "redash": "Redash",
    "superset": "Apache Superset", "apache superset": "Apache Superset",
    "metabase": "Metabase",
    "grafana": "Grafana",
    "excel": "Excel", "์์ ": "Excel",
    # Data warehouses
    "bigquery": "BigQuery", "bq": "BigQuery",
    "redshift": "Redshift", "aws redshift": "Redshift",
    "snowflake": "Snowflake",
    "clickhouse": "ClickHouse",
    # Search / logging
    "elasticsearch": "Elasticsearch", "elastic": "Elasticsearch",
    "kibana": "Kibana",
    "logstash": "Logstash",
    # Collaboration / misc
    "jira": "Jira",
    "confluence": "Confluence",
    "notion": "Notion",
    "slack": "Slack",
    "figma": "Figma",
}
def normalize_skill(raw: str) -> str:
    """Map a raw skill string to its canonical display name.

    Lookup is case-insensitive; unknown skills are returned stripped but
    otherwise unchanged.
    """
    cleaned = raw.strip()
    return SKILL_ALIASES.get(cleaned.lower(), cleaned)
| def extract_skills_from_text(text: str) -> list[str]: | |
| """ํ ์คํธ์์ ์๋ ค์ง ์คํฌ ํค์๋๋ฅผ ์ถ์ถ. ๋จ์ผ ๋ฌธ์ ํค์๋๋ ๋จ์ด ๊ฒฝ๊ณ ์ ์ฉ.""" | |
| import re | |
| text_lower = text.lower() | |
| found = set() | |
| for alias, canonical in SKILL_ALIASES.items(): | |
| if len(alias) <= 2: | |
| # 'r', 'R' ๋ฑ ์งง์ ํค์๋๋ ๋จ์ด ๊ฒฝ๊ณ ๋งค์นญ | |
| if re.search(rf"\b{re.escape(alias)}\b", text_lower): | |
| found.add(canonical) | |
| else: | |
| if alias in text_lower: | |
| found.add(canonical) | |
| return sorted(found) | |
| class JobItem: | |
| """ํฌ๋กค๋ฌ๊ฐ ๋ฐํํ๋ ๊ณต๊ณ ๋จ์.""" | |
| source_site: str | |
| source_id: str | |
| url: str | |
| title: str | |
| company_name: str | |
| job_category: str | |
| skills: list[str] = field(default_factory=list) | |
| industry: str | None = None # ํ์ฌ ์ ์ข | |
| employment_type: str | None = None # ์ ๊ท์ง | ๊ณ์ฝ์ง | ์ธํด ๋ฑ | |
| location: str | None = None | |
| experience_min: int | None = None | |
| experience_max: int | None = None | |
| salary_min: int | None = None | |
| salary_max: int | None = None | |
| posted_date: date | None = None | |
| deadline_date: date | None = None | |
| def to_db_dict(self) -> dict: | |
| return { | |
| "source_site": self.source_site, | |
| "source_id": self.source_id, | |
| "url": self.url, | |
| "title": self.title, | |
| "company_name": self.company_name, | |
| "job_category": self.job_category, | |
| "industry": self.industry, | |
| "employment_type": self.employment_type, | |
| "location": self.location, | |
| "experience_min": self.experience_min, | |
| "experience_max": self.experience_max, | |
| "salary_min": self.salary_min, | |
| "salary_max": self.salary_max, | |
| "posted_date": self.posted_date.isoformat() if self.posted_date else None, | |
| "deadline_date": self.deadline_date.isoformat() if self.deadline_date else None, | |
| } | |
class BaseCrawler(ABC):
    """Common interface for all crawlers.

    Subclasses set SITE_NAME and implement :meth:`crawl`; this base provides
    a shared requests session with browser-like headers, polite randomized
    delays between requests, and HTML parsing helpers.

    Bug fix: ``crawl`` is now decorated ``@abstractmethod`` — it was declared
    with a ``...`` body but never marked abstract (``abstractmethod`` was
    imported and unused), so a subclass that forgot to implement it would
    silently return ``None`` instead of failing at instantiation.
    """

    SITE_NAME: str = ""
    MIN_DELAY: float = 1.5  # minimum delay between requests (seconds)
    MAX_DELAY: float = 3.5  # maximum delay between requests (seconds)

    def __init__(self) -> None:
        self.session = requests.Session()
        # Browser-like headers — some job boards reject default client UAs.
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/122.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8",
        })

    def _sleep(self) -> None:
        """Sleep a random interval to avoid hammering the target site."""
        delay = random.uniform(self.MIN_DELAY, self.MAX_DELAY)
        # Lazy %-formatting: skip string work when DEBUG is disabled.
        logger.debug("[%s] sleeping %.1fs", self.SITE_NAME, delay)
        time.sleep(delay)

    def _get(self, url: str, **kwargs) -> requests.Response:
        """GET *url* with rate limiting; raises requests.HTTPError on 4xx/5xx."""
        self._sleep()
        resp = self.session.get(url, timeout=15, **kwargs)
        resp.raise_for_status()
        return resp

    def _soup(self, url: str, **kwargs) -> BeautifulSoup:
        """GET *url* and return the response body parsed with lxml."""
        resp = self._get(url, **kwargs)
        return BeautifulSoup(resp.text, "lxml")

    @abstractmethod
    def crawl(self, category: str, max_pages: int = 10) -> list[JobItem]:
        """Collect and return postings for the given job category.

        Must be implemented by each site-specific crawler.
        """
        ...