jobsonar/.github/workflows/crawl.yml
name: Job Crawl

on:
  schedule:
    - cron: "0 1 * * *" # daily at 10:00 KST (UTC+9 → 01:00 UTC)
  workflow_dispatch: # allow manual runs
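  # Note: GitHub Actions evaluates cron expressions in UTC, and scheduled
  # workflows run only against the repository's default branch.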
jobs:
  crawl:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: pip
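          # `cache: pip` caches pip's download cache; setup-python keys it on
          # the hash of the repo's requirements files, so it is refreshed
          # whenever requirements.txt changes.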
      # huggingface_hub is installed on top of requirements.txt because the
      # restore/push steps below use it directly.
      - name: Install dependencies
        run: pip install -r requirements.txt huggingface_hub
      # Restore the previous DB from the HF Dataset (preserves accumulated data)
      - name: Restore DB from HF Dataset
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_DATASET_REPO: ${{ secrets.HF_DATASET_REPO }}
        run: |
          python - <<'EOF'
          import os, shutil
          from huggingface_hub import hf_hub_download

          repo = os.environ["HF_DATASET_REPO"]
          token = os.environ["HF_TOKEN"]
          try:
              # hf_hub_download returns a path inside the local HF cache, so
              # the file is copied to where the crawlers expect it.
              path = hf_hub_download(
                  repo_id=repo, filename="jobsonar.db",
                  repo_type="dataset", token=token,
              )
              os.makedirs("data", exist_ok=True)
              shutil.copy(path, "data/jobsonar.db")
              print("DB restored")
          except Exception as e:
              print(f"No existing DB (expected on the first run): {e}")
          EOF
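      # PYTHONPATH points at the checkout root so `python -m crawler.run`
      # resolves the in-repo crawler package without installing it.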
      - name: Run crawlers
        run: python -m crawler.run --source all --max-pages 15
        env:
          PYTHONPATH: ${{ github.workspace }}
# ์—…๋ฐ์ดํŠธ๋œ DB๋ฅผ HF Dataset์— ํ‘ธ์‹œ โ†’ Spaces๊ฐ€ ์ž๋™์œผ๋กœ ์ตœ์‹  ๋ฐ์ดํ„ฐ ์‚ฌ์šฉ
- name: Push DB to HF Dataset
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_DATASET_REPO: ${{ secrets.HF_DATASET_REPO }}
run: |
python - <<'EOF'
import os
from huggingface_hub import HfApi
api = HfApi(token=os.environ["HF_TOKEN"])
api.upload_file(
path_or_fileobj="data/jobsonar.db",
path_in_repo="jobsonar.db",
repo_id=os.environ["HF_DATASET_REPO"],
repo_type="dataset",
commit_message="chore: weekly DB update",
)
print("HF Dataset ์—…๋กœ๋“œ ์™„๋ฃŒ")
EOF
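      # Note: each upload_file call creates a new commit in the dataset repo,
      # so earlier snapshots of jobsonar.db remain available in its history.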