Spaces:
Running
Running
feat: job offer scraper — URL scraping with text fallback
Browse files
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- app/main.py +2 -1
- app/routers/offer.py +25 -0
- app/services/offer_scraper.py +77 -0
- tests/test_offer_scraper.py +39 -0
app/main.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
|
| 4 |
-
from app.routers import linkedin
|
| 5 |
|
| 6 |
app = FastAPI(title="Bored CV API", version="0.1.0")
|
| 7 |
|
|
@@ -17,6 +17,7 @@ app.add_middleware(
|
|
| 17 |
)
|
| 18 |
|
| 19 |
app.include_router(linkedin.router)
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
@app.get("/api/health")
|
|
|
|
| 1 |
from fastapi import FastAPI
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
|
| 4 |
+
from app.routers import linkedin, offer
|
| 5 |
|
| 6 |
app = FastAPI(title="Bored CV API", version="0.1.0")
|
| 7 |
|
|
|
|
| 17 |
)
|
| 18 |
|
| 19 |
app.include_router(linkedin.router)
|
| 20 |
+
app.include_router(offer.router)
|
| 21 |
|
| 22 |
|
| 23 |
@app.get("/api/health")
|
app/routers/offer.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException
|
| 2 |
+
|
| 3 |
+
from app.models import Offer, OfferScrapeRequest
|
| 4 |
+
from app.services.offer_scraper import parse_offer_text, scrape_offer_url
|
| 5 |
+
|
| 6 |
+
router = APIRouter(prefix="/api", tags=["offer"])
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@router.post("/scrape-offer", response_model=Offer)
async def scrape_offer(req: OfferScrapeRequest):
    """Build an ``Offer`` from pasted text or, failing that, from a URL.

    Pasted text takes precedence: when ``req.raw_text`` is set the URL is
    never fetched.

    Raises:
        HTTPException: 400 when neither ``url`` nor ``raw_text`` is given,
            400 when ``raw_text`` is longer than 10,000 characters, and
            422 when the URL scraper returns ``None``.
    """
    pasted = req.raw_text
    if pasted:
        if len(pasted) > 10_000:
            raise HTTPException(status_code=400, detail="Text too long (max 10,000 chars)")
        return parse_offer_text(pasted)

    # No usable text at this point, so a URL is mandatory.
    if not req.url:
        raise HTTPException(status_code=400, detail="Provide either url or raw_text")

    scraped = await scrape_offer_url(req.url)
    if scraped is not None:
        return scraped

    raise HTTPException(
        status_code=422,
        detail="Could not scrape this URL. Please paste the job description text instead.",
    )
|
app/services/offer_scraper.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
import httpx
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
|
| 6 |
+
from app.models import Offer, OfferRequirement
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
async def _reject_non_public_target(request: httpx.Request) -> None:
    """httpx request hook that refuses non-public or non-http(s) targets.

    The URL comes straight from an API caller, so without this check the
    server could be used to reach loopback, private-network or link-local
    services (SSRF). Registered as a ``request`` event hook so that every
    request the client prepares, redirect hops included, is checked.

    Raises:
        httpx.InvalidURL: if the scheme is not http/https, the host is
            missing or cannot be resolved, or any resolved address is not
            globally routable.
    """
    import asyncio
    import ipaddress
    import socket

    target = request.url
    if target.scheme not in ("http", "https") or not target.host:
        raise httpx.InvalidURL("Only http(s) URLs with a host can be scraped")

    try:
        resolved = await asyncio.get_running_loop().getaddrinfo(
            target.host, None, type=socket.SOCK_STREAM
        )
    except (OSError, UnicodeError) as exc:
        raise httpx.InvalidURL(f"Cannot resolve host {target.host!r}") from exc

    if not resolved:
        raise httpx.InvalidURL(f"Cannot resolve host {target.host!r}")

    for *_, sockaddr in resolved:
        # Drop an IPv6 zone id ("fe80::1%eth0") before parsing the address.
        address = ipaddress.ip_address(sockaddr[0].split("%", 1)[0])
        if not address.is_global:
            raise httpx.InvalidURL("Refusing to scrape a non-public address")


async def scrape_offer_url(url: str) -> Offer | None:
    """Fetch a job posting page and parse its visible text into an ``Offer``.

    Args:
        url: Address of the job posting. Only http(s) URLs whose host
            resolves exclusively to public addresses are fetched.

    Returns:
        The parsed ``Offer``, or ``None`` when the URL is rejected, the
        request fails, the server answers with an error status, or fewer
        than 50 characters of text could be extracted.
    """
    try:
        async with httpx.AsyncClient(
            timeout=10.0,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; BoredCV/1.0)"},
            # NOTE(review): best-effort SSRF guard; the connection resolves the
            # name again, so DNS rebinding is not fully excluded.
            event_hooks={"request": [_reject_non_public_target]},
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL):
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    # Strip page chrome and non-content tags before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    # Prefer the most specific content container available.
    main = (
        soup.find("main")
        or soup.find("article")
        or soup.find("div", class_=re.compile("job|posting|description"))
    )
    text = (main or soup.body or soup).get_text(separator="\n", strip=True)
    # Same 10,000-character cap the API applies to pasted text.
    text = text[:10_000]

    # Too little text means there is nothing useful to parse.
    if len(text) < 50:
        return None

    return parse_offer_text(text)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Substrings (matched case-insensitively) that mark a section heading.
_REQUIRED_SECTION_KEYWORDS = (
    "requirement",
    "qualif",
    "must have",
    "you have",
    "what we need",
    "profil recherché",
    "compétences requises",
)
_NICE_TO_HAVE_SECTION_KEYWORDS = ("nice to have", "bonus", "plus", "atout", "idéalement")
_BULLET_MARKERS = ("-", "•", "*")


def parse_offer_text(raw: str) -> Offer:
    """Heuristically turn free-form job posting text into an ``Offer``.

    The first non-empty line is used as the title when it is shorter than
    120 characters; a ``"Title - Company"`` or ``"Title at Company"`` form
    is split into title and company. Bulleted lines that follow a
    recognised section heading are collected as required or nice-to-have
    items.

    Args:
        raw: Job posting text, one logical item per line.

    Returns:
        An ``Offer`` whose description is the non-empty lines joined by
        newlines. If ``raw`` has no non-empty line, title and company are
        empty and the description is ``raw`` unchanged.
    """
    lines = [line.strip() for line in raw.strip().split("\n") if line.strip()]
    if not lines:
        return Offer(title="", company="", description=raw)

    title = lines[0] if len(lines[0]) < 120 else ""
    company = ""

    if " - " in title:
        title_part, company_part = title.split(" - ", 1)
        title = title_part.strip()
        company = company_part.strip()
    elif " at " in title.lower():
        parts = re.split(r"\s+at\s+", title, flags=re.IGNORECASE)
        # Only split when " at " appears exactly once; otherwise ambiguous.
        if len(parts) == 2:
            title = parts[0].strip()
            company = parts[1].strip()

    requirements: list[OfferRequirement] = []
    nice_to_have: list[OfferRequirement] = []
    current_list: list[OfferRequirement] | None = None

    for line in lines:
        # Classify bullets first: a bullet is an item, never a heading, so a
        # keyword inside it (e.g. "- Must have ...") must not drop the item
        # or switch the active section.
        if line.startswith(_BULLET_MARKERS):
            if current_list is None:
                continue
            text = line.lstrip("-•* ").strip()
            if text:
                category = "nice_to_have" if current_list is nice_to_have else "required"
                current_list.append(OfferRequirement(text=text, category=category))
            continue

        lower = line.lower()
        if any(kw in lower for kw in _REQUIRED_SECTION_KEYWORDS):
            current_list = requirements
        elif any(kw in lower for kw in _NICE_TO_HAVE_SECTION_KEYWORDS):
            current_list = nice_to_have

    description = "\n".join(lines)

    return Offer(
        title=title,
        company=company,
        description=description,
        requirements=requirements,
        nice_to_have=nice_to_have,
    )
|
tests/test_offer_scraper.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from app.services.offer_scraper import parse_offer_text, scrape_offer_url
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def test_parse_offer_text_extracts_structure():
    """A well-formed posting yields title, company and both bullet sections."""
    raw = """
    Senior Product Manager - TechCorp
    Paris, France

    About the role:
    We're looking for a Senior PM to lead our B2B platform.

    Requirements:
    - 5+ years product management experience
    - Experience with B2B SaaS
    - Strong SQL skills
    - Team leadership experience

    Nice to have:
    - Experience with AI/ML products
    - MBA or equivalent
    """
    result = parse_offer_text(raw)

    # Pin exact values so a parsing regression cannot pass unnoticed.
    assert result.title == "Senior Product Manager"
    assert result.company == "TechCorp"
    assert [item.text for item in result.requirements] == [
        "5+ years product management experience",
        "Experience with B2B SaaS",
        "Strong SQL skills",
        "Team leadership experience",
    ]
    assert [item.text for item in result.nice_to_have] == [
        "Experience with AI/ML products",
        "MBA or equivalent",
    ]
    assert result.description != ""
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_parse_offer_text_handles_minimal_input():
    """A single sentence with no sections still produces a description."""
    sentence = "We need a developer with Python and React skills."
    parsed = parse_offer_text(sentence)
    assert parsed.description != ""
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@pytest.mark.asyncio
async def test_scrape_offer_url_returns_none_on_bad_url():
    """An unresolvable host makes the scraper return ``None``."""
    # ".invalid" is a reserved TLD that never resolves, so the outcome does
    # not depend on whether someone registers a look-alike ".com" domain.
    result = await scrape_offer_url("https://this-does-not-exist-12345.invalid/job")
    assert result is None
|