Aramente Claude Sonnet 4.6 committed on
Commit
37224d4
·
1 Parent(s): 5e04dcc

feat: job offer scraper — URL scraping with text fallback

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

app/main.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI
2
  from fastapi.middleware.cors import CORSMiddleware
3
 
4
- from app.routers import linkedin
5
 
6
  app = FastAPI(title="Bored CV API", version="0.1.0")
7
 
@@ -17,6 +17,7 @@ app.add_middleware(
17
  )
18
 
19
  app.include_router(linkedin.router)
 
20
 
21
 
22
  @app.get("/api/health")
 
1
  from fastapi import FastAPI
2
  from fastapi.middleware.cors import CORSMiddleware
3
 
4
+ from app.routers import linkedin, offer
5
 
6
  app = FastAPI(title="Bored CV API", version="0.1.0")
7
 
 
17
  )
18
 
19
  app.include_router(linkedin.router)
20
+ app.include_router(offer.router)
21
 
22
 
23
  @app.get("/api/health")
app/routers/offer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+
3
+ from app.models import Offer, OfferScrapeRequest
4
+ from app.services.offer_scraper import parse_offer_text, scrape_offer_url
5
+
6
# All offer endpoints are mounted under the /api prefix.
router = APIRouter(prefix="/api", tags=["offer"])


@router.post("/scrape-offer", response_model=Offer)
async def scrape_offer(req: OfferScrapeRequest):
    """Turn a pasted job description or a job-posting URL into an ``Offer``.

    Pasted text wins over the URL when both are supplied; the URL is only
    fetched when no text was provided.

    Raises:
        HTTPException: 400 when neither ``url`` nor ``raw_text`` is given, or
            when ``raw_text`` exceeds 10,000 characters; 422 when the URL
            could not be scraped.
    """
    pasted_text = req.raw_text

    # Text path: no network access needed, just parse what the user pasted.
    if pasted_text:
        if len(pasted_text) > 10_000:
            raise HTTPException(status_code=400, detail="Text too long (max 10,000 chars)")
        return parse_offer_text(pasted_text)

    if not req.url:
        raise HTTPException(status_code=400, detail="Provide either url or raw_text")

    # URL path: the scraper signals failure by returning None.
    scraped = await scrape_offer_url(req.url)
    if scraped is not None:
        return scraped

    raise HTTPException(
        status_code=422,
        detail="Could not scrape this URL. Please paste the job description text instead.",
    )
app/services/offer_scraper.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ import httpx
4
+ from bs4 import BeautifulSoup
5
+
6
+ from app.models import Offer, OfferRequirement
7
+
8
+
9
# Matches container class names such as "job-description" or "JobPosting".
_JOB_CONTAINER_CLASS = re.compile(r"job|posting|description", re.IGNORECASE)


def _is_public_http_url(url: str) -> bool:
    """Return True when *url* is an http(s) URL not obviously aimed at a local host."""
    import ipaddress
    from urllib.parse import urlsplit

    if not url:
        return False
    try:
        parts = urlsplit(url)
        host = parts.hostname
    except ValueError:
        # urlsplit rejects malformed netlocs (e.g. unbalanced IPv6 brackets).
        return False

    if parts.scheme not in ("http", "https") or not host:
        return False
    if host == "localhost" or host.endswith(".localhost"):
        return False

    try:
        address = ipaddress.ip_address(host)
    except ValueError:
        # Not an IP literal, so it is a DNS name; allow it.
        return True
    # Rejects loopback, private, link-local (cloud metadata), reserved, etc.
    return address.is_global


async def scrape_offer_url(url: str) -> Offer | None:
    """Fetch a job-posting page and parse its visible text into an ``Offer``.

    The URL comes from an API client, so it is validated before any request
    is made: only ``http``/``https`` URLs are fetched, and ``localhost`` or
    literal non-public IP addresses are refused.

    NOTE(review): this check does not cover DNS names that resolve to
    internal addresses, nor redirects to internal hosts (redirects are
    followed). Deployments exposed to untrusted users should add
    network-level egress restrictions as well.

    Args:
        url: Address of the job posting to download.

    Returns:
        The parsed offer, or ``None`` when the URL is rejected, the request
        fails or returns an error status, or fewer than 50 characters of
        text could be extracted.
    """
    if not _is_public_http_url(url):
        return None

    try:
        async with httpx.AsyncClient(
            timeout=10.0,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; BoredCV/1.0)"},
        ) as client:
            resp = await client.get(url)
            resp.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL):
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop page chrome and non-content elements before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    # Prefer the most specific content container available.
    main = (
        soup.find("main")
        or soup.find("article")
        or soup.find("div", class_=_JOB_CONTAINER_CLASS)
    )
    text = (main or soup.body or soup).get_text(separator="\n", strip=True)
    text = text[:10_000]

    # Too little text is treated as a failed scrape.
    if len(text) < 50:
        return None

    return parse_offer_text(text)
33
+
34
+
35
# Substrings (matched case-insensitively) that mark the start of a section.
_REQUIRED_HEADINGS = (
    "requirement", "qualif", "must have", "you have", "what we need",
    "profil recherché", "compétences requises",
)
_NICE_TO_HAVE_HEADINGS = ("nice to have", "bonus", "plus", "atout", "idéalement")
_BULLET_PREFIXES = ("-", "•", "*")


def parse_offer_text(raw: str) -> Offer:
    """Heuristically structure a plain-text job description.

    The first non-empty line is used as the title when it is shorter than
    120 characters; a ``"Title - Company"`` or ``"Title at Company"`` form is
    split into title and company. Bullet lines (starting with ``-``, ``•`` or
    ``*``) that follow a recognised section heading are collected as
    requirements or nice-to-haves.

    Bullet lines are never interpreted as headings. Previously a bullet such
    as "- Gather requirements from stakeholders" or "- Cloud certification
    is a plus" matched a heading keyword, was dropped, and could switch the
    active section.

    Args:
        raw: Free-form job description text.

    Returns:
        An ``Offer`` whose ``description`` is the input with blank lines
        removed and each line stripped (or ``raw`` unchanged when it contains
        no non-blank line).
    """
    lines = [line.strip() for line in raw.strip().split("\n") if line.strip()]
    if not lines:
        return Offer(title="", company="", description=raw)

    # An overly long first line is probably a paragraph, not a title.
    title = lines[0] if len(lines[0]) < 120 else ""
    company = ""

    if " - " in title:
        title, company = (part.strip() for part in title.split(" - ", 1))
    elif " at " in title.lower():
        parts = re.split(r"\s+at\s+", title, flags=re.IGNORECASE)
        # Only split when "at" occurs exactly once; otherwise it is ambiguous.
        if len(parts) == 2:
            title = parts[0].strip()
            company = parts[1].strip()

    requirements: list[OfferRequirement] = []
    nice_to_have: list[OfferRequirement] = []
    current_list = None  # section the next bullets belong to, if any

    for line in lines:
        # Classify bullets first so their wording cannot be taken for a heading.
        if line.startswith(_BULLET_PREFIXES):
            if current_list is not None:
                text = line.lstrip("-•* ").strip()
                if text:
                    category = "nice_to_have" if current_list is nice_to_have else "required"
                    current_list.append(OfferRequirement(text=text, category=category))
            continue

        lower = line.lower()
        if any(kw in lower for kw in _REQUIRED_HEADINGS):
            current_list = requirements
        elif any(kw in lower for kw in _NICE_TO_HAVE_HEADINGS):
            current_list = nice_to_have

    description = "\n".join(lines)

    return Offer(
        title=title, company=company, description=description,
        requirements=requirements, nice_to_have=nice_to_have,
    )
tests/test_offer_scraper.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from app.services.offer_scraper import parse_offer_text, scrape_offer_url
3
+
4
+
5
def test_parse_offer_text_extracts_structure():
    """A well-formed posting yields the exact title, company and bullet lists."""
    raw = """
    Senior Product Manager - TechCorp
    Paris, France

    About the role:
    We're looking for a Senior PM to lead our B2B platform.

    Requirements:
    - 5+ years product management experience
    - Experience with B2B SaaS
    - Strong SQL skills
    - Team leadership experience

    Nice to have:
    - Experience with AI/ML products
    - MBA or equivalent
    """
    result = parse_offer_text(raw)

    # Pin exact values: "!= ''" checks would pass even on a wrong split.
    assert result.title == "Senior Product Manager"
    assert result.company == "TechCorp"
    assert [r.text for r in result.requirements] == [
        "5+ years product management experience",
        "Experience with B2B SaaS",
        "Strong SQL skills",
        "Team leadership experience",
    ]
    assert all(r.category == "required" for r in result.requirements)
    assert [r.text for r in result.nice_to_have] == [
        "Experience with AI/ML products",
        "MBA or equivalent",
    ]
    assert all(r.category == "nice_to_have" for r in result.nice_to_have)
    assert result.description.startswith("Senior Product Manager - TechCorp")
    assert "Requirements:" in result.description
28
+
29
+
30
def test_parse_offer_text_handles_minimal_input():
    """A single sentence is kept as the description and yields no bullet items."""
    raw = "We need a developer with Python and React skills."
    result = parse_offer_text(raw)
    assert result.description == raw
    # No section heading and no bullets, so both lists must stay empty.
    assert result.requirements == []
    assert result.nice_to_have == []
34
+
35
+
36
@pytest.mark.asyncio
async def test_scrape_offer_url_returns_none_on_bad_url():
    """An unresolvable host makes the scraper return None instead of raising."""
    # ".invalid" is a reserved TLD that is guaranteed never to resolve, unlike
    # an arbitrary ".com" name that somebody could register later.
    result = await scrape_offer_url("https://this-does-not-exist.invalid/job")
    assert result is None