Spaces:

Aramente
/

bored-cv-api

Running

App Files Files Community

Aramente Claude Sonnet 4.6 committed on Apr 13

Commit

37224d4

1 Parent(s): 5e04dcc

feat: job offer scraper — URL scraping with text fallback

Browse files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (4) hide show

app/main.py +2 -1
app/routers/offer.py +25 -0
app/services/offer_scraper.py +77 -0
tests/test_offer_scraper.py +39 -0

app/main.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from app.routers import linkedin
 app = FastAPI(title="Bored CV API", version="0.1.0")
@@ -17,6 +17,7 @@ app.add_middleware(
 )
 app.include_router(linkedin.router)
 @app.get("/api/health")

 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from app.routers import linkedin, offer
 app = FastAPI(title="Bored CV API", version="0.1.0")
 )
 app.include_router(linkedin.router)
+app.include_router(offer.router)
 @app.get("/api/health")

app/routers/offer.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from fastapi import APIRouter, HTTPException
+from app.models import Offer, OfferScrapeRequest
+from app.services.offer_scraper import parse_offer_text, scrape_offer_url
+router = APIRouter(prefix="/api", tags=["offer"])
+
+# Longest pasted job description accepted by the endpoint, in characters.
+_MAX_RAW_TEXT_CHARS = 10_000
+
+
+@router.post("/scrape-offer", response_model=Offer)
+async def scrape_offer(req: OfferScrapeRequest):
+    """Build an Offer from pasted text or, failing that, from a scraped URL.
+
+    Pasted text takes precedence over the URL. Text consisting only of
+    whitespace is treated as absent, so a request carrying a blank
+    ``raw_text`` and a valid ``url`` is still scraped instead of yielding an
+    empty offer.
+
+    Raises:
+        HTTPException: 400 when neither usable text nor a URL is supplied, or
+            when the text exceeds the length limit; 422 when the URL could
+            not be scraped.
+    """
+    raw_text = (req.raw_text or "").strip()
+    if not req.url and not raw_text:
+        raise HTTPException(status_code=400, detail="Provide either url or raw_text")
+
+    if raw_text:
+        if len(raw_text) > _MAX_RAW_TEXT_CHARS:
+            raise HTTPException(status_code=400, detail="Text too long (max 10,000 chars)")
+        return parse_offer_text(raw_text)
+
+    offer = await scrape_offer_url(req.url)
+    if offer is None:
+        raise HTTPException(
+            status_code=422,
+            detail="Could not scrape this URL. Please paste the job description text instead.",
+        )
+    return offer

app/services/offer_scraper.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import asyncio
+import ipaddress
+import re
+from urllib.parse import urlsplit
+
+import httpx
+from bs4 import BeautifulSoup
+
+from app.models import Offer, OfferRequirement
+# Maximum number of redirect hops followed; every hop is re-validated.
+_MAX_REDIRECTS = 5
+
+
+async def _is_public_http_url(url: str) -> bool:
+    """Return True if *url* is http(s) and its host resolves only to public IPs."""
+    try:
+        parts = urlsplit(url)
+        port = parts.port  # raises ValueError for a malformed port
+    except ValueError:
+        return False
+    if parts.scheme not in ("http", "https") or not parts.hostname:
+        return False
+    default_port = 443 if parts.scheme == "https" else 80
+    try:
+        infos = await asyncio.get_running_loop().getaddrinfo(
+            parts.hostname, port or default_port
+        )
+    except (OSError, UnicodeError):
+        # Unresolvable or un-encodable host name.
+        return False
+    try:
+        # Reject loopback, private, link-local and other non-global targets.
+        return bool(infos) and all(
+            ipaddress.ip_address(info[4][0]).is_global for info in infos
+        )
+    except ValueError:
+        return False
+
+
+async def scrape_offer_url(url: str) -> Offer | None:
+    """Fetch a job posting page and parse its visible text into an Offer.
+
+    The URL comes from the API caller, so it is treated as untrusted: only
+    http(s) URLs whose host resolves exclusively to public addresses are
+    fetched, and redirects are followed manually so that every hop gets the
+    same check.
+
+    NOTE: the host is resolved again when the request is actually sent, so
+    this check narrows but does not fully close a DNS-rebinding window.
+
+    Returns:
+        The parsed Offer, or None when the URL is rejected, the request
+        fails, the response is an error status, there are too many
+        redirects, or fewer than 50 characters of text are extracted.
+    """
+    try:
+        async with httpx.AsyncClient(
+            timeout=10.0,
+            follow_redirects=False,
+            headers={"User-Agent": "Mozilla/5.0 (compatible; BoredCV/1.0)"},
+        ) as client:
+            target = url
+            for _ in range(_MAX_REDIRECTS + 1):
+                if not await _is_public_http_url(target):
+                    return None
+                resp = await client.get(target)
+                if resp.next_request is None:
+                    break
+                target = str(resp.next_request.url)
+            else:
+                # Still being redirected after the allowed number of hops.
+                return None
+            resp.raise_for_status()
+    except (httpx.HTTPError, httpx.InvalidURL):
+        return None
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    # Drop page chrome and non-content tags before extracting text.
+    for tag in soup(["script", "style", "nav", "footer", "header"]):
+        tag.decompose()
+
+    main = (
+        soup.find("main")
+        or soup.find("article")
+        or soup.find("div", class_=re.compile("job|posting|description"))
+    )
+    text = (main or soup.body or soup).get_text(separator="\n", strip=True)
+    text = text[:10_000]
+    if len(text) < 50:
+        # Too little text to be a real job description.
+        return None
+    return parse_offer_text(text)
+# Substrings (compared against the lower-cased line) that open a section.
+_REQUIRED_HEADERS = (
+    "requirement", "qualif", "must have", "you have", "what we need",
+    "profil recherché", "compétences requises",
+)
+_NICE_TO_HAVE_HEADERS = ("nice to have", "bonus", "plus", "atout", "idéalement")
+_BULLET_MARKERS = ("-", "•", "*")
+# "Title - Company", also with an en dash or em dash as the separator.
+_TITLE_DASH_RE = re.compile(r"\s+[-–—]\s+")
+_TITLE_AT_RE = re.compile(r"\s+at\s+", re.IGNORECASE)
+
+
+def _split_title(first_line: str) -> tuple[str, str]:
+    """Split the first line into (title, company); company may be empty."""
+    if len(first_line) >= 120:
+        # Too long to be a heading; most likely body text.
+        return "", ""
+    parts = _TITLE_DASH_RE.split(first_line, maxsplit=1)
+    if len(parts) == 2:
+        return parts[0].strip(), parts[1].strip()
+    parts = _TITLE_AT_RE.split(first_line)
+    if len(parts) == 2:
+        # Only split on an unambiguous single "at".
+        return parts[0].strip(), parts[1].strip()
+    return first_line, ""
+
+
+def _may_be_header(line: str) -> bool:
+    """Return True if *line* is eligible for section-header keyword matching.
+
+    Ordinary bullet items are excluded so that an item such as
+    "- 5 plus years of experience" is kept as an item instead of being read
+    as a "nice to have" header. A bullet-looking line still counts when it
+    is formatted like a heading (markdown bold ``**...`` or a trailing colon).
+    """
+    if not line.startswith(_BULLET_MARKERS):
+        return True
+    return line.startswith("**") or line.rstrip("* ").endswith(":")
+
+
+def parse_offer_text(raw: str) -> Offer:
+    """Heuristically extract title, company and requirement lists from text.
+
+    The first non-empty line supplies the title (and the company when it
+    looks like "Title - Company" or "Title at Company"). Bullet lines
+    following a recognised section header are collected as required or
+    nice-to-have items. The description is every non-empty line, stripped
+    and joined with newlines.
+    """
+    lines = [line.strip() for line in raw.strip().split("\n") if line.strip()]
+    if not lines:
+        return Offer(title="", company="", description=raw)
+
+    title, company = _split_title(lines[0])
+
+    requirements: list[OfferRequirement] = []
+    nice_to_have: list[OfferRequirement] = []
+    current_list = None
+    for line in lines:
+        if _may_be_header(line):
+            lower = line.lower()
+            if any(kw in lower for kw in _REQUIRED_HEADERS):
+                current_list = requirements
+                continue
+            if any(kw in lower for kw in _NICE_TO_HAVE_HEADERS):
+                current_list = nice_to_have
+                continue
+        if current_list is not None and line.startswith(_BULLET_MARKERS):
+            text = line.lstrip("-•* ").strip()
+            if text:
+                category = "nice_to_have" if current_list is nice_to_have else "required"
+                current_list.append(OfferRequirement(text=text, category=category))
+
+    description = "\n".join(lines)
+    return Offer(
+        title=title, company=company, description=description,
+        requirements=requirements, nice_to_have=nice_to_have,
+    )

tests/test_offer_scraper.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import pytest
+from app.services.offer_scraper import parse_offer_text, scrape_offer_url
+def test_parse_offer_text_extracts_structure():
+    """A well-formed posting yields exact title, company and both item lists."""
+    raw = """
+    Senior Product Manager - TechCorp
+    Paris, France
+    About the role:
+    We're looking for a Senior PM to lead our B2B platform.
+    Requirements:
+    - 5+ years product management experience
+    - Experience with B2B SaaS
+    - Strong SQL skills
+    - Team leadership experience
+    Nice to have:
+    - Experience with AI/ML products
+    - MBA or equivalent
+    """
+    result = parse_offer_text(raw)
+    assert result.title == "Senior Product Manager"
+    assert result.company == "TechCorp"
+    assert [req.text for req in result.requirements] == [
+        "5+ years product management experience",
+        "Experience with B2B SaaS",
+        "Strong SQL skills",
+        "Team leadership experience",
+    ]
+    assert [req.text for req in result.nice_to_have] == [
+        "Experience with AI/ML products",
+        "MBA or equivalent",
+    ]
+    assert result.description.startswith("Senior Product Manager - TechCorp")
+def test_parse_offer_text_handles_minimal_input():
+    """A single sentence is kept as the description and yields no items."""
+    raw = "We need a developer with Python and React skills."
+    result = parse_offer_text(raw)
+    assert result.description == raw
+    assert result.company == ""
+    assert len(result.requirements) == 0
+    assert len(result.nice_to_have) == 0
+@pytest.mark.asyncio
+async def test_scrape_offer_url_returns_none_on_bad_url():
+    """An unresolvable host makes the scraper return None instead of raising."""
+    # ".invalid" is a reserved TLD that never resolves, unlike a ".com" name
+    # that someone could register later.
+    result = await scrape_offer_url("https://this-does-not-exist-12345.invalid/job")
+    assert result is None