# File: bored-cv-api/app/services/pdf_parser.py
# Aramente's picture
# Commit message: fix: correct mistralai import path — Mistral is in mistralai.client
# Commit: a235542
import io
import json
import logging
import os
import re

import pdfplumber
from mistralai.client import Mistral

from app.models import Education, Experience, Profile
def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    Args:
        pdf_bytes: The raw bytes of the PDF document.

    Returns:
        The per-page text separated by ``"\\n\\n"``. Pages for which
        ``extract_text()`` yields nothing (``None`` or an empty string) are
        left out, so the result is ``""`` when no page has text.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Materialize while the document is still open.
        non_empty = [chunk for chunk in page_texts if chunk]
    return "\n\n".join(non_empty)
logger = logging.getLogger(__name__)

# A comma followed only by whitespace before a closing brace/bracket.
_TRAILING_COMMA_RE = re.compile(r",\s*([}\]])")


def _build_extraction_prompt(raw_text: str) -> str:
    """Return the LLM prompt asking for a JSON profile built from *raw_text*.

    Only the first 8000 characters of the text are embedded in the prompt.
    """
    return f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
RAW TEXT:
{raw_text[:8000]}
Return valid JSON only:
{{
"name": "full name (just the person's name, not 'Coordonnées' or other labels)",
"title": "current or most recent job title",
"location": "city, country",
"email": "email if found, otherwise empty string",
"phone": "phone number if found, otherwise empty string",
"linkedin": "LinkedIn URL if found, otherwise empty string",
"summary": "professional summary/about section if found, otherwise empty string",
"experiences": [
{{
"title": "job title",
"company": "company name",
"dates": "date range as written",
"description": "role description as a single paragraph",
"bullets": ["key achievement or responsibility 1", "key achievement 2"]
}}
],
"education": [
{{
"degree": "degree name",
"school": "school name",
"year": "graduation year or date range"
}}
],
"skills": ["skill1", "skill2"],
"languages": ["French (Native)", "English (Fluent)", "Spanish (Professional)"]
}}
IMPORTANT:
- Extract ALL experiences from all pages, not just the first few
- Keep the original language of descriptions (don't translate)
- For bullets, split multi-sentence descriptions into separate items
- If a field is not found, use empty string or empty array
- The name is usually on the first line, possibly after 'Coordonnées' or 'Contact'
- Languages section: extract with proficiency level (Native, Fluent, Professional, etc.)
- Phone number: often near the top, may start with + or country code"""


def _parse_llm_json(raw_resp: str) -> dict:
    """Decode the JSON object contained in an LLM reply.

    The span from the first ``{`` to the last ``}`` is parsed as-is first.
    Trailing commas are stripped only when that strict parse fails, so a
    valid payload whose string values happen to contain ``,}`` or ``, ]`` is
    never rewritten.

    Raises:
        ValueError: If the reply has no brace-delimited span, or the span is
            still invalid after the repair (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    start = raw_resp.find("{")
    end = raw_resp.rfind("}") + 1
    if start == -1 or end == 0:
        raise ValueError("No JSON found in response")
    json_str = raw_resp[start:end]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Common LLM slip: a trailing comma before a closing brace/bracket.
        return json.loads(_TRAILING_COMMA_RE.sub(r"\1", json_str))


def _safe_str(val: object) -> str:
    """Return ``str(val)``, or an empty string for any falsy value."""
    return str(val) if val else ""


def _safe_list(val: object) -> list:
    """Return *val* if it is a list, otherwise an empty list."""
    return val if isinstance(val, list) else []


def _dict_entries(val: object) -> list[dict]:
    """Return the dict items of *val*, dropping anything that is not a dict."""
    return [entry for entry in _safe_list(val) if isinstance(entry, dict)]


def _profile_from_data(data: dict) -> Profile:
    """Map the decoded LLM payload onto the Profile model."""
    return Profile(
        name=_safe_str(data.get("name")),
        title=_safe_str(data.get("title")),
        location=_safe_str(data.get("location")),
        email=_safe_str(data.get("email")),
        phone=_safe_str(data.get("phone")),
        linkedin=_safe_str(data.get("linkedin")),
        summary=_safe_str(data.get("summary")),
        experiences=[
            Experience(
                title=_safe_str(exp.get("title")),
                company=_safe_str(exp.get("company")),
                dates=_safe_str(exp.get("dates")),
                description=_safe_str(exp.get("description")),
                bullets=_safe_list(exp.get("bullets")),
            )
            # A stray non-dict entry is skipped instead of failing the parse.
            for exp in _dict_entries(data.get("experiences"))
        ],
        education=[
            Education(
                degree=_safe_str(edu.get("degree")),
                school=_safe_str(edu.get("school")),
                year=_safe_str(edu.get("year")),
            )
            for edu in _dict_entries(data.get("education"))
        ],
        skills=_safe_list(data.get("skills")),
        languages=_safe_list(data.get("languages")),
    )


def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
    """Parse a LinkedIn PDF export into a Profile.

    Strategy, in order:

    1. If the extracted text has fewer than 50 non-padding characters, return
       an empty Profile.
    2. Try the heuristic parser and return its result when it is truthy.
    3. Without a ``MISTRAL_API_KEY`` environment variable, use the line-based
       fallback parser.
    4. Otherwise ask the Mistral LLM for a JSON profile; on any failure in
       the request or in decoding the reply, log the exception and use the
       fallback parser.

    Args:
        pdf_bytes: The raw bytes of the PDF document.

    Returns:
        The parsed Profile; never raises for LLM request/decoding errors.
    """
    raw_text = extract_pdf_text(pdf_bytes)
    if len(raw_text.strip()) < 50:
        return Profile(name="", title="")

    # Try instant heuristic parser first (0ms vs 5-20s for LLM)
    from app.services.linkedin_parser import parse_linkedin_heuristic

    result = parse_linkedin_heuristic(raw_text)
    if result:
        return result

    # Fallback to LLM for non-LinkedIn PDFs or weird formats
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    if not api_key:
        return _fallback_parse(raw_text)

    client = Mistral(api_key=api_key)
    prompt = _build_extraction_prompt(raw_text)

    try:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            max_tokens=4000,
            temperature=0.1,
        )
        # A missing message body is treated like a reply without JSON.
        raw_resp = (response.choices[0].message.content or "").strip()
        return _profile_from_data(_parse_llm_json(raw_resp))
    except Exception:
        # Best-effort boundary: any LLM or decoding failure degrades to the
        # line-based fallback instead of failing the request.
        logger.exception("LLM profile extraction failed; using fallback parser")
        return _fallback_parse(raw_text)
def _fallback_parse(raw_text: str) -> Profile:
    """Build a minimal Profile from the first two non-blank lines of the text.

    The first non-blank line becomes the name, with a leading
    ``"Coordonnées "`` and/or ``"Contact "`` label removed; the second
    non-blank line becomes the title. Either is ``""`` when the line is
    missing. No other Profile field is set here.
    """
    stripped = (candidate.strip() for candidate in raw_text.split("\n"))
    non_blank = [entry for entry in stripped if entry]

    full_name = non_blank[0] if non_blank else ""
    # Labels are tried in order, so both can be removed from the same line.
    for label in ("Coordonnées ", "Contact "):
        full_name = full_name.removeprefix(label)

    headline = non_blank[1] if len(non_blank) > 1 else ""
    return Profile(name=full_name, title=headline)