Spaces:

Aramente
/

bored-cv-api

Running

App Files Files Community

bored-cv-api / app /services /pdf_parser.py

Aramente

fix: correct mistralai import path — Mistral is in mistralai.client

a235542 about 1 month ago

raw

history blame contribute delete

5.62 kB

	import io
	import json
	import os

	from mistralai.client import Mistral
	import pdfplumber

	from app.models import Education, Experience, Profile


	def extract_pdf_text(pdf_bytes: bytes) -> str:
	"""Extract raw text from a PDF file."""
	with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
	pages = []
	for page in pdf.pages:
	text = page.extract_text()
	if text:
	pages.append(text)
	return "\n\n".join(pages)


	def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
	"""Parse a LinkedIn PDF export. Tries fast heuristic first, falls back to LLM."""
	raw_text = extract_pdf_text(pdf_bytes)

	if len(raw_text.strip()) < 50:
	return Profile(name="", title="")

	# Try instant heuristic parser first (0ms vs 5-20s for LLM)
	from app.services.linkedin_parser import parse_linkedin_heuristic
	result = parse_linkedin_heuristic(raw_text)
	if result:
	return result

	# Fallback to LLM for non-LinkedIn PDFs or weird formats
	api_key = os.environ.get("MISTRAL_API_KEY", "")
	if not api_key:
	return _fallback_parse(raw_text)

	client = Mistral(api_key=api_key)

	prompt = f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.

	RAW TEXT:
	{raw_text[:8000]}

	Return valid JSON only:
	{{
	"name": "full name (just the person's name, not 'Coordonnées' or other labels)",
	"title": "current or most recent job title",
	"location": "city, country",
	"email": "email if found, otherwise empty string",
	"phone": "phone number if found, otherwise empty string",
	"linkedin": "LinkedIn URL if found, otherwise empty string",
	"summary": "professional summary/about section if found, otherwise empty string",
	"experiences": [
	{{
	"title": "job title",
	"company": "company name",
	"dates": "date range as written",
	"description": "role description as a single paragraph",
	"bullets": ["key achievement or responsibility 1", "key achievement 2"]
	}}
	],
	"education": [
	{{
	"degree": "degree name",
	"school": "school name",
	"year": "graduation year or date range"
	}}
	],
	"skills": ["skill1", "skill2"],
	"languages": ["French (Native)", "English (Fluent)", "Spanish (Professional)"]
	}}

	IMPORTANT:
	- Extract ALL experiences from all pages, not just the first few
	- Keep the original language of descriptions (don't translate)
	- For bullets, split multi-sentence descriptions into separate items
	- If a field is not found, use empty string or empty array
	- The name is usually on the first line, possibly after 'Coordonnées' or 'Contact'
	- Languages section: extract with proficiency level (Native, Fluent, Professional, etc.)
	- Phone number: often near the top, may start with + or country code"""

	try:
	response = client.chat.complete(
	model="mistral-small-latest",
	messages=[{"role": "user", "content": prompt}],
	response_format={"type": "json_object"},
	max_tokens=4000,
	temperature=0.1,
	)
	raw_resp = response.choices[0].message.content.strip()
	# Find the JSON object in the response
	start = raw_resp.find("{")
	end = raw_resp.rfind("}") + 1
	if start == -1 or end == 0:
	raise ValueError("No JSON found in response")
	json_str = raw_resp[start:end]
	# Fix common issues
	import re
	json_str = re.sub(r",\s*([}\]])", r"\1", json_str) # trailing commas
	data = json.loads(json_str)

	def s(val: str \| None) -> str:
	"""Safe string — convert None/null to empty string."""
	return str(val) if val else ""

	def sl(val: list \| None) -> list:
	"""Safe list — convert None to empty list."""
	return val if isinstance(val, list) else []

	return Profile(
	name=s(data.get("name")),
	title=s(data.get("title")),
	location=s(data.get("location")),
	email=s(data.get("email")),
	phone=s(data.get("phone")),
	linkedin=s(data.get("linkedin")),
	summary=s(data.get("summary")),
	experiences=[
	Experience(
	title=s(exp.get("title")),
	company=s(exp.get("company")),
	dates=s(exp.get("dates")),
	description=s(exp.get("description")),
	bullets=sl(exp.get("bullets")),
	)
	for exp in sl(data.get("experiences"))
	],
	education=[
	Education(
	degree=s(edu.get("degree")),
	school=s(edu.get("school")),
	year=s(edu.get("year")),
	)
	for edu in sl(data.get("education"))
	],
	skills=sl(data.get("skills")),
	languages=sl(data.get("languages")),
	)
	except Exception as e:
	import traceback
	traceback.print_exc()
	return _fallback_parse(raw_text)


	def _fallback_parse(raw_text: str) -> Profile:
	"""Basic fallback if LLM is unavailable."""
	lines = [line.strip() for line in raw_text.split("\n") if line.strip()]
	name = lines[0] if lines else ""
	for prefix in ["Coordonnées ", "Contact "]:
	if name.startswith(prefix):
	name = name[len(prefix):]
	return Profile(name=name, title=lines[1] if len(lines) > 1 else "")