Spaces:
Running
Running
File size: 5,616 Bytes
5e04dcc 3b5bee3 5e04dcc a235542 5e04dcc 3b5bee3 5e04dcc 3b5bee3 5e04dcc 3b5bee3 5e04dcc 3b5bee3 b10aabd 3b5bee3 5e04dcc b10aabd 8b18ee8 3b5bee3 8b18ee8 3b5bee3 e391cda 3b5bee3 e391cda 3b5bee3 e391cda 3b5bee3 8b18ee8 3b5bee3 8b18ee8 bc01feb 5a15c3e 3b5bee3 5a15c3e 3b5bee3 5a15c3e 3b5bee3 5a15c3e 3b5bee3 5a15c3e 3b5bee3 5a15c3e 3b5bee3 5a15c3e 5e04dcc 5a15c3e 3b5bee3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | import io
import json
import os
from mistralai.client import Mistral
import pdfplumber
from app.models import Education, Experience, Profile
def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Return the text of every page in the PDF, joined by blank lines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are left out of the result.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        page_texts = [
            content
            for page in document.pages
            if (content := page.extract_text())
        ]
    return "\n\n".join(page_texts)
def _build_llm_prompt(raw_text: str) -> str:
    """Build the extraction prompt, embedding at most the first 8000 characters of raw_text."""
    return f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
RAW TEXT:
{raw_text[:8000]}
Return valid JSON only:
{{
"name": "full name (just the person's name, not 'Coordonnées' or other labels)",
"title": "current or most recent job title",
"location": "city, country",
"email": "email if found, otherwise empty string",
"phone": "phone number if found, otherwise empty string",
"linkedin": "LinkedIn URL if found, otherwise empty string",
"summary": "professional summary/about section if found, otherwise empty string",
"experiences": [
{{
"title": "job title",
"company": "company name",
"dates": "date range as written",
"description": "role description as a single paragraph",
"bullets": ["key achievement or responsibility 1", "key achievement 2"]
}}
],
"education": [
{{
"degree": "degree name",
"school": "school name",
"year": "graduation year or date range"
}}
],
"skills": ["skill1", "skill2"],
"languages": ["French (Native)", "English (Fluent)", "Spanish (Professional)"]
}}
IMPORTANT:
- Extract ALL experiences from all pages, not just the first few
- Keep the original language of descriptions (don't translate)
- For bullets, split multi-sentence descriptions into separate items
- If a field is not found, use empty string or empty array
- The name is usually on the first line, possibly after 'Coordonnées' or 'Contact'
- Languages section: extract with proficiency level (Native, Fluent, Professional, etc.)
- Phone number: often near the top, may start with + or country code"""


def _load_llm_json(raw_resp: str) -> dict:
    """Decode the JSON object embedded in an LLM reply.

    The object is taken to span from the first ``{`` to the last ``}``.

    Raises:
        ValueError: if no such span exists, or if the span is not valid JSON
            even after trailing commas are removed (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
    """
    import re

    start = raw_resp.find("{")
    end = raw_resp.rfind("}") + 1
    # `end <= start` also covers a stray "}" that appears before the first "{".
    if start == -1 or end <= start:
        raise ValueError("No JSON found in response")
    json_str = raw_resp[start:end]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Only repair trailing commas when strict parsing fails: the regex
        # cannot tell structure from string content, so running it on valid
        # JSON could alter values that contain ",}" or ",]".
        return json.loads(re.sub(r",\s*([}\]])", r"\1", json_str))


def _profile_from_llm_data(data: dict) -> Profile:
    """Map the decoded LLM payload onto Profile / Experience / Education."""

    def safe_str(val: object) -> str:
        # Any falsy value (None, "", 0, ...) becomes an empty string.
        return str(val) if val else ""

    def safe_list(val: object) -> list:
        # Anything that is not a list (None, string, dict, ...) becomes [].
        return val if isinstance(val, list) else []

    def dict_items(val: object) -> list[dict]:
        # Skip malformed entries instead of letting one bad item raise and
        # discard the whole LLM result.
        return [item for item in safe_list(val) if isinstance(item, dict)]

    return Profile(
        name=safe_str(data.get("name")),
        title=safe_str(data.get("title")),
        location=safe_str(data.get("location")),
        email=safe_str(data.get("email")),
        phone=safe_str(data.get("phone")),
        linkedin=safe_str(data.get("linkedin")),
        summary=safe_str(data.get("summary")),
        experiences=[
            Experience(
                title=safe_str(exp.get("title")),
                company=safe_str(exp.get("company")),
                dates=safe_str(exp.get("dates")),
                description=safe_str(exp.get("description")),
                bullets=safe_list(exp.get("bullets")),
            )
            for exp in dict_items(data.get("experiences"))
        ],
        education=[
            Education(
                degree=safe_str(edu.get("degree")),
                school=safe_str(edu.get("school")),
                year=safe_str(edu.get("year")),
            )
            for edu in dict_items(data.get("education"))
        ],
        skills=safe_list(data.get("skills")),
        languages=safe_list(data.get("languages")),
    )


def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
    """Parse a LinkedIn PDF export into a Profile.

    Steps, in order:
      1. Extract the PDF text; if fewer than 50 characters remain after
         stripping, return ``Profile(name="", title="")``.
      2. Run the heuristic parser and return its result when it is truthy.
      3. Otherwise, if ``MISTRAL_API_KEY`` is set, ask the Mistral chat API
         for a JSON description of the profile and build a Profile from it.
      4. If the key is missing, or the LLM call / decoding / model
         construction raises, return ``_fallback_parse(raw_text)``.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        The parsed Profile (possibly only partially filled).
    """
    raw_text = extract_pdf_text(pdf_bytes)
    if len(raw_text.strip()) < 50:
        return Profile(name="", title="")

    # Try instant heuristic parser first (0ms vs 5-20s for LLM)
    from app.services.linkedin_parser import parse_linkedin_heuristic

    result = parse_linkedin_heuristic(raw_text)
    if result:
        return result

    # Fallback to LLM for non-LinkedIn PDFs or weird formats
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    if not api_key:
        return _fallback_parse(raw_text)

    client = Mistral(api_key=api_key)
    prompt = _build_llm_prompt(raw_text)
    try:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            max_tokens=4000,
            temperature=0.1,
        )
        # A missing (None) content is treated as an empty reply so it reaches
        # the explicit "No JSON found" error instead of an AttributeError.
        raw_resp = (response.choices[0].message.content or "").strip()
        return _profile_from_llm_data(_load_llm_json(raw_resp))
    except Exception:
        # Deliberate best-effort boundary: any failure in the LLM path is
        # reported on stderr and the basic fallback parser is used instead.
        import traceback

        traceback.print_exc()
        return _fallback_parse(raw_text)
def _fallback_parse(raw_text: str) -> Profile:
    """Build a minimal Profile from the first two non-blank lines of the text.

    The first non-blank line becomes the name, with a leading
    "Coordonnées " and/or "Contact " label removed; the second non-blank
    line becomes the title. Either is an empty string when the line is missing.
    """
    trimmed = (segment.strip() for segment in raw_text.split("\n"))
    non_blank = [segment for segment in trimmed if segment]

    candidate_name = non_blank[0] if non_blank else ""
    # Labels are removed in order, so a line carrying both loses both.
    for label in ("Coordonnées ", "Contact "):
        candidate_name = candidate_name.removeprefix(label)

    headline = non_blank[1] if len(non_blank) > 1 else ""
    return Profile(name=candidate_name, title=headline)
|