Spaces:
Running
Running
| import io | |
| import json | |
| import os | |
| from mistralai.client import Mistral | |
| import pdfplumber | |
| from app.models import Education, Experience, Profile | |
def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Return the text of a PDF, one chunk per page joined by a blank line.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are left out of the result.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
        # The generator is consumed by join() while the PDF is still open.
        page_texts = (page.extract_text() for page in document.pages)
        return "\n\n".join(chunk for chunk in page_texts if chunk)
def _load_llm_json(raw_resp: str) -> dict:
    """Decode the JSON object embedded in an LLM reply.

    The span from the first ``{`` to the last ``}`` is parsed. Trailing-comma
    repair is attempted only when the strict parse fails.

    Raises:
        ValueError: if the reply contains no ``{``/``}`` pair, or the span is
            not valid JSON even after repair (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    import re

    start = raw_resp.find("{")
    end = raw_resp.rfind("}") + 1  # rfind() == -1 (not found) becomes 0 here
    if start == -1 or end == 0:
        raise ValueError("No JSON found in response")
    json_str = raw_resp[start:end]

    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Repair only after a strict parse has failed: the regex cannot tell
        # structural commas from commas inside string values, so running it
        # on already-valid JSON could silently rewrite text such as "a, ]".
        return json.loads(re.sub(r",\s*([}\]])", r"\1", json_str))


def _profile_from_llm_data(data: dict) -> Profile:
    """Build a ``Profile`` from the decoded LLM JSON, tolerating bad fields."""

    def s(val: object) -> str:
        """Safe string — falsy values (None, "", 0) become an empty string."""
        return str(val) if val else ""

    def sl(val: object) -> list:
        """Safe list — anything that is not a list becomes an empty list."""
        return val if isinstance(val, list) else []

    def records(val: object) -> list[dict]:
        """Safe list of objects — drop items that are not JSON objects.

        Without this, a single stray string or null in the array would raise
        on ``.get`` and the caller would discard the entire LLM result.
        """
        return [item for item in sl(val) if isinstance(item, dict)]

    return Profile(
        name=s(data.get("name")),
        title=s(data.get("title")),
        location=s(data.get("location")),
        email=s(data.get("email")),
        phone=s(data.get("phone")),
        linkedin=s(data.get("linkedin")),
        summary=s(data.get("summary")),
        experiences=[
            Experience(
                title=s(exp.get("title")),
                company=s(exp.get("company")),
                dates=s(exp.get("dates")),
                description=s(exp.get("description")),
                bullets=sl(exp.get("bullets")),
            )
            for exp in records(data.get("experiences"))
        ],
        education=[
            Education(
                degree=s(edu.get("degree")),
                school=s(edu.get("school")),
                year=s(edu.get("year")),
            )
            for edu in records(data.get("education"))
        ],
        skills=sl(data.get("skills")),
        languages=sl(data.get("languages")),
    )


def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
    """Parse a LinkedIn PDF export. Tries fast heuristic first, falls back to LLM.

    Steps, in order:

    1. Extract the text; if fewer than 50 characters remain after stripping,
       return an empty ``Profile``.
    2. Return the result of ``parse_linkedin_heuristic`` when it is truthy.
    3. If ``MISTRAL_API_KEY`` is unset or empty, return ``_fallback_parse``.
    4. Otherwise send the first 8000 characters of the text to Mistral and
       build the profile from its JSON reply.

    Any exception raised while calling the model or decoding its reply is
    printed as a traceback and the result of ``_fallback_parse`` is returned
    instead; this function does not propagate those errors.
    """
    raw_text = extract_pdf_text(pdf_bytes)
    if len(raw_text.strip()) < 50:
        return Profile(name="", title="")

    # Try instant heuristic parser first (0ms vs 5-20s for LLM)
    from app.services.linkedin_parser import parse_linkedin_heuristic

    result = parse_linkedin_heuristic(raw_text)
    if result:
        return result

    # Fallback to LLM for non-LinkedIn PDFs or weird formats
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    if not api_key:
        return _fallback_parse(raw_text)

    client = Mistral(api_key=api_key)
    prompt = f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.
RAW TEXT:
{raw_text[:8000]}
Return valid JSON only:
{{
"name": "full name (just the person's name, not 'Coordonnées' or other labels)",
"title": "current or most recent job title",
"location": "city, country",
"email": "email if found, otherwise empty string",
"phone": "phone number if found, otherwise empty string",
"linkedin": "LinkedIn URL if found, otherwise empty string",
"summary": "professional summary/about section if found, otherwise empty string",
"experiences": [
{{
"title": "job title",
"company": "company name",
"dates": "date range as written",
"description": "role description as a single paragraph",
"bullets": ["key achievement or responsibility 1", "key achievement 2"]
}}
],
"education": [
{{
"degree": "degree name",
"school": "school name",
"year": "graduation year or date range"
}}
],
"skills": ["skill1", "skill2"],
"languages": ["French (Native)", "English (Fluent)", "Spanish (Professional)"]
}}
IMPORTANT:
- Extract ALL experiences from all pages, not just the first few
- Keep the original language of descriptions (don't translate)
- For bullets, split multi-sentence descriptions into separate items
- If a field is not found, use empty string or empty array
- The name is usually on the first line, possibly after 'Coordonnées' or 'Contact'
- Languages section: extract with proficiency level (Native, Fluent, Professional, etc.)
- Phone number: often near the top, may start with + or country code"""

    try:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            max_tokens=4000,
            temperature=0.1,
        )
        raw_resp = response.choices[0].message.content.strip()
        return _profile_from_llm_data(_load_llm_json(raw_resp))
    except Exception:
        # Deliberate best-effort boundary: any model, decoding, or
        # model-construction failure degrades to the basic parser.
        import traceback

        traceback.print_exc()
        return _fallback_parse(raw_text)
def _fallback_parse(raw_text: str) -> Profile:
    """Build a minimal profile from the first two non-blank lines of text.

    The first non-blank line becomes the name, with a leading
    ``"Coordonnées "`` and then a leading ``"Contact "`` removed if present.
    The second non-blank line becomes the title. Missing lines yield empty
    strings.
    """
    stripped = (chunk.strip() for chunk in raw_text.split("\n"))
    non_blank = [chunk for chunk in stripped if chunk]

    # Pad so that both slots always exist, then take the first two.
    name, title = (non_blank + ["", ""])[:2]
    for label in ("Coordonnées ", "Contact "):
        name = name.removeprefix(label)

    return Profile(name=name, title=title)