import io
import json
import logging
import os
import re

from mistralai.client import Mistral
import pdfplumber

from app.models import Education, Experience, Profile

logger = logging.getLogger(__name__)

# PDFs yielding fewer non-blank characters than this are treated as empty.
_MIN_TEXT_LENGTH = 50
# Upper bound on how much extracted text is embedded in the LLM prompt.
_MAX_PROMPT_CHARS = 8000
# Matches a comma that directly precedes a closing brace/bracket.
_TRAILING_COMMA_RE = re.compile(r",\s*([}\]])")
# Section labels that can end up glued in front of the name on the first line.
_NAME_LABEL_PREFIXES = ("Coordonnées ", "Contact ")


def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Extract raw text from a PDF file.

    Args:
        pdf_bytes: The PDF document content.

    Returns:
        The text of every page that produced any text, joined by a blank
        line. Pages for which ``extract_text()`` returns nothing are skipped.
    """
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n\n".join(text for text in page_texts if text)


def _build_prompt(raw_text: str) -> str:
    """Build the extraction prompt sent to the LLM for ``raw_text``."""
    return f"""Extract structured profile data from this LinkedIn PDF export.
The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved.
Use your judgment to reconstruct the correct structure.

RAW TEXT:
{raw_text[:_MAX_PROMPT_CHARS]}

Return valid JSON only:
{{
    "name": "full name (just the person's name, not 'Coordonnées' or other labels)",
    "title": "current or most recent job title",
    "location": "city, country",
    "email": "email if found, otherwise empty string",
    "phone": "phone number if found, otherwise empty string",
    "linkedin": "LinkedIn URL if found, otherwise empty string",
    "summary": "professional summary/about section if found, otherwise empty string",
    "experiences": [
        {{
            "title": "job title",
            "company": "company name",
            "dates": "date range as written",
            "description": "role description as a single paragraph",
            "bullets": ["key achievement or responsibility 1", "key achievement 2"]
        }}
    ],
    "education": [
        {{
            "degree": "degree name",
            "school": "school name",
            "year": "graduation year or date range"
        }}
    ],
    "skills": ["skill1", "skill2"],
    "languages": ["French (Native)", "English (Fluent)", "Spanish (Professional)"]
}}

IMPORTANT:
- Extract ALL experiences from all pages, not just the first few
- Keep the original language of descriptions (don't translate)
- For bullets, split multi-sentence descriptions into separate items
- If a field is not found, use empty string or empty array
- The name is usually on the first line, possibly after 'Coordonnées' or 'Contact'
- Languages section: extract with proficiency level (Native, Fluent, Professional, etc.)
- Phone number: often near the top, may start with + or country code"""


def _extract_json_object(raw_resp: str) -> dict:
    """Locate and decode the JSON object embedded in an LLM response.

    Args:
        raw_resp: The response text, possibly with text around the JSON.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the response contains no ``{`` or no ``}``, or if the
            enclosed text is not valid JSON even after trailing-comma repair
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = raw_resp.find("{")
    end = raw_resp.rfind("}") + 1
    if start == -1 or end == 0:
        raise ValueError("No JSON found in response")
    json_str = raw_resp[start:end]
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # Repair trailing commas only when strict parsing fails, so that
        # string values of already-valid JSON are never rewritten.
        return json.loads(_TRAILING_COMMA_RE.sub(r"\1", json_str))


def _safe_str(val: object) -> str:
    """Return ``str(val)``, or an empty string for any falsy value."""
    return str(val) if val else ""


def _safe_list(val: object) -> list:
    """Return ``val`` if it is a list, otherwise an empty list."""
    return val if isinstance(val, list) else []


def _safe_dicts(val: object) -> list[dict]:
    """Return the dict entries of ``val``; anything else is skipped."""
    return [item for item in _safe_list(val) if isinstance(item, dict)]


def _safe_str_list(val: object) -> list[str]:
    """Return the string entries of ``val``; non-string items are dropped."""
    return [item for item in _safe_list(val) if isinstance(item, str)]


def _profile_from_data(data: dict) -> Profile:
    """Build a ``Profile`` from the decoded LLM JSON object.

    Missing or null scalar fields become empty strings and missing lists
    become empty lists. Malformed entries (non-dict experiences/education,
    non-string bullets/skills/languages) are skipped so that a single bad
    item does not discard the rest of the extracted data.
    """
    experiences = [
        Experience(
            title=_safe_str(exp.get("title")),
            company=_safe_str(exp.get("company")),
            dates=_safe_str(exp.get("dates")),
            description=_safe_str(exp.get("description")),
            bullets=_safe_str_list(exp.get("bullets")),
        )
        for exp in _safe_dicts(data.get("experiences"))
    ]
    education = [
        Education(
            degree=_safe_str(edu.get("degree")),
            school=_safe_str(edu.get("school")),
            year=_safe_str(edu.get("year")),
        )
        for edu in _safe_dicts(data.get("education"))
    ]
    return Profile(
        name=_safe_str(data.get("name")),
        title=_safe_str(data.get("title")),
        location=_safe_str(data.get("location")),
        email=_safe_str(data.get("email")),
        phone=_safe_str(data.get("phone")),
        linkedin=_safe_str(data.get("linkedin")),
        summary=_safe_str(data.get("summary")),
        experiences=experiences,
        education=education,
        skills=_safe_str_list(data.get("skills")),
        languages=_safe_str_list(data.get("languages")),
    )


def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
    """Parse a LinkedIn PDF export into a ``Profile``.

    Strategies are tried in order:

    1. The heuristic parser (fast, no network).
    2. The Mistral LLM, when ``MISTRAL_API_KEY`` is set in the environment.
    3. ``_fallback_parse``, when no API key is set or the LLM step fails.

    Args:
        pdf_bytes: The PDF document content.

    Returns:
        The parsed profile. If the PDF yields fewer than ``_MIN_TEXT_LENGTH``
        non-blank characters, a profile with empty name and title.
    """
    raw_text = extract_pdf_text(pdf_bytes)
    if len(raw_text.strip()) < _MIN_TEXT_LENGTH:
        return Profile(name="", title="")

    # Try the instant heuristic parser first (0ms vs 5-20s for the LLM).
    # NOTE(review): kept as a call-time import as in the original; presumably
    # this avoids an import cycle — confirm before hoisting to module level.
    from app.services.linkedin_parser import parse_linkedin_heuristic

    heuristic_profile = parse_linkedin_heuristic(raw_text)
    if heuristic_profile:
        return heuristic_profile

    # Fall back to the LLM for non-LinkedIn PDFs or unusual layouts.
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    if not api_key:
        return _fallback_parse(raw_text)

    client = Mistral(api_key=api_key)
    prompt = _build_prompt(raw_text)

    try:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            max_tokens=4000,
            temperature=0.1,
        )
        raw_resp = response.choices[0].message.content.strip()
        return _profile_from_data(_extract_json_object(raw_resp))
    except Exception:
        # Deliberate best-effort boundary: any API, decoding or model
        # construction failure degrades to the basic parser instead of
        # failing the request, but the traceback is still recorded.
        logger.exception("LLM profile extraction failed; using fallback parser")
        return _fallback_parse(raw_text)


def _fallback_parse(raw_text: str) -> Profile:
    """Basic fallback used when the LLM is unavailable or fails.

    Takes the first non-blank line as the name (with a leading
    ``Coordonnées``/``Contact`` label removed) and the second non-blank
    line as the title. Either is an empty string when the line is missing.
    """
    lines = [stripped for line in raw_text.split("\n") if (stripped := line.strip())]
    name = lines[0] if lines else ""
    for prefix in _NAME_LABEL_PREFIXES:
        name = name.removeprefix(prefix)
    title = lines[1] if len(lines) > 1 else ""
    return Profile(name=name, title=title)