File size: 5,616 Bytes
5e04dcc
3b5bee3
 
5e04dcc
a235542
5e04dcc
 
 
 
 
3b5bee3
 
5e04dcc
3b5bee3
5e04dcc
 
 
3b5bee3
 
 
5e04dcc
3b5bee3
b10aabd
3b5bee3
 
 
5e04dcc
 
b10aabd
 
 
 
 
 
 
8b18ee8
3b5bee3
 
 
8b18ee8
3b5bee3
 
 
 
 
 
 
 
 
 
 
 
e391cda
 
3b5bee3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e391cda
 
3b5bee3
 
 
 
 
 
 
e391cda
 
 
3b5bee3
 
8b18ee8
 
 
 
 
 
3b5bee3
8b18ee8
bc01feb
 
 
 
 
 
 
 
 
 
5a15c3e
 
 
 
 
 
 
 
3b5bee3
 
5a15c3e
 
 
 
 
 
 
3b5bee3
 
5a15c3e
 
 
 
 
3b5bee3
5a15c3e
3b5bee3
 
 
5a15c3e
 
 
3b5bee3
5a15c3e
3b5bee3
5a15c3e
 
5e04dcc
5a15c3e
 
 
3b5bee3
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import io
import json
import os

from mistralai.client import Mistral
import pdfplumber

from app.models import Education, Experience, Profile


def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined by blank lines.

    The bytes are wrapped in an in-memory stream and opened with pdfplumber.
    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are left out of the result.
    """
    stream = io.BytesIO(pdf_bytes)
    with pdfplumber.open(stream) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Materialize inside the ``with`` so pages are read while the PDF is open.
        non_empty = [chunk for chunk in extracted if chunk]
    return "\n\n".join(non_empty)


def parse_linkedin_pdf(pdf_bytes: bytes) -> Profile:
    """Parse a LinkedIn PDF export into a ``Profile``.

    Strategy, in order:

    1. Extract the raw text; if fewer than 50 non-whitespace-trimmed
       characters come out, return an empty ``Profile``.
    2. Try the heuristic parser and return its result when it is truthy.
    3. If ``MISTRAL_API_KEY`` is set, ask the Mistral chat API for a JSON
       description of the profile and build the ``Profile`` from it.
    4. On a missing API key or any failure in step 3, return the result of
       ``_fallback_parse``.

    Args:
        pdf_bytes: Content of the PDF file.

    Returns:
        The parsed profile. LLM-path errors are logged and never propagate.
    """
    import logging

    raw_text = extract_pdf_text(pdf_bytes)

    if len(raw_text.strip()) < 50:
        return Profile(name="", title="")

    # Try instant heuristic parser first (0ms vs 5-20s for LLM).
    # NOTE(review): imported lazily as in the original — presumably to avoid
    # an import cycle; confirm before hoisting to module level.
    from app.services.linkedin_parser import parse_linkedin_heuristic
    result = parse_linkedin_heuristic(raw_text)
    if result:
        return result

    # Fallback to LLM for non-LinkedIn PDFs or weird formats
    api_key = os.environ.get("MISTRAL_API_KEY", "")
    if not api_key:
        return _fallback_parse(raw_text)

    try:
        # Client construction lives inside the try so that any failure on the
        # LLM path degrades to the basic parser instead of raising.
        client = Mistral(api_key=api_key)
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[
                {"role": "user", "content": _build_extraction_prompt(raw_text)}
            ],
            response_format={"type": "json_object"},
            max_tokens=4000,
            temperature=0.1,
        )
        raw_resp = response.choices[0].message.content.strip()
        return _profile_from_llm_data(_load_json_object(raw_resp))
    except Exception:
        # Deliberate best-effort boundary: record the traceback, then fall back.
        logging.getLogger(__name__).exception(
            "LLM profile extraction failed; falling back to basic parser"
        )
        return _fallback_parse(raw_text)


def _build_extraction_prompt(raw_text: str, max_chars: int = 8000) -> str:
    """Build the prompt that asks the LLM for a JSON profile.

    Only the first ``max_chars`` characters of ``raw_text`` are embedded in
    the prompt.
    """
    return f"""Extract structured profile data from this LinkedIn PDF export. The text is messy because LinkedIn PDFs use a two-column layout — sections are interleaved. Use your judgment to reconstruct the correct structure.

RAW TEXT:
{raw_text[:max_chars]}

Return valid JSON only:
{{
  "name": "full name (just the person's name, not 'Coordonnées' or other labels)",
  "title": "current or most recent job title",
  "location": "city, country",
  "email": "email if found, otherwise empty string",
  "phone": "phone number if found, otherwise empty string",
  "linkedin": "LinkedIn URL if found, otherwise empty string",
  "summary": "professional summary/about section if found, otherwise empty string",
  "experiences": [
    {{
      "title": "job title",
      "company": "company name",
      "dates": "date range as written",
      "description": "role description as a single paragraph",
      "bullets": ["key achievement or responsibility 1", "key achievement 2"]
    }}
  ],
  "education": [
    {{
      "degree": "degree name",
      "school": "school name",
      "year": "graduation year or date range"
    }}
  ],
  "skills": ["skill1", "skill2"],
  "languages": ["French (Native)", "English (Fluent)", "Spanish (Professional)"]
}}

IMPORTANT:
- Extract ALL experiences from all pages, not just the first few
- Keep the original language of descriptions (don't translate)
- For bullets, split multi-sentence descriptions into separate items
- If a field is not found, use empty string or empty array
- The name is usually on the first line, possibly after 'Coordonnées' or 'Contact'
- Languages section: extract with proficiency level (Native, Fluent, Professional, etc.)
- Phone number: often near the top, may start with + or country code"""


def _load_json_object(raw_resp: str) -> dict:
    """Parse the outermost ``{...}`` span of an LLM response into a dict.

    Raises:
        ValueError: If no braces are found, the text is not valid JSON even
            after trailing-comma repair (``json.JSONDecodeError`` is a
            ``ValueError``), or the parsed value is not an object.
    """
    import re

    start = raw_resp.find("{")
    end = raw_resp.rfind("}") + 1
    if start == -1 or end == 0:
        raise ValueError("No JSON found in response")
    json_str = raw_resp[start:end]

    try:
        data = json.loads(json_str)
    except json.JSONDecodeError:
        # Repair trailing commas only when strict parsing fails: applying the
        # regex unconditionally would also rewrite valid string values that
        # happen to contain ", ]" or ", }".
        data = json.loads(re.sub(r",\s*([}\]])", r"\1", json_str))

    if not isinstance(data, dict):
        raise ValueError("LLM response is not a JSON object")
    return data


def _safe_str(val: object) -> str:
    """Return ``str(val)``, or an empty string for None and other falsy values."""
    return str(val) if val else ""


def _safe_list(val: object) -> list:
    """Return ``val`` if it is a list, otherwise an empty list."""
    return val if isinstance(val, list) else []


def _str_items(val: object) -> list[str]:
    """Return the non-blank string items of ``val`` (empty list if not a list)."""
    return [
        item for item in _safe_list(val)
        if isinstance(item, str) and item.strip()
    ]


def _profile_from_llm_data(data: dict) -> Profile:
    """Build a ``Profile`` from the decoded LLM JSON object.

    Missing or null scalar fields become empty strings. Entries of
    ``experiences`` / ``education`` that are not JSON objects are skipped so
    that one malformed entry does not discard the whole result.
    """
    experiences = [
        Experience(
            title=_safe_str(exp.get("title")),
            company=_safe_str(exp.get("company")),
            dates=_safe_str(exp.get("dates")),
            description=_safe_str(exp.get("description")),
            bullets=_str_items(exp.get("bullets")),
        )
        for exp in _safe_list(data.get("experiences"))
        if isinstance(exp, dict)
    ]
    education = [
        Education(
            degree=_safe_str(edu.get("degree")),
            school=_safe_str(edu.get("school")),
            year=_safe_str(edu.get("year")),
        )
        for edu in _safe_list(data.get("education"))
        if isinstance(edu, dict)
    ]
    return Profile(
        name=_safe_str(data.get("name")),
        title=_safe_str(data.get("title")),
        location=_safe_str(data.get("location")),
        email=_safe_str(data.get("email")),
        phone=_safe_str(data.get("phone")),
        linkedin=_safe_str(data.get("linkedin")),
        summary=_safe_str(data.get("summary")),
        experiences=experiences,
        education=education,
        skills=_str_items(data.get("skills")),
        languages=_str_items(data.get("languages")),
    )


def _fallback_parse(raw_text: str) -> Profile:
    """Build a minimal ``Profile`` from the first two non-blank lines.

    The first non-blank line (with a leading ``"Coordonnées "`` or
    ``"Contact "`` label removed) is used as the name and the second as the
    title; either is an empty string when the line does not exist.
    """
    stripped = (candidate.strip() for candidate in raw_text.split("\n"))
    non_blank = [candidate for candidate in stripped if candidate]

    headline = non_blank[1] if len(non_blank) >= 2 else ""
    person = non_blank[0] if non_blank else ""
    # Labels are removed in order, so both may be stripped from one line.
    for label in ("Coordonnées ", "Contact "):
        person = person.removeprefix(label)

    return Profile(name=person, title=headline)