File size: 8,133 Bytes
aa8e154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
"""

modules/resume_parser.py  —  Phase 1 upgrade

Supports: PDF, DOCX, TXT, MD

"""

import os
import re
from dotenv import load_dotenv
import pdfplumber
from groq import Groq

# Runs before the key lookup below so values from a .env file (python-dotenv)
# are visible to os.getenv.
load_dotenv()

# Fail at import time rather than on the first API call: the client below
# cannot be built without a key.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not set in environment.")

# Module-wide Groq client; parse_resume() sends its chat completion through it.
client = Groq(api_key=GROQ_API_KEY)
# llama3-70b-8192 was decommissioned Aug 2025 — replaced with successor
MODEL  = 'llama-3.3-70b-versatile'


# ── Text Extractors ───────────────────────────────────────────────────────────

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of all pages of a PDF.

    Each page that yields text contributes its text followed by a newline;
    pages for which pdfplumber returns nothing are skipped. The combined
    string is stripped of leading and trailing whitespace.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            content = page.extract_text()
            if not content:
                continue
            chunks.append(content + "\n")
    return "".join(chunks).strip()


def extract_text_from_docx(docx_path: str) -> str:
    """Collect the text of a .docx file using python-docx.

    Non-empty paragraphs come first, one per line. They are followed by one
    line per non-empty table row, with the row's non-empty cells joined by
    ' | '.

    Raises:
        ImportError: if python-docx is not installed.
    """
    try:
        from docx import Document
    except ImportError:
        raise ImportError("python-docx not installed. Run: pip install python-docx")

    document = Document(docx_path)

    # Body paragraphs, blank ones dropped.
    stripped_paragraphs = (paragraph.text.strip() for paragraph in document.paragraphs)
    collected = [content for content in stripped_paragraphs if content]

    # Table rows are appended after all paragraphs.
    for table in document.tables:
        for row in table.rows:
            cells = []
            for cell in row.cells:
                cell_text = cell.text.strip()
                if cell_text:
                    cells.append(cell_text)
            joined = ' | '.join(cells)
            if joined:
                collected.append(joined)

    return '\n'.join(collected).strip()


def extract_text_from_txt(path: str) -> str:
    """Read a plain-text (.txt / .md) resume and return its stripped contents.

    The file is decoded as UTF-8. The 'utf-8-sig' codec is used so that a
    leading byte-order mark (written by some Windows editors) is dropped
    instead of surviving as a stray U+FEFF at the start of the text;
    str.strip() does not remove that character. Files without a BOM decode
    exactly as with plain 'utf-8'.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return f.read().strip()


def load_resume(file_path: str) -> str:
    """Return the text of a resume, choosing the extractor by file extension.

    The extension is compared case-insensitively. Supported: .pdf, .docx,
    .txt and .md.

    Raises:
        ValueError: if the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix in ('.txt', '.md'):
        return extract_text_from_txt(file_path)
    if suffix == '.docx':
        return extract_text_from_docx(file_path)
    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)

    raise ValueError(f"Unsupported file type: {suffix}. Supported: .pdf, .docx, .txt, .md")


# ── Section Extractor (rule-based pre-processing) ─────────────────────────────

SECTION_HEADERS = {
    'skills':     r'(skills|technical\s+skills|core\s+competencies|technologies)',
    'experience': r'(experience|work\s+experience|employment|professional\s+experience)',
    'projects':   r'(projects|personal\s+projects|academic\s+projects|key\s+projects)',
    'education':  r'(education|academic|qualification)',
}

def _extract_sections(raw_text: str) -> dict:
    """

    Heuristically split resume text into sections.

    Returns dict with section names as keys and extracted text as values.

    """
    sections     = {k: '' for k in SECTION_HEADERS}
    lines        = raw_text.split('\n')
    current      = None
    buffer       = []

    for line in lines:
        stripped = line.strip()
        matched  = False
        for section, pattern in SECTION_HEADERS.items():
            if re.match(pattern, stripped, re.IGNORECASE) and len(stripped) < 60:
                # Save previous section
                if current:
                    sections[current] = '\n'.join(buffer).strip()
                current = section
                buffer  = []
                matched = True
                break
        if not matched and current:
            buffer.append(line)

    if current and buffer:
        sections[current] = '\n'.join(buffer).strip()

    return sections


# ── LLM-based Parser ──────────────────────────────────────────────────────────

# Keys that parse_resume() fills in itself; model output must never replace them.
_RESERVED_RESULT_KEYS = ('raw_text', 'sections')


def _parse_llm_fields(parsed_text: str) -> dict:
    """Turn the model's "Label: value" lines into a dict keyed by snake_case labels."""
    fields = {}
    for line in parsed_text.split('\n'):
        label, colon, value = line.partition(':')
        if not colon:
            continue
        # Models sometimes decorate labels with markdown ("**Name:** x",
        # "- Name: x") even when told not to; peel that off both sides.
        key = label.strip().strip('*#-`').strip().lower().replace(' ', '_')
        value = value.strip().strip('*').strip()
        if key and value and key not in _RESERVED_RESULT_KEYS:
            fields[key] = value
    return fields


def parse_resume(file_path: str) -> dict:
    """Full parse: load → extract sections → LLM structured extraction.

    Args:
        file_path: Path to a resume file accepted by load_resume().

    Returns:
        Dict that always contains ``raw_text`` (the loaded text) and
        ``sections`` (the output of _extract_sections()). It also holds one
        entry per "Label: value" line in the model's reply, keyed by the
        lower-cased label with spaces replaced by underscores. The prompt
        asks for name, job_role, experience, skills, projects, education and
        summary, but any of these may be absent if the model omits them.

    Raises:
        ValueError: if the resume text is empty or shorter than 50
            characters, or if the model returns no content.
    """
    raw_text = load_resume(file_path)

    if not raw_text or len(raw_text) < 50:
        raise ValueError("Resume appears empty or unreadable.")

    # Pre-extract sections to give the LLM focused context; slices cap prompt size.
    sections        = _extract_sections(raw_text)
    skills_hint     = sections['skills'][:500]
    projects_hint   = sections['projects'][:800]
    experience_hint = sections['experience'][:800]

    prompt = f"""You are a precise resume parser. Extract information from the resume below.

Respond ONLY in this exact format — no extra text, no markdown, no preamble:



Name: <full name>

Job Role: <target or most recent job role>

Experience: <total years, e.g. "2 years" or "Fresher">

Skills: <comma-separated top 8-10 technical skills>

Projects: <pipe-separated list of project names, e.g. "Project A | Project B">

Education: <highest degree and institution>

Summary: <2-3 sentence professional summary focused on strengths>



Resume Text:

{raw_text[:3000]}



---

Extracted Sections for reference:

Skills Section: {skills_hint}

Projects Section: {projects_hint}

Experience Section: {experience_hint}"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    # An empty choices list would otherwise surface as a bare IndexError.
    if not response.choices:
        raise ValueError("Empty response from LLM during resume parsing.")

    parsed_text = response.choices[0].message.content
    if not parsed_text:
        raise ValueError("Empty response from LLM during resume parsing.")

    result = {"raw_text": raw_text, "sections": sections}
    result.update(_parse_llm_fields(parsed_text.strip()))
    return result


# ── Public API ────────────────────────────────────────────────────────────────

def resume_to_profile(file_path: str) -> dict:
    """Parse a resume file and shape the result into a profile dict.

    Missing fields fall back to defaults ('Candidate', 'Software Engineer',
    'Fresher', or an empty value). ``projects`` is turned from the
    pipe-separated string produced by parse_resume() into a list of
    non-empty, stripped names.
    """
    fields = parse_resume(file_path)

    # "Project A | Project B" -> ['Project A', 'Project B']
    project_names = []
    for chunk in fields.get('projects', '').split('|'):
        chunk = chunk.strip()
        if chunk:
            project_names.append(chunk)

    profile = dict(
        name=fields.get('name', 'Candidate'),
        job_role=fields.get('job_role', 'Software Engineer'),
        experience=fields.get('experience', 'Fresher'),
        skills=fields.get('skills', ''),
        projects=project_names,
        education=fields.get('education', ''),
        summary=fields.get('summary', ''),
        resume_text=fields.get('raw_text', ''),
        sections=fields.get('sections', {}),
    )
    return profile


def get_resume_context_for_llm(profile: dict) -> str:
    """Render a profile dict as a short, newline-separated context string.

    The candidate, target role, experience and skills lines are always
    emitted, with an empty value when the key is missing. The projects line
    (first three entries only), the education line and the summary line are
    added only when the corresponding value is truthy.
    """
    always_shown = (
        ("Candidate", 'name'),
        ("Target Role", 'job_role'),
        ("Experience", 'experience'),
        ("Skills", 'skills'),
    )
    parts = [f"{label}: {profile.get(key, '')}" for label, key in always_shown]

    projects = profile.get('projects')
    if projects:
        top_three = ', '.join(projects[:3])
        parts.append(f"Notable Projects: {top_three}")

    for label, key in (("Education", 'education'), ("Summary", 'summary')):
        value = profile.get(key)
        if value:
            parts.append(f"{label}: {value}")

    return '\n'.join(parts)