AI-Interview-system / modules /resume_parser.py
Sunaina792's picture
Upload 29 files
aa8e154 verified
"""
modules/resume_parser.py — Phase 1 upgrade
Supports: PDF, DOCX, TXT, MD
"""
import os
import re
from dotenv import load_dotenv
import pdfplumber
from groq import Groq
# Load variables from a .env file (python-dotenv) before the API key is read
# from the process environment.
load_dotenv()
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
# Fail at import time when the key is missing: the client below is built
# immediately and parse_resume() depends on it.
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not set in environment.")
# Module-level Groq client used by parse_resume() for chat completions.
client = Groq(api_key=GROQ_API_KEY)
# llama3-70b-8192 was decommissioned Aug 2025 — replaced with successor
MODEL = 'llama-3.3-70b-versatile'
# ── Text Extractors ───────────────────────────────────────────────────────────
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page in the PDF at *pdf_path*.

    Pages for which pdfplumber yields no text are skipped. Each remaining
    page contributes its text followed by a newline, and the combined string
    is stripped of leading/trailing whitespace.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(f"{chunk}\n" for chunk in page_texts if chunk).strip()
def extract_text_from_docx(docx_path: str) -> str:
    """Extract text from a .docx file using python-docx.

    Non-empty paragraphs are collected first, in document order, followed by
    one line per non-empty table row whose non-empty cells are joined with
    ``' | '``.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        The collected lines joined with newlines and stripped.

    Raises:
        ImportError: If python-docx is not installed. The original import
            failure is attached as ``__cause__``.
    """
    # Imported lazily so the module can be used for other formats without
    # python-docx being installed.
    try:
        from docx import Document
    except ImportError as exc:
        raise ImportError(
            "python-docx not installed. Run: pip install python-docx"
        ) from exc

    doc = Document(docx_path)
    stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
    lines = [text for text in stripped_paragraphs if text]

    # Also extract text from tables
    for table in doc.tables:
        for row in table.rows:
            cell_texts = (cell.text.strip() for cell in row.cells)
            row_text = ' | '.join(text for text in cell_texts if text)
            if row_text:
                lines.append(row_text)
    return '\n'.join(lines).strip()
def extract_text_from_txt(path: str) -> str:
    """Read a plain-text (or Markdown) resume and return its stripped contents.

    The file is decoded as UTF-8. A leading byte-order mark (commonly written
    by Windows editors) is dropped via ``utf-8-sig``; ``str.strip`` does not
    remove it, so it would otherwise remain at the start of the text. Bytes
    that are not valid UTF-8 are replaced with U+FFFD rather than aborting
    the whole load.

    Args:
        path: Path to the text file.

    Returns:
        The file contents with leading/trailing whitespace removed.
    """
    with open(path, 'r', encoding='utf-8-sig', errors='replace') as f:
        return f.read().strip()
def load_resume(file_path: str) -> str:
    """Return the text of a resume stored as PDF, DOCX, TXT, or MD.

    The extractor is chosen from the lower-cased file extension; any other
    extension raises ``ValueError``.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext not in ('.pdf', '.docx', '.txt', '.md'):
        raise ValueError(f"Unsupported file type: {ext}. Supported: .pdf, .docx, .txt, .md")
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    if ext == '.docx':
        return extract_text_from_docx(file_path)
    # Only the plain-text extensions remain.
    return extract_text_from_txt(file_path)
# ── Section Extractor (rule-based pre-processing) ─────────────────────────────
SECTION_HEADERS = {
'skills': r'(skills|technical\s+skills|core\s+competencies|technologies)',
'experience': r'(experience|work\s+experience|employment|professional\s+experience)',
'projects': r'(projects|personal\s+projects|academic\s+projects|key\s+projects)',
'education': r'(education|academic|qualification)',
}
def _extract_sections(raw_text: str) -> dict:
"""
Heuristically split resume text into sections.
Returns dict with section names as keys and extracted text as values.
"""
sections = {k: '' for k in SECTION_HEADERS}
lines = raw_text.split('\n')
current = None
buffer = []
for line in lines:
stripped = line.strip()
matched = False
for section, pattern in SECTION_HEADERS.items():
if re.match(pattern, stripped, re.IGNORECASE) and len(stripped) < 60:
# Save previous section
if current:
sections[current] = '\n'.join(buffer).strip()
current = section
buffer = []
matched = True
break
if not matched and current:
buffer.append(line)
if current and buffer:
sections[current] = '\n'.join(buffer).strip()
return sections
# ── LLM-based Parser ──────────────────────────────────────────────────────────
def _parse_llm_fields(parsed_text: str) -> dict:
    """Turn the model's ``Key: value`` lines into a dict with snake_case keys.

    Lines without a colon, and lines whose key or value is empty, are skipped.
    Markdown decoration the model may add despite the instructions (``**``,
    backticks, list dashes, ``#``, ``>``) is trimmed from the key, and stray
    ``*`` is trimmed from the value, so ``**Name:** Jane`` yields
    ``{'name': 'Jane'}``. When a key repeats, the last occurrence wins.
    """
    fields = {}
    for line in parsed_text.splitlines():
        key, colon, value = line.partition(':')
        if not colon:
            continue
        key = key.strip(' \t*_`#>-').lower().replace(' ', '_')
        value = value.strip().strip('*').strip()
        if key and value:
            fields[key] = value
    return fields


def parse_resume(file_path: str) -> dict:
    """
    Full parse: load → extract sections → LLM structured extraction.

    Args:
        file_path: Path to a resume file supported by load_resume().

    Returns:
        dict holding ``raw_text`` and ``sections`` plus one snake_case entry
        for every ``Key: value`` line in the model's reply (the prompt asks
        for name, job_role, experience, skills, projects, education, summary).

    Raises:
        ValueError: If the resume text is empty or shorter than 50 characters,
            or if the model returns an empty/whitespace-only reply.
    """
    raw_text = load_resume(file_path)
    if not raw_text or len(raw_text) < 50:
        raise ValueError("Resume appears empty or unreadable.")

    # Pre-extract sections to give LLM focused context. Slicing an empty
    # string is already '', so no extra emptiness check is needed.
    sections = _extract_sections(raw_text)
    skills_hint = sections['skills'][:500]
    projects_hint = sections['projects'][:800]
    experience_hint = sections['experience'][:800]

    prompt = f"""You are a precise resume parser. Extract information from the resume below.
Respond ONLY in this exact format — no extra text, no markdown, no preamble:
Name: <full name>
Job Role: <target or most recent job role>
Experience: <total years, e.g. "2 years" or "Fresher">
Skills: <comma-separated top 8-10 technical skills>
Projects: <pipe-separated list of project names, e.g. "Project A | Project B">
Education: <highest degree and institution>
Summary: <2-3 sentence professional summary focused on strengths>
Resume Text:
{raw_text[:3000]}
---
Extracted Sections for reference:
Skills Section: {skills_hint}
Projects Section: {projects_hint}
Experience Section: {experience_hint}"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )
    # Strip before the emptiness check so a whitespace-only reply is rejected
    # too, instead of silently producing a result with no parsed fields.
    parsed_text = (response.choices[0].message.content or '').strip()
    if not parsed_text:
        raise ValueError("Empty response from LLM during resume parsing.")

    result = {"raw_text": raw_text, "sections": sections}
    for key, value in _parse_llm_fields(parsed_text).items():
        # setdefault: a model line such as "Raw Text: ..." or "Sections: ..."
        # must never replace the real resume text / section dict.
        result.setdefault(key, value)
    return result
# ── Public API ────────────────────────────────────────────────────────────────
def resume_to_profile(file_path: str) -> dict:
    """
    Parse the resume at *file_path* and build the profile dict consumed by
    llm.py.

    Missing scalar fields fall back to defaults ('Candidate',
    'Software Engineer', 'Fresher', or ''). The pipe-separated ``projects``
    string from the parser becomes a list of trimmed, non-empty names.
    """
    parsed = parse_resume(file_path)

    # Split "Project A | Project B" into individual names.
    project_names = []
    for chunk in parsed.get('projects', '').split('|'):
        chunk = chunk.strip()
        if chunk:
            project_names.append(chunk)

    scalar_defaults = (
        ('name', 'Candidate'),
        ('job_role', 'Software Engineer'),
        ('experience', 'Fresher'),
        ('skills', ''),
    )
    profile = {field: parsed.get(field, fallback) for field, fallback in scalar_defaults}
    profile['projects'] = project_names
    profile['education'] = parsed.get('education', '')
    profile['summary'] = parsed.get('summary', '')
    profile['resume_text'] = parsed.get('raw_text', '')
    profile['sections'] = parsed.get('sections', {})
    return profile
def get_resume_context_for_llm(profile: dict) -> str:
    """
    Render *profile* as a short multi-line context string for LLM prompts
    (e.g. when passing resume info to generate_questions() in llm.py).

    Candidate, Target Role, Experience and Skills lines are always emitted,
    with an empty value when the key is absent. Notable Projects (first three
    only), Education and Summary lines are added only when the profile has a
    truthy value for them.
    """
    always_shown = (
        ("Candidate", 'name'),
        ("Target Role", 'job_role'),
        ("Experience", 'experience'),
        ("Skills", 'skills'),
    )
    context = [f"{label}: {profile.get(field, '')}" for label, field in always_shown]

    top_projects = profile.get('projects')
    if top_projects:
        context.append("Notable Projects: " + ', '.join(top_projects[:3]))

    for label, field in (("Education", 'education'), ("Summary", 'summary')):
        detail = profile.get(field)
        if detail:
            context.append(f"{label}: {detail}")

    return '\n'.join(context)