Spaces:

Sunaina792
/

AI-Interview-system

Sleeping

File size: 8,133 Bytes

aa8e154

"""

modules/resume_parser.py  —  Phase 1 upgrade

Supports: PDF, DOCX, TXT, MD

"""

import os
import re
from dotenv import load_dotenv
import pdfplumber
from groq import Groq

# Load environment variables via python-dotenv before the API key is read
# below, so a key kept in a local .env file is picked up.
load_dotenv()

# Fail fast at import time: parse_resume() cannot work without a Groq key.
GROQ_API_KEY = os.getenv('GROQ_API_KEY')
if not GROQ_API_KEY:
    raise ValueError("GROQ_API_KEY not set in environment.")

# Module-wide Groq client; parse_resume() sends its chat completion through it.
client = Groq(api_key=GROQ_API_KEY)
# llama3-70b-8192 was decommissioned Aug 2025 — replaced with successor
MODEL  = 'llama-3.3-70b-versatile'


# ── Text Extractors ───────────────────────────────────────────────────────────

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of a PDF, page by page, using pdfplumber.

    Pages for which ``extract_text()`` yields nothing are skipped. The
    remaining page texts are separated by a newline and the combined result
    is stripped of leading/trailing whitespace.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extraction must happen while the document is still open.
        page_texts = [page.extract_text() for page in document.pages]

    return "\n".join(chunk for chunk in page_texts if chunk).strip()


def extract_text_from_docx(docx_path: str) -> str:
    """Extract text from .docx using python-docx.

    Body paragraphs are emitted first, followed by table rows rendered as
    ``cell | cell | ...``. Blank paragraphs, blank cells and rows without any
    text are skipped.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The collected lines joined with newlines and stripped.

    Raises:
        ImportError: If python-docx is not installed (the original import
            failure is attached as the cause).
    """
    try:
        from docx import Document
    except ImportError as exc:
        raise ImportError(
            "python-docx not installed. Run: pip install python-docx"
        ) from exc

    doc = Document(docx_path)
    stripped_paragraphs = (para.text.strip() for para in doc.paragraphs)
    lines = [text for text in stripped_paragraphs if text]

    # Also extract text from tables
    for table in doc.tables:
        for row in table.rows:
            cell_texts = []
            previous_cell = None
            for cell in row.cells:
                # A horizontally merged cell is handed back once per grid
                # column it spans; emit its text only once.
                if cell is previous_cell:
                    continue
                previous_cell = cell
                cell_text = cell.text.strip()
                if cell_text:
                    cell_texts.append(cell_text)
            if cell_texts:
                lines.append(' | '.join(cell_texts))

    return '\n'.join(lines).strip()


def extract_text_from_txt(path: str) -> str:
    """Read a plain-text resume (.txt / .md) and return its stripped content.

    The file is decoded as UTF-8. ``utf-8-sig`` is used so that a leading
    byte-order mark, if present, is dropped instead of ending up as a stray
    ``\\ufeff`` at the start of the text (``str.strip`` does not remove it).
    Files without a BOM decode exactly as plain UTF-8.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return f.read().strip()


def load_resume(file_path: str) -> str:
    """Read a resume file and return its text.

    The extractor is picked from the lower-cased file extension:
    ``.pdf``, ``.docx``, or ``.txt`` / ``.md``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    ext = suffix.lower()

    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    if ext == '.docx':
        return extract_text_from_docx(file_path)
    if ext in ('.txt', '.md'):
        return extract_text_from_txt(file_path)

    raise ValueError(f"Unsupported file type: {ext}. Supported: .pdf, .docx, .txt, .md")


# ── Section Extractor (rule-based pre-processing) ─────────────────────────────

# Section name -> regex matched (case-insensitively, anchored at the start of
# the stripped line) to recognise that section's header.
SECTION_HEADERS = {
    'skills':     r'(skills|technical\s+skills|core\s+competencies|technologies)',
    'experience': r'(experience|work\s+experience|employment|professional\s+experience)',
    'projects':   r'(projects|personal\s+projects|academic\s+projects|key\s+projects)',
    'education':  r'(education|academic|qualification)',
}

def _extract_sections(raw_text: str) -> dict:
    """
    Heuristically split resume text into sections.

    A line is treated as a section header when its stripped form is shorter
    than 60 characters and starts with one of the SECTION_HEADERS patterns
    (first matching section wins, in dict order). Lines that follow a header
    belong to that section until the next header; text before the first
    header is ignored.

    Two situations are handled so that content is not lost:

    * The same section appearing more than once (e.g. "Academic Projects"
      and "Personal Projects") accumulates instead of the later occurrence
      overwriting the earlier one.
    * Content placed on the header line after a colon
      (e.g. "Skills: Python, SQL") is kept as the first line of the section.

    Returns dict with section names as keys and extracted text as values
    ('' for sections that were not found).
    """
    # Compile once per call rather than once per line.
    matchers = [
        (name, re.compile(pattern, re.IGNORECASE))
        for name, pattern in SECTION_HEADERS.items()
    ]
    collected = {name: [] for name in SECTION_HEADERS}
    current = None

    for line in raw_text.split('\n'):
        stripped = line.strip()

        header = None
        if len(stripped) < 60:
            for name, matcher in matchers:
                found = matcher.match(stripped)
                if found:
                    header = (name, found.end())
                    break

        if header is None:
            if current:
                collected[current].append(line)
            continue

        current, header_end = header
        # Keep "Header: inline content"; anything else trailing the header
        # word (e.g. "& Tools", "History") is part of the title, not content.
        remainder = stripped[header_end:].lstrip()
        if remainder.startswith(':'):
            inline = remainder[1:].strip()
            if inline:
                collected[current].append(inline)

    return {name: '\n'.join(lines).strip() for name, lines in collected.items()}


# ── LLM-based Parser ──────────────────────────────────────────────────────────

def parse_resume(file_path: str) -> dict:
    """
    Full parse: load → extract sections → LLM structured extraction.

    The resume text (first 3000 characters) plus truncated rule-based section
    excerpts are sent to the Groq chat model, which is asked to answer with
    one ``Key: value`` line per field. Each such line becomes an entry of the
    result, keyed by the lower-cased, underscore-joined field name.

    Returns dict with name, job_role, experience, skills, projects, education,
    summary (whichever the model supplied), plus raw_text and sections.

    Raises:
        ValueError: If the resume text is empty or shorter than 50
            characters, or if the LLM returns no content.
    """
    raw_text = load_resume(file_path)

    if not raw_text or len(raw_text) < 50:
        raise ValueError("Resume appears empty or unreadable.")

    # Pre-extract sections to give LLM focused context. Slicing an empty
    # string already yields '', so no extra emptiness check is needed.
    sections        = _extract_sections(raw_text)
    skills_hint     = sections['skills'][:500]
    projects_hint   = sections['projects'][:800]
    experience_hint = sections['experience'][:800]

    prompt = f"""You are a precise resume parser. Extract information from the resume below.

Respond ONLY in this exact format — no extra text, no markdown, no preamble:



Name: <full name>

Job Role: <target or most recent job role>

Experience: <total years, e.g. "2 years" or "Fresher">

Skills: <comma-separated top 8-10 technical skills>

Projects: <pipe-separated list of project names, e.g. "Project A | Project B">

Education: <highest degree and institution>

Summary: <2-3 sentence professional summary focused on strengths>



Resume Text:

{raw_text[:3000]}



---

Extracted Sections for reference:

Skills Section: {skills_hint}

Projects Section: {projects_hint}

Experience Section: {experience_hint}"""

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
    )

    # Treat "no choices" the same as "empty content" instead of letting an
    # IndexError escape.
    if not response.choices:
        raise ValueError("Empty response from LLM during resume parsing.")

    parsed_text = response.choices[0].message.content
    if not parsed_text:
        raise ValueError("Empty response from LLM during resume parsing.")

    parsed_text = parsed_text.strip()
    result      = {"raw_text": raw_text, "sections": sections}
    # Keys this function owns; a stray LLM line such as "Sections: ..." must
    # not replace the real data.
    reserved    = frozenset(result)

    for line in parsed_text.split('\n'):
        key, separator, value = line.partition(':')
        if not separator:
            continue
        # Models sometimes decorate fields with markdown despite the
        # instructions ("**Name:** Jane", "- Name: Jane"); strip that so the
        # key still normalises to e.g. 'name'.
        key   = key.strip().strip('*#`>-').strip().lower().replace(' ', '_')
        value = value.strip().strip('*`').strip()
        if not key or not value or key in reserved:
            continue
        result[key] = value

    return result


# ── Public API ────────────────────────────────────────────────────────────────

def resume_to_profile(file_path: str) -> dict:
    """
    Parse a resume file and shape the result into a profile dict.

    Missing fields fall back to defaults ('Candidate', 'Software Engineer',
    'Fresher', or an empty value). The pipe-separated projects string is
    split into a list of non-empty, stripped project names.
    """
    parsed = parse_resume(file_path)
    field = parsed.get

    # Split "Project A | Project B" into individual names, dropping blanks.
    project_names = []
    for chunk in field('projects', '').split('|'):
        cleaned = chunk.strip()
        if cleaned:
            project_names.append(cleaned)

    profile = {
        'name':        field('name',       'Candidate'),
        'job_role':    field('job_role',   'Software Engineer'),
        'experience':  field('experience', 'Fresher'),
        'skills':      field('skills',     ''),
        'projects':    project_names,
        'education':   field('education',  ''),
        'summary':     field('summary',    ''),
        'resume_text': field('raw_text',   ''),
        'sections':    field('sections',   {}),
    }
    return profile


def get_resume_context_for_llm(profile: dict) -> str:
    """
    Render a profile dict as a short multi-line context string for LLM prompts.

    Candidate, target role, experience and skills lines are always present
    (empty when the key is missing). Projects (first three only), education
    and summary lines are added only when the profile has a truthy value.
    """
    name       = profile.get('name', '')
    job_role   = profile.get('job_role', '')
    experience = profile.get('experience', '')
    skills     = profile.get('skills', '')

    context = [
        f"Candidate: {name}",
        f"Target Role: {job_role}",
        f"Experience: {experience}",
        f"Skills: {skills}",
    ]

    projects = profile.get('projects')
    if projects:
        top_projects = projects[:3]
        context.append(f"Notable Projects: {', '.join(top_projects)}")

    education = profile.get('education')
    if education:
        context.append(f"Education: {education}")

    summary = profile.get('summary')
    if summary:
        context.append(f"Summary: {summary}")

    return '\n'.join(context)