File size: 14,520 Bytes

9b02fb4

"""
Resume Section Classifier – Inference Script

Takes raw resume text, splits it into sections, and classifies each section
into one of 8 categories with confidence scores.

Author: Lorenzo Scaturchio (gr8monk3ys)

Usage:
    # Classify a resume file
    python inference.py --file resume.txt

    # Classify inline text
    python inference.py --text "Bachelor of Science in Computer Science, MIT, 2023"

    # Use a custom model path
    python inference.py --model ./model_output/final_model --file resume.txt

    # Output as JSON
    python inference.py --file resume.txt --format json

    # Python API
    from inference import ResumeSectionClassifier
    classifier = ResumeSectionClassifier("./model_output/final_model")
    results = classifier.classify_resume(resume_text)
"""

import json
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class SectionPrediction:
    """Outcome of classifying one piece of resume text.

    Holds the classified text, the winning label, the probability assigned
    to that label, and (optionally) the score of every label.
    """
    text: str
    label: str
    confidence: float
    all_scores: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict view with all scores rounded to 4 decimals."""
        rounded_scores = {}
        for name, score in self.all_scores.items():
            rounded_scores[name] = round(score, 4)

        return dict(
            text=self.text,
            label=self.label,
            confidence=round(self.confidence, 4),
            all_scores=rounded_scores,
        )


@dataclass
class ResumeAnalysis:
    """Aggregated result of classifying every section of a resume.

    ``sections`` holds the per-section predictions (objects exposing
    ``text``, ``label``, ``confidence`` and ``to_dict()``), ``section_count``
    the number of sections, and ``label_distribution`` a label -> count map.
    """
    sections: list
    section_count: int = 0
    label_distribution: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict view, serialising each section via its ``to_dict``."""
        serialised = []
        for section in self.sections:
            serialised.append(section.to_dict())
        return dict(
            sections=serialised,
            section_count=self.section_count,
            label_distribution=self.label_distribution,
        )

    def to_json(self, indent: int = 2) -> str:
        """Return the ``to_dict`` payload encoded as a JSON string."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent)

    def summary(self) -> str:
        """Build a multi-line, human-readable report of the analysis."""
        report = [f"Resume Analysis: {self.section_count} sections detected"]
        report.append("=" * 50)

        for number, section in enumerate(self.sections, start=1):
            # Show at most the first 80 characters, flattened onto one line.
            preview = section.text[:80].replace("\n", " ")
            if len(section.text) > 80:
                preview = preview + "..."
            heading = (
                f"\n[{number}] {section.label.upper()} "
                f"(confidence: {section.confidence:.1%})"
            )
            report.extend((heading, f"    {preview}"))

        report.extend(("\n" + "-" * 50, "Label Distribution:"))
        report.extend(
            f"  {label}: {count}"
            for label, count in sorted(self.label_distribution.items())
        )

        return "\n".join(report)


# ---------------------------------------------------------------------------
# Section splitting heuristics
# ---------------------------------------------------------------------------

# Common resume section header patterns.
#
# Only the list of well-known section titles is matched case-insensitively.
# The ALL-CAPS pattern must stay case-sensitive: under re.IGNORECASE its
# [A-Z] classes also match lowercase letters, so any line made of letters,
# spaces, '&', '/' and commas (e.g. "Software Engineer at Google") would be
# treated as a header.
_MARKDOWN_HEADER = r"^#{1,3}\s+.+$"  # Markdown headers
_ALL_CAPS_HEADER = r"^[A-Z][A-Z\s&/,]{2,}$"  # ALL CAPS headers
_KNOWN_TITLE_HEADER = (  # Well-known section titles, optional trailing colon
    r"^(?:EDUCATION|EXPERIENCE|WORK EXPERIENCE|PROFESSIONAL EXPERIENCE|"
    r"SKILLS|TECHNICAL SKILLS|PROJECTS|PERSONAL PROJECTS|"
    r"SUMMARY|PROFESSIONAL SUMMARY|OBJECTIVE|PROFILE|ABOUT|"
    r"CERTIFICATIONS|CERTIFICATES|LICENSES|"
    r"CONTACT|CONTACT INFORMATION|PERSONAL INFORMATION|"
    r"AWARDS|HONORS|ACHIEVEMENTS|RECOGNITION|"
    r"PUBLICATIONS|REFERENCES|VOLUNTEER|LANGUAGES|INTERESTS|"
    r"ACTIVITIES|LEADERSHIP|RESEARCH)\s*:?\s*$"
)

SECTION_HEADER_PATTERNS = [
    _MARKDOWN_HEADER,
    _ALL_CAPS_HEADER,
    _KNOWN_TITLE_HEADER,
]

# Same order as SECTION_HEADER_PATTERNS; flags are chosen per pattern.
COMPILED_HEADERS = [
    re.compile(_MARKDOWN_HEADER, re.MULTILINE),
    re.compile(_ALL_CAPS_HEADER, re.MULTILINE),
    re.compile(_KNOWN_TITLE_HEADER, re.MULTILINE | re.IGNORECASE),
]


def is_section_header(line: str) -> bool:
    """
    Decide whether a single line should be treated as a section header.

    A line qualifies when, after trimming surrounding whitespace, it has at
    least 3 characters and either matches one of the compiled header
    patterns or is a short upper-case line (at most 5 words, under 50
    characters).

    Args:
        line: One line of resume text.

    Returns:
        True if the line looks like a header, False otherwise.
    """
    candidate = line.strip()

    # Empty and 1-2 character lines are never headers.
    if len(candidate) < 3:
        return False

    if any(regex.match(candidate) for regex in COMPILED_HEADERS):
        return True

    # Heuristic fallback: a short line written entirely in upper case.
    return (
        candidate.isupper()
        and len(candidate) < 50
        and len(candidate.split()) <= 5
    )


def split_resume_into_sections(text: str, min_section_length: int = 20) -> list:
    """
    Break raw resume text into logical sections.

    The text is first cut at every line recognised as a section header; each
    header stays at the top of the section it opens. If no header is seen,
    or the header split leaves at most one section, the text is split on
    blank lines instead. Sections shorter than ``min_section_length``
    characters (after trimming) are discarded in both modes. When nothing
    survives, the whole trimmed text is returned as a single section (or an
    empty list if the text is blank).

    Args:
        text: Raw resume text.
        min_section_length: Minimum character length for a section.

    Returns:
        List of section strings.
    """

    def _long_enough(candidates):
        return [c for c in candidates if len(c) >= min_section_length]

    # Group consecutive lines; every header line opens a new group.
    groups = [[]]
    saw_header = False
    for raw_line in text.split("\n"):
        if is_section_header(raw_line):
            saw_header = True
            groups.append([raw_line])
        else:
            groups[-1].append(raw_line)

    # The leading group is empty when the very first line is a header.
    by_header = _long_enough(
        "\n".join(group).strip() for group in groups if group
    )

    if saw_header and len(by_header) > 1:
        result = by_header
    else:
        # Header splitting was not useful: split on paragraph breaks.
        result = _long_enough(
            paragraph.strip() for paragraph in re.split(r"\n\s*\n", text)
        )

    if result:
        return result

    # Nothing passed the length filter: return the whole text if non-blank.
    whole = text.strip()
    return [whole] if whole else []


# ---------------------------------------------------------------------------
# Classifier
# ---------------------------------------------------------------------------

class ResumeSectionClassifier:
    """
    Classifies resume text sections into categories.

    Wraps a fine-tuned sequence-classification model and its tokenizer and
    supports single-section, multi-section (batched) and full-resume
    classification.
    """

    def __init__(
        self,
        model_path: str = "./model_output/final_model",
        device: str = None,
        max_length: int = 256,
    ):
        """
        Initialize the classifier.

        Loads the tokenizer and model from ``model_path``, moves the model to
        the selected device and puts it in eval mode. Label names come from
        ``label_mapping.json`` inside ``model_path`` when that file exists,
        otherwise from the model config.

        Args:
            model_path: Path to fine-tuned model directory.
            device: Device string ('cpu', 'cuda', 'mps'). Auto-detected if None.
            max_length: Maximum token sequence length.
        """
        self.model_path = Path(model_path)
        self.max_length = max_length

        if device is None:
            self.device = self._detect_device()
        else:
            self.device = torch.device(device)

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(str(self.model_path))
        self.model = AutoModelForSequenceClassification.from_pretrained(
            str(self.model_path)
        ).to(self.device)
        self.model.eval()

        # Load label mapping
        label_mapping_path = self.model_path / "label_mapping.json"
        if label_mapping_path.exists():
            # Explicit UTF-8 so label names do not depend on the platform's
            # default encoding.
            with open(label_mapping_path, encoding="utf-8") as f:
                mapping = json.load(f)
            # JSON object keys are strings; predictions are indexed by int.
            self.id2label = {int(k): v for k, v in mapping["id2label"].items()}
            self.label2id = mapping["label2id"]
        else:
            # Fall back to model config
            self.id2label = self.model.config.id2label
            self.label2id = self.model.config.label2id

        self.labels = sorted(self.label2id.keys())

    @staticmethod
    def _detect_device() -> torch.device:
        """Pick CUDA, then MPS, then CPU, depending on what is available."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        # Guarded lookup: torch builds without the MPS backend module would
        # raise AttributeError on a direct `torch.backends.mps` access.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    def classify_section(self, text: str) -> SectionPrediction:
        """
        Classify a single text section.

        Delegates to ``classify_sections`` with a one-element batch so both
        entry points share one code path.

        Args:
            text: Section text to classify.

        Returns:
            SectionPrediction with label, confidence, and all scores.
        """
        return self.classify_sections([text])[0]

    def classify_sections(self, texts: list, batch_size: int = 32) -> list:
        """
        Classify multiple text sections (batched).

        Args:
            texts: List of section texts.
            batch_size: Maximum number of sections sent through the model in
                one forward pass; bounds memory use for long inputs.

        Returns:
            List of SectionPrediction objects, in the same order as ``texts``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        texts = list(texts)
        if not texts:
            return []

        results = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            inputs = self.tokenizer(
                batch,
                truncation=True,
                max_length=self.max_length,
                padding=True,
                return_tensors="pt",
            ).to(self.device)

            with torch.no_grad():
                logits = self.model(**inputs).logits
                probs = torch.softmax(logits, dim=-1)

            # Convert the whole batch once instead of calling .item() for
            # every single probability.
            top_ids = probs.argmax(dim=-1).tolist()
            score_rows = probs.tolist()

            for text, row, top_id in zip(batch, score_rows, top_ids):
                results.append(SectionPrediction(
                    text=text,
                    label=self.id2label[top_id],
                    confidence=row[top_id],
                    all_scores={
                        self.id2label[j]: score for j, score in enumerate(row)
                    },
                ))

        return results

    def classify_resume(
        self,
        resume_text: str,
        min_section_length: int = 20,
        batch_size: int = 32,
    ) -> ResumeAnalysis:
        """
        Classify a full resume by splitting into sections and classifying each.

        Args:
            resume_text: Full resume text.
            min_section_length: Minimum section length in characters.
            batch_size: Maximum number of sections per forward pass.

        Returns:
            ResumeAnalysis with all section predictions, their count and the
            number of sections assigned to each label.
        """
        sections = split_resume_into_sections(resume_text, min_section_length)
        predictions = self.classify_sections(sections, batch_size=batch_size)

        # Compute label distribution
        label_dist = {}
        for pred in predictions:
            label_dist[pred.label] = label_dist.get(pred.label, 0) + 1

        return ResumeAnalysis(
            sections=predictions,
            section_count=len(predictions),
            label_distribution=label_dist,
        )


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    """
    Command-line entry point.

    Parses the arguments, reads the input (file or inline text), loads the
    classifier, and prints either a single-section prediction (``--single``)
    or a full resume analysis, as text or JSON.

    Exits with status 1 and a message on stderr when the input file cannot
    be read or the model cannot be loaded.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Classify resume sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python inference.py --file resume.txt
  python inference.py --text "BS in Computer Science, MIT, 2023"
  python inference.py --file resume.txt --format json
  python inference.py --model ./model_output/final_model --file resume.txt
        """,
    )

    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--file", type=str, help="Path to resume text file")
    input_group.add_argument("--text", type=str, help="Direct text to classify")

    parser.add_argument("--model", type=str, default="./model_output/final_model",
                        help="Path to fine-tuned model (default: ./model_output/final_model)")
    parser.add_argument("--device", type=str, default=None,
                        help="Device: cpu, cuda, mps (auto-detected if omitted)")
    parser.add_argument("--max-length", type=int, default=256,
                        help="Maximum token sequence length (default: 256)")
    parser.add_argument("--min-section-length", type=int, default=20,
                        help="Minimum section length in characters (default: 20)")
    parser.add_argument("--format", type=str, choices=["text", "json"], default="text",
                        help="Output format (default: text)")
    parser.add_argument("--single", action="store_true",
                        help="Classify as single section (no splitting)")

    args = parser.parse_args()

    # Get input text first: reading a file is cheap, loading the model is not,
    # so a bad path should fail before the model is loaded.
    # `is not None` (rather than truthiness) keeps `--file ""` from silently
    # falling through to the unset --text value.
    if args.file is not None:
        try:
            text = Path(args.file).read_text(encoding="utf-8")
        except FileNotFoundError:
            print(f"File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as e:
            # e.g. the path is a directory, is unreadable, or is not UTF-8.
            print(f"Could not read '{args.file}': {e}", file=sys.stderr)
            sys.exit(1)
    else:
        text = args.text

    # Load classifier. The broad catch is deliberate at this top-level
    # boundary: any load failure is reported and turned into exit status 1.
    try:
        classifier = ResumeSectionClassifier(
            model_path=args.model,
            device=args.device,
            max_length=args.max_length,
        )
    except Exception as e:
        print(f"Error loading model from '{args.model}': {e}", file=sys.stderr)
        print("Have you trained the model yet? Run: python train.py", file=sys.stderr)
        sys.exit(1)

    # Classify
    if args.single:
        result = classifier.classify_section(text)
        if args.format == "json":
            print(json.dumps(result.to_dict(), indent=2))
        else:
            print(f"Label: {result.label}")
            print(f"Confidence: {result.confidence:.1%}")
            print("\nAll scores:")
            # Highest score first, each with a bar of up to 40 '#' characters.
            for label, score in sorted(result.all_scores.items(), key=lambda x: -x[1]):
                bar = "#" * int(score * 40)
                print(f"  {label:20s} {score:.4f} {bar}")
    else:
        analysis = classifier.classify_resume(text, min_section_length=args.min_section_length)
        if args.format == "json":
            print(analysis.to_json())
        else:
            print(analysis.summary())


# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()