|
|
""" |
|
|
Resume Section Classifier – Inference Script |
|
|
|
|
|
Takes raw resume text, splits it into sections, and classifies each section |
|
|
into one of 8 categories with confidence scores. |
|
|
|
|
|
Author: Lorenzo Scaturchio (gr8monk3ys) |
|
|
|
|
|
Usage: |
|
|
# Classify a resume file |
|
|
python inference.py --file resume.txt |
|
|
|
|
|
# Classify inline text |
|
|
python inference.py --text "Bachelor of Science in Computer Science, MIT, 2023" |
|
|
|
|
|
# Use a custom model path |
|
|
python inference.py --model ./model_output/final_model --file resume.txt |
|
|
|
|
|
# Output as JSON |
|
|
python inference.py --file resume.txt --format json |
|
|
|
|
|
# Python API |
|
|
from inference import ResumeSectionClassifier |
|
|
classifier = ResumeSectionClassifier("./model_output/final_model") |
|
|
results = classifier.classify_resume(resume_text) |
|
|
""" |
|
|
|
|
|
import json |
|
|
import re |
|
|
import sys |
|
|
from dataclasses import dataclass, field |
|
|
from pathlib import Path |
|
|
|
|
|
import torch |
|
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class SectionPrediction:
    """A single section classification result."""

    # Raw text of the classified section.
    text: str
    # Predicted category label.
    label: str
    # Probability of the predicted label.
    confidence: float
    # Full label -> probability map for this section.
    all_scores: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize to a plain dict, rounding probabilities to 4 decimals."""
        rounded_scores = {}
        for name, score in self.all_scores.items():
            rounded_scores[name] = round(score, 4)
        return {
            "text": self.text,
            "label": self.label,
            "confidence": round(self.confidence, 4),
            "all_scores": rounded_scores,
        }
|
|
|
|
|
|
|
|
@dataclass
class ResumeAnalysis:
    """Complete resume analysis output."""

    # SectionPrediction objects, in document order.
    sections: list
    # Number of sections detected.
    section_count: int = 0
    # label -> count of sections with that label.
    label_distribution: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Serialize the analysis (and every section) to a plain dict."""
        return {
            "sections": [section.to_dict() for section in self.sections],
            "section_count": self.section_count,
            "label_distribution": self.label_distribution,
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialize the analysis to a JSON string."""
        return json.dumps(self.to_dict(), indent=indent)

    def summary(self) -> str:
        """Human-readable summary."""
        out = [
            f"Resume Analysis: {self.section_count} sections detected",
            "=" * 50,
        ]
        for idx, section in enumerate(self.sections, 1):
            # Preview only the first 80 characters, flattened to one line.
            preview = section.text[:80].replace("\n", " ")
            if len(section.text) > 80:
                preview = preview + "..."
            out.append(
                f"\n[{idx}] {section.label.upper()} (confidence: {section.confidence:.1%})"
            )
            out.append(f"    {preview}")

        out.append("\n" + "-" * 50)
        out.append("Label Distribution:")
        for name, count in sorted(self.label_distribution.items()):
            out.append(f"  {name}: {count}")

        return "\n".join(out)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Patterns that identify resume section headers:
#   1. Markdown-style headers ("# Education").
#   2. All-caps lines ("WORK EXPERIENCE") — must stay case-sensitive.
#   3. Known section keywords in any case ("Education:", "skills").
SECTION_HEADER_PATTERNS = [
    r"^#{1,3}\s+.+$",
    r"^[A-Z][A-Z\s&/,]{2,}$",
    r"^(?:EDUCATION|EXPERIENCE|WORK EXPERIENCE|PROFESSIONAL EXPERIENCE|"
    r"SKILLS|TECHNICAL SKILLS|PROJECTS|PERSONAL PROJECTS|"
    r"SUMMARY|PROFESSIONAL SUMMARY|OBJECTIVE|PROFILE|ABOUT|"
    r"CERTIFICATIONS|CERTIFICATES|LICENSES|"
    r"CONTACT|CONTACT INFORMATION|PERSONAL INFORMATION|"
    r"AWARDS|HONORS|ACHIEVEMENTS|RECOGNITION|"
    r"PUBLICATIONS|REFERENCES|VOLUNTEER|LANGUAGES|INTERESTS|"
    r"ACTIVITIES|LEADERSHIP|RESEARCH)\s*:?\s*$",
]

# BUG FIX: previously every pattern was compiled with re.IGNORECASE, which
# made the all-caps pattern (index 1) match ANY short line of letters and
# spaces — ordinary lowercase body text was misdetected as a header. Only the
# keyword pattern should be case-insensitive; the all-caps pattern derives its
# meaning from case.
COMPILED_HEADERS = [
    re.compile(p, re.MULTILINE if i == 1 else re.MULTILINE | re.IGNORECASE)
    for i, p in enumerate(SECTION_HEADER_PATTERNS)
]
|
|
|
|
|
|
|
|
def is_section_header(line: str) -> bool:
    """Check if a line looks like a section header."""
    candidate = line.strip()
    # Very short lines (including blanks) can never be headers.
    if len(candidate) < 3:
        return False

    if any(pattern.match(candidate) for pattern in COMPILED_HEADERS):
        return True

    # Heuristic fallback: a short, fully upper-case line reads as a header.
    return (
        candidate.isupper()
        and len(candidate.split()) <= 5
        and len(candidate) < 50
    )
|
|
|
|
|
|
|
|
def split_resume_into_sections(text: str, min_section_length: int = 20) -> list:
    """
    Split raw resume text into logical sections.

    Strategy:
    1. First try to split on detected section headers.
    2. Fall back to splitting on double newlines (paragraph breaks).
    3. Filter out very short fragments.

    Args:
        text: Raw resume text.
        min_section_length: Minimum character length for a section.

    Returns:
        List of text sections.
    """

    def flush(buffer: list, out: list) -> None:
        # Join the buffered lines and keep the chunk only if long enough.
        chunk = "\n".join(buffer).strip()
        if len(chunk) >= min_section_length:
            out.append(chunk)

    sections = []
    buffer = []
    saw_header = False

    for line in text.split("\n"):
        if is_section_header(line):
            saw_header = True
            # A header closes the previous section and starts a new one.
            if buffer:
                flush(buffer, sections)
            buffer = [line]
        else:
            buffer.append(line)

    # Flush whatever remains after the last line.
    if buffer:
        flush(buffer, sections)

    # Header-based splitting failed or was trivial: fall back to paragraphs.
    if not saw_header or len(sections) <= 1:
        sections = [
            para.strip()
            for para in re.split(r"\n\s*\n", text)
            if len(para.strip()) >= min_section_length
        ]

    # Last resort: treat the whole (non-empty) text as a single section.
    if not sections:
        whole = text.strip()
        if whole:
            sections = [whole]

    return sections
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ResumeSectionClassifier:
    """
    Classifies resume text sections into categories.

    Supports both single-section and full-resume classification.
    """

    def __init__(
        self,
        model_path: str = "./model_output/final_model",
        device: str = None,
        max_length: int = 256,
    ):
        """
        Initialize the classifier.

        Args:
            model_path: Path to fine-tuned model directory.
            device: Device string ('cpu', 'cuda', 'mps'). Auto-detected if None.
            max_length: Maximum token sequence length.
        """
        self.model_path = Path(model_path)
        self.max_length = max_length

        # Auto-detect the best available device unless one was given.
        if device is None:
            if torch.cuda.is_available():
                self.device = torch.device("cuda")
            elif torch.backends.mps.is_available():
                self.device = torch.device("mps")
            else:
                self.device = torch.device("cpu")
        else:
            self.device = torch.device(device)

        self.tokenizer = AutoTokenizer.from_pretrained(str(self.model_path))
        self.model = AutoModelForSequenceClassification.from_pretrained(
            str(self.model_path)
        ).to(self.device)
        self.model.eval()

        # Prefer the explicit label mapping saved at training time; fall back
        # to whatever mapping the model config carries.
        label_mapping_path = self.model_path / "label_mapping.json"
        if label_mapping_path.exists():
            with open(label_mapping_path) as f:
                mapping = json.load(f)
            # JSON object keys are strings; convert back to int ids.
            self.id2label = {int(k): v for k, v in mapping["id2label"].items()}
            self.label2id = mapping["label2id"]
        else:
            self.id2label = self.model.config.id2label
            self.label2id = self.model.config.label2id

        self.labels = sorted(self.label2id.keys())

    def classify_section(self, text: str) -> SectionPrediction:
        """
        Classify a single text section.

        Delegates to classify_sections so the tokenize/softmax/argmax
        pipeline exists in exactly one place (previously duplicated here).

        Args:
            text: Section text to classify.

        Returns:
            SectionPrediction with label, confidence, and all scores.
        """
        return self.classify_sections([text])[0]

    def classify_sections(self, texts: list) -> list:
        """
        Classify multiple text sections (batched).

        Args:
            texts: List of section texts.

        Returns:
            List of SectionPrediction objects.
        """
        if not texts:
            return []

        inputs = self.tokenizer(
            texts,
            truncation=True,
            max_length=self.max_length,
            padding=True,
            return_tensors="pt",
        ).to(self.device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)

        results = []
        for i, text in enumerate(texts):
            scores = {self.id2label[j]: probs[i][j].item() for j in range(probs.shape[1])}
            predicted_id = probs[i].argmax().item()
            results.append(SectionPrediction(
                text=text,
                label=self.id2label[predicted_id],
                confidence=probs[i][predicted_id].item(),
                all_scores=scores,
            ))

        return results

    def classify_resume(
        self,
        resume_text: str,
        min_section_length: int = 20,
    ) -> ResumeAnalysis:
        """
        Classify a full resume by splitting into sections and classifying each.

        Args:
            resume_text: Full resume text.
            min_section_length: Minimum section length in characters.

        Returns:
            ResumeAnalysis with all section predictions.
        """
        sections = split_resume_into_sections(resume_text, min_section_length)
        predictions = self.classify_sections(sections)

        # Count how many sections landed in each label.
        label_dist = {}
        for pred in predictions:
            label_dist[pred.label] = label_dist.get(pred.label, 0) + 1

        return ResumeAnalysis(
            sections=predictions,
            section_count=len(predictions),
            label_distribution=label_dist,
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, load the model, classify, and print."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Classify resume sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python inference.py --file resume.txt
  python inference.py --text "BS in Computer Science, MIT, 2023"
  python inference.py --file resume.txt --format json
  python inference.py --model ./model_output/final_model --file resume.txt
""",
    )

    # Exactly one input source is required: a file or inline text.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--file", type=str, help="Path to resume text file")
    source.add_argument("--text", type=str, help="Direct text to classify")

    parser.add_argument(
        "--model",
        type=str,
        default="./model_output/final_model",
        help="Path to fine-tuned model (default: ./model_output/final_model)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device: cpu, cuda, mps (auto-detected if omitted)",
    )
    parser.add_argument(
        "--max-length",
        type=int,
        default=256,
        help="Maximum token sequence length (default: 256)",
    )
    parser.add_argument(
        "--min-section-length",
        type=int,
        default=20,
        help="Minimum section length in characters (default: 20)",
    )
    parser.add_argument(
        "--format",
        type=str,
        choices=["text", "json"],
        default="text",
        help="Output format (default: text)",
    )
    parser.add_argument(
        "--single",
        action="store_true",
        help="Classify as single section (no splitting)",
    )

    args = parser.parse_args()

    # Fail fast with a helpful hint when the model directory is unusable.
    try:
        classifier = ResumeSectionClassifier(
            model_path=args.model,
            device=args.device,
            max_length=args.max_length,
        )
    except Exception as e:
        print(f"Error loading model from '{args.model}': {e}", file=sys.stderr)
        print("Have you trained the model yet? Run: python train.py", file=sys.stderr)
        sys.exit(1)

    # Resolve the input text from whichever source was given.
    if args.file:
        resume_path = Path(args.file)
        if not resume_path.exists():
            print(f"File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        text = resume_path.read_text(encoding="utf-8")
    else:
        text = args.text

    if args.single:
        prediction = classifier.classify_section(text)
        if args.format == "json":
            print(json.dumps(prediction.to_dict(), indent=2))
        else:
            print(f"Label: {prediction.label}")
            print(f"Confidence: {prediction.confidence:.1%}")
            print("\nAll scores:")
            # Show scores best-first with a simple ASCII bar chart.
            ranked = sorted(prediction.all_scores.items(), key=lambda item: item[1], reverse=True)
            for label, score in ranked:
                bar = "#" * int(score * 40)
                print(f"  {label:20s} {score:.4f} {bar}")
    else:
        analysis = classifier.classify_resume(text, min_section_length=args.min_section_length)
        if args.format == "json":
            print(analysis.to_json())
        else:
            print(analysis.summary())
|
|
|
|
|
|
|
|
# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|
|