# Uploaded by gr8monk3ys using huggingface_hub (commit 9b02fb4, verified).
"""
Resume Section Classifier – Inference Script

Takes raw resume text, splits it into sections, and classifies each section
into one of 8 categories with confidence scores.

Author: Lorenzo Scaturchio (gr8monk3ys)

Usage:
    # Classify a resume file
    python inference.py --file resume.txt

    # Classify inline text
    python inference.py --text "Bachelor of Science in Computer Science, MIT, 2023"

    # Use a custom model path
    python inference.py --model ./model_output/final_model --file resume.txt

    # Output as JSON
    python inference.py --file resume.txt --format json

    # Python API
    from inference import ResumeSectionClassifier
    classifier = ResumeSectionClassifier("./model_output/final_model")
    results = classifier.classify_resume(resume_text)
"""
import json
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class SectionPrediction:
    """Outcome of classifying one section of text.

    Attributes:
        text: The section text that was classified.
        label: The label chosen for the section.
        confidence: Score attached to the chosen label.
        all_scores: Mapping of every label to its score (empty by default).
    """

    text: str
    label: str
    confidence: float
    all_scores: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict view with every score rounded to 4 decimals."""
        rounded_scores = {}
        for name, score in self.all_scores.items():
            rounded_scores[name] = round(score, 4)

        payload = dict(text=self.text, label=self.label)
        payload["confidence"] = round(self.confidence, 4)
        payload["all_scores"] = rounded_scores
        return payload
@dataclass
class ResumeAnalysis:
    """Aggregated result of classifying the sections of one resume.

    Attributes:
        sections: Per-section prediction objects; each must expose
            ``to_dict()``, ``text``, ``label`` and ``confidence``.
        section_count: Number of sections reported (defaults to 0; it is
            stored as given, not derived from ``sections``).
        label_distribution: Mapping of label to how many sections got it.
    """

    sections: list
    section_count: int = 0
    label_distribution: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict view, serialising each section via ``to_dict()``."""
        serialized = []
        for section in self.sections:
            serialized.append(section.to_dict())
        return {
            "sections": serialized,
            "section_count": self.section_count,
            "label_distribution": self.label_distribution,
        }

    def to_json(self, indent: int = 2) -> str:
        """Return the ``to_dict()`` payload encoded as a JSON string."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent)

    def summary(self) -> str:
        """Build a human-readable, multi-line report.

        Each section contributes a heading with its upper-cased label and
        confidence, followed by a preview: the first 80 characters with
        newlines turned into spaces, plus "..." when the text is longer.
        The report ends with the label counts, sorted by label.
        """
        header = f"Resume Analysis: {self.section_count} sections detected"
        report = [header, "=" * 50]

        for position, section in enumerate(self.sections, start=1):
            preview = section.text[:80].replace("\n", " ")
            if len(section.text) > 80:
                preview = preview + "..."
            report.append(
                f"\n[{position}] {section.label.upper()} (confidence: {section.confidence:.1%})"
            )
            report.append(f" {preview}")

        report.append("\n" + "-" * 50)
        report.append("Label Distribution:")
        report.extend(
            f" {label}: {count}"
            for label, count in sorted(self.label_distribution.items())
        )
        return "\n".join(report)
# ---------------------------------------------------------------------------
# Section splitting heuristics
# ---------------------------------------------------------------------------
# Common resume section headers. Every pattern is compiled with IGNORECASE so
# the Markdown and keyword patterns accept any casing. The ALL CAPS pattern opts
# out through a scoped ``(?-i:...)`` group: under IGNORECASE it would match any
# line made only of letters, whitespace and ``& / ,`` (e.g. "Python, Java, SQL"),
# which turns ordinary content lines into false section headers.
SECTION_HEADER_PATTERNS = [
    r"^#{1,3}\s+.+$",  # Markdown headers
    r"^(?-i:[A-Z][A-Z\s&/,]{2,})$",  # ALL CAPS headers (kept case-sensitive)
    r"^(?:EDUCATION|EXPERIENCE|WORK EXPERIENCE|PROFESSIONAL EXPERIENCE|"
    r"SKILLS|TECHNICAL SKILLS|PROJECTS|PERSONAL PROJECTS|"
    r"SUMMARY|PROFESSIONAL SUMMARY|OBJECTIVE|PROFILE|ABOUT|"
    r"CERTIFICATIONS|CERTIFICATES|LICENSES|"
    r"CONTACT|CONTACT INFORMATION|PERSONAL INFORMATION|"
    r"AWARDS|HONORS|ACHIEVEMENTS|RECOGNITION|"
    r"PUBLICATIONS|REFERENCES|VOLUNTEER|LANGUAGES|INTERESTS|"
    r"ACTIVITIES|LEADERSHIP|RESEARCH)\s*:?\s*$",  # Known section names, optional colon
]

# Pre-compiled once at import time so header checks do not recompile per line.
COMPILED_HEADERS = [
    re.compile(pattern, re.MULTILINE | re.IGNORECASE)
    for pattern in SECTION_HEADER_PATTERNS
]
def is_section_header(line: str) -> bool:
    """Decide whether a single line looks like a section header.

    A line qualifies when, after stripping surrounding whitespace, it is at
    least 3 characters long and either matches one of the patterns in
    ``COMPILED_HEADERS`` or is an upper-case line of at most 5 words and
    fewer than 50 characters.

    Args:
        line: One line of resume text.

    Returns:
        True if the line is considered a header, otherwise False.
    """
    candidate = line.strip()

    # Blank lines and 1-2 character fragments are never headers.
    if len(candidate) < 3:
        return False

    if any(regex.match(candidate) for regex in COMPILED_HEADERS):
        return True

    # Heuristic fallback: a short line written entirely in upper case.
    word_count = len(candidate.split())
    return candidate.isupper() and word_count <= 5 and len(candidate) < 50
def split_resume_into_sections(text: str, min_section_length: int = 20) -> list:
    """
    Break raw resume text into logical sections.

    Strategy:
        1. Walk the lines and start a new section at every detected header.
        2. If no header was seen, or that produced at most one section,
           split on blank-line paragraph breaks instead.
        3. Drop fragments shorter than ``min_section_length``; if nothing
           survives, return the whole stripped text as one section.

    Args:
        text: Raw resume text.
        min_section_length: Minimum character length for a section.

    Returns:
        List of section strings (empty when the text is blank).
    """

    def _flush(buffered_lines: list, destination: list) -> None:
        # Join the buffered lines and keep the result only if it is long enough.
        if not buffered_lines:
            return
        chunk = "\n".join(buffered_lines).strip()
        if len(chunk) >= min_section_length:
            destination.append(chunk)

    # Pass 1: header-driven splitting.
    header_seen = False
    by_header = []
    buffer = []
    for raw_line in text.split("\n"):
        if not is_section_header(raw_line):
            buffer.append(raw_line)
            continue
        header_seen = True
        _flush(buffer, by_header)
        # The header line opens the next section.
        buffer = [raw_line]
    _flush(buffer, by_header)

    if header_seen and len(by_header) > 1:
        return by_header

    # Pass 2: paragraph-driven splitting on blank lines.
    stripped_paragraphs = (piece.strip() for piece in re.split(r"\n\s*\n", text))
    by_paragraph = [
        piece for piece in stripped_paragraphs if len(piece) >= min_section_length
    ]
    if by_paragraph:
        return by_paragraph

    # Last resort: the entire text as a single section, if there is any.
    whole = text.strip()
    return [whole] if whole else by_paragraph
# ---------------------------------------------------------------------------
# Classifier
# ---------------------------------------------------------------------------
class ResumeSectionClassifier:
    """
    Classifies resume text sections into categories.

    Wraps a sequence-classification model and its tokenizer loaded from
    ``model_path``. Supports single-section, batched, and full-resume
    classification; scores are softmax probabilities over the model's logits.
    """

    def __init__(
        self,
        model_path: str = "./model_output/final_model",
        device: str = None,
        max_length: int = 256,
    ):
        """
        Initialize the classifier.

        Args:
            model_path: Path to fine-tuned model directory.
            device: Device string ('cpu', 'cuda', 'mps'). Auto-detected if None.
            max_length: Maximum token sequence length; longer inputs are truncated.

        Raises:
            KeyError: If ``label_mapping.json`` exists but lacks the
                ``id2label`` or ``label2id`` entries.
        """
        self.model_path = Path(model_path)
        self.max_length = max_length

        if device is None:
            self.device = self._detect_device()
        else:
            self.device = torch.device(device)

        # Load model and tokenizer; eval() switches the model to inference mode.
        self.tokenizer = AutoTokenizer.from_pretrained(str(self.model_path))
        self.model = AutoModelForSequenceClassification.from_pretrained(
            str(self.model_path)
        ).to(self.device)
        self.model.eval()

        # Prefer the explicit mapping file; otherwise use the model config.
        label_mapping_path = self.model_path / "label_mapping.json"
        if label_mapping_path.exists():
            with open(label_mapping_path, encoding="utf-8") as f:
                mapping = json.load(f)
            raw_id2label = mapping["id2label"]
            self.label2id = mapping["label2id"]
        else:
            raw_id2label = self.model.config.id2label
            self.label2id = self.model.config.label2id

        # JSON object keys are strings; normalise to int so lookups by class
        # index work the same whichever source the mapping came from.
        self.id2label = {int(k): v for k, v in raw_id2label.items()}
        self.labels = sorted(self.label2id)

    @staticmethod
    def _detect_device() -> torch.device:
        """Pick CUDA, then MPS, then CPU, depending on what is available."""
        if torch.cuda.is_available():
            return torch.device("cuda")
        # Guard the attribute: torch builds without the MPS backend would
        # raise AttributeError on a direct ``torch.backends.mps`` access.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    def classify_section(self, text: str) -> "SectionPrediction":
        """
        Classify a single text section.

        Args:
            text: Section text to classify.

        Returns:
            SectionPrediction with label, confidence, and all scores.
        """
        # Delegate to the batched path so both entry points share one implementation.
        return self.classify_sections([text])[0]

    def classify_sections(self, texts: list, batch_size: int = 32) -> list:
        """
        Classify multiple text sections (batched).

        Args:
            texts: List of section texts.
            batch_size: Maximum number of sections per forward pass. Bounds
                memory use for inputs with many sections.

        Returns:
            List of SectionPrediction objects, in the same order as ``texts``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        if not texts:
            return []

        results = []
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            inputs = self.tokenizer(
                chunk,
                truncation=True,
                max_length=self.max_length,
                padding=True,
                return_tensors="pt",
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            probs = torch.softmax(outputs.logits, dim=-1)
            # Convert once per chunk instead of calling .item() per score.
            best_ids = probs.argmax(dim=-1).tolist()
            prob_rows = probs.tolist()

            for text, row, best_id in zip(chunk, prob_rows, best_ids):
                scores = {self.id2label[j]: p for j, p in enumerate(row)}
                results.append(SectionPrediction(
                    text=text,
                    label=self.id2label[best_id],
                    confidence=row[best_id],
                    all_scores=scores,
                ))
        return results

    def classify_resume(
        self,
        resume_text: str,
        min_section_length: int = 20,
    ) -> "ResumeAnalysis":
        """
        Classify a full resume by splitting into sections and classifying each.

        Args:
            resume_text: Full resume text.
            min_section_length: Minimum section length in characters.

        Returns:
            ResumeAnalysis with all section predictions, their count, and the
            number of sections assigned to each label.
        """
        sections = split_resume_into_sections(resume_text, min_section_length)
        predictions = self.classify_sections(sections)

        # Count how many sections received each label.
        label_dist = {}
        for pred in predictions:
            label_dist[pred.label] = label_dist.get(pred.label, 0) + 1

        return ResumeAnalysis(
            sections=predictions,
            section_count=len(predictions),
            label_distribution=label_dist,
        )
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point.

    Parses the arguments, resolves the input text (from ``--file`` or
    ``--text``), loads the classifier, and prints the result as text or JSON.
    Exits with status 1 if the input file is missing or unreadable, or if the
    model cannot be loaded.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Classify resume sections",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python inference.py --file resume.txt
python inference.py --text "BS in Computer Science, MIT, 2023"
python inference.py --file resume.txt --format json
python inference.py --model ./model_output/final_model --file resume.txt
""",
    )
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--file", type=str, help="Path to resume text file")
    input_group.add_argument("--text", type=str, help="Direct text to classify")
    parser.add_argument("--model", type=str, default="./model_output/final_model",
                        help="Path to fine-tuned model (default: ./model_output/final_model)")
    parser.add_argument("--device", type=str, default=None,
                        help="Device: cpu, cuda, mps (auto-detected if omitted)")
    parser.add_argument("--max-length", type=int, default=256,
                        help="Maximum token sequence length (default: 256)")
    parser.add_argument("--min-section-length", type=int, default=20,
                        help="Minimum section length in characters (default: 20)")
    parser.add_argument("--format", type=str, choices=["text", "json"], default="text",
                        help="Output format (default: text)")
    parser.add_argument("--single", action="store_true",
                        help="Classify as single section (no splitting)")
    args = parser.parse_args()

    # Resolve the input first so a bad path fails before the model is loaded.
    # ``is not None`` (rather than truthiness) keeps an empty --file value on
    # the file branch instead of falling through to a missing --text.
    if args.file is not None:
        file_path = Path(args.file)
        if not file_path.exists():
            print(f"File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        try:
            text = file_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading '{args.file}': {e}", file=sys.stderr)
            sys.exit(1)
    else:
        text = args.text

    # Load classifier. This is the top-level boundary, so any failure is
    # reported to the user and converted into a non-zero exit status.
    try:
        classifier = ResumeSectionClassifier(
            model_path=args.model,
            device=args.device,
            max_length=args.max_length,
        )
    except Exception as e:
        print(f"Error loading model from '{args.model}': {e}", file=sys.stderr)
        print("Have you trained the model yet? Run: python train.py", file=sys.stderr)
        sys.exit(1)

    # Classify
    if args.single:
        result = classifier.classify_section(text)
        if args.format == "json":
            print(json.dumps(result.to_dict(), indent=2))
        else:
            print(f"Label: {result.label}")
            print(f"Confidence: {result.confidence:.1%}")
            print("\nAll scores:")
            # Highest score first, with a bar of up to 40 '#' characters.
            for label, score in sorted(result.all_scores.items(), key=lambda x: -x[1]):
                bar = "#" * int(score * 40)
                print(f" {label:20s} {score:.4f} {bar}")
    else:
        analysis = classifier.classify_resume(text, min_section_length=args.min_section_length)
        if args.format == "json":
            print(analysis.to_json())
        else:
            print(analysis.summary())


if __name__ == "__main__":
    main()