# resume-parser-ml / inference.py
# (Hugging Face file-page residue, commented out so the module parses:)
# vvirothi's picture
# Update inference.py
# e172461 verified
import joblib
import numpy as np
import pandas as pd
import pdfplumber
import fitz
import re
from huggingface_hub import hf_hub_download
# =========================================================
# 1. COPY YOUR ORIGINAL FUNCTIONS HERE (VERY IMPORTANT)
# =========================================================
# ----- Replace these placeholder functions with your REAL code -----
def clean_text(text):
    """Normalize raw text: lowercase, strip HTML tags, keep letters only,
    and collapse whitespace. Missing values (None / NaN) become ""."""
    # pd.isna handles None, float('nan') and pandas NA sentinels.
    if pd.isna(text):
        return ""
    lowered = text.lower()
    # Drop HTML tags first, then every character that is not a letter
    # or whitespace (digits and punctuation are removed).
    no_markup = re.sub(r'<.*?>', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', no_markup)
    # split()/join collapses any run of whitespace to a single space.
    return ' '.join(letters_only.split())
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, trying several backends in order.

    Tries pdfplumber, then PyPDF2, then PyMuPDF (fitz), returning the first
    non-empty result. Each backend starts from a clean buffer so a partial
    extraction from a failed backend is not duplicated by the next one.

    Returns the extracted text, or the sentinel string
    "Unable to extract text from PDF" when every backend fails.
    """
    text = ""
    try:
        # Method 1: pdfplumber (layout-aware extraction).
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"pdfplumber failed for {pdf_path}: {e}")
    try:
        # Method 2: PyPDF2. Imported lazily because the module-level imports
        # do not include it; originally this raised a NameError that was
        # silently swallowed by the broad except, so this fallback never ran.
        import PyPDF2
        text = ""  # discard partial output from the previous method
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"PyPDF2 failed for {pdf_path}: {e}")
    try:
        # Method 3: PyMuPDF (fitz) - generally the most robust backend.
        text = ""  # start fresh, same reason as above
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                page_text = page.get_text()
                if page_text:
                    text += page_text + "\n"
        finally:
            # Close the document even if extraction raises mid-way.
            doc.close()
        if text.strip():
            return text
    except Exception as e:
        print(f"PyMuPDF failed for {pdf_path}: {e}")
    return text if text.strip() else "Unable to extract text from PDF"
def extract_skills(text):
    """Extract known skills from resume text.

    Each keyword is matched case-insensitively as a whole word/phrase, so
    e.g. 'java' is not reported for a resume that only says 'javascript',
    nor 'excel' for 'excellent' (the original substring check did both).

    Returns a comma-separated string of the skills found, in the fixed
    keyword-list order below.
    """
    skill_keywords = [
        'python', 'java', 'javascript', 'sql', 'html', 'css', 'react', 'angular',
        'machine learning', 'data analysis', 'excel', 'powerbi', 'tableau',
        'project management', 'communication', 'leadership', 'teamwork',
        'problem solving', 'analytical', 'creative', 'organizational',
        'aws', 'azure', 'docker', 'kubernetes', 'git', 'linux',
        'tensorflow', 'pytorch', 'pandas', 'numpy', 'sklearn'
    ]
    text_lower = text.lower()
    # \b anchors restrict each keyword to whole-word (or whole-phrase) hits.
    found_skills = [
        skill for skill in skill_keywords
        if re.search(r'\b' + re.escape(skill) + r'\b', text_lower)
    ]
    return ', '.join(found_skills)
def extract_experience_years(text):
    """Return the first number of years of experience found in the text.

    Patterns are tried in priority order ("N years of experience" first,
    then "N+ years", "over N years", "more than N years"); 0 when nothing
    matches.
    """
    patterns = (
        r'(\d+)\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)',
        r'(\d+)\+\s*(?:years?|yrs?)',
        r'over\s*(\d+)\s*(?:years?|yrs?)',
        r'more\s*than\s*(\d+)\s*(?:years?|yrs?)',
    )
    lowered = text.lower()
    for pattern in patterns:
        if (hit := re.search(pattern, lowered)) is not None:
            return int(hit.group(1))
    return 0
def extract_education_level(text):
    """Infer the highest education level mentioned in resume text.

    Returns an ordinal: 4 = doctorate, 3 = master's, 2 = bachelor's,
    1 = diploma/certificate, 0 = nothing detected.

    Keywords are matched as whole words (with an optional plural /
    possessive "s", so "masters" and "master's" still count). The original
    substring check made short abbreviations fire inside ordinary words —
    'ms' in "systems", 'ma' in "management", 'ba' in "database" — which
    inflated nearly every resume to level 3.
    """
    education_keywords = {
        'phd': 4, 'doctorate': 4, 'doctoral': 4,
        'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'degree': 2, 'ba': 2, 'bs': 2,
        'diploma': 1, 'certificate': 1
    }
    text_lower = text.lower()
    max_level = 0
    for keyword, level in education_keywords.items():
        # Whole-word match, allowing "keywords" / "keyword's" variants.
        if re.search(r'\b' + re.escape(keyword) + r"(?:'?s)?\b", text_lower):
            max_level = max(max_level, level)
    return max_level
def count_technical_terms(text):
    """Count occurrences of a fixed vocabulary of technical terms.

    Counting is case-insensitive and substring-based (e.g. 'system' also
    counts inside 'systems'); overlapping terms each contribute their own
    occurrence counts.
    """
    technical_terms = (
        'algorithm', 'database', 'software', 'development', 'programming',
        'analysis', 'system', 'design', 'implementation', 'optimization',
        'automation', 'testing', 'debugging', 'framework', 'api',
    )
    lowered = text.lower()
    return sum(lowered.count(term) for term in technical_terms)
# =========================================================
# 2. LOAD MODEL + PREPROCESSING ARTIFACTS
# =========================================================
# Hugging Face Hub repo holding the serialized pipeline artifacts.
HF_MODEL_REPO = "vvirothi/resume-parser-ml-model"  # <- change to your real repo
# joblib dump containing the model + preprocessing objects (see load_artifacts).
MODEL_FILENAME = "resume_parser_pipeline.joblib"
def load_artifacts():
    """Fetch the model pipeline from the Hugging Face Hub and deserialize it.

    Returns the object stored in the joblib file — used elsewhere in this
    module as a dict with keys "model", "tfidf", "scaler", "label_encoder"
    and "numerical_cols".
    """
    # hf_hub_download caches the file locally and returns its filesystem path.
    cached_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=MODEL_FILENAME)
    return joblib.load(cached_path)
# =========================================================
# 3. FEATURE PREPARATION
# =========================================================
def prepare_features_from_text(raw_text: str, artifacts: dict):
    """Turn raw resume text into the exact feature matrix used at training.

    Concatenates dense TF-IDF features over the cleaned text with scaled
    numerical features, in the training-time column order given by
    artifacts["numerical_cols"].

    Args:
        raw_text: resume text as extracted from the PDF.
        artifacts: dict with "tfidf" (fitted vectorizer), "scaler"
            (fitted scaler) and "numerical_cols" (column-order list).

    Returns:
        A (1, n_features) numpy array: [tfidf | scaled numericals].

    Note: the original version also computed extract_skills(raw_text) into
    a local that was never used — that dead work has been removed.
    """
    tfidf = artifacts["tfidf"]
    scaler = artifacts["scaler"]
    numerical_cols = artifacts["numerical_cols"]

    cleaned = clean_text(raw_text)

    # Hand-crafted numerical features, keyed by their training column names.
    numeric_features = {
        'Experience_Years': extract_experience_years(raw_text),
        'Education_Level': extract_education_level(raw_text),
        'Technical_Terms_Count': count_technical_terms(raw_text),
        'Resume_Length': len(raw_text),
        'Word_Count': len(raw_text.split()),
    }

    # Dense TF-IDF vector for the cleaned text.
    text_vec = tfidf.transform([cleaned]).toarray()

    # Numerical features in the exact training order, then scaled.
    num_vec_raw = np.array([[numeric_features[col] for col in numerical_cols]])
    num_vec_scaled = scaler.transform(num_vec_raw)

    # Final layout must match training: text features first, then numericals.
    return np.hstack([text_vec, num_vec_scaled])
# =========================================================
# 4. MAIN PREDICTION FUNCTION
# =========================================================
def predict_from_pdf(pdf_path: str):
    """Run the full pipeline on a PDF resume.

    Returns a tuple (predicted_label, proba_dict_or_None, raw_text), where
    proba_dict maps class label -> probability when the model supports
    predict_proba, and is None otherwise.
    """
    # Artifacts are re-loaded on every call; hf_hub_download caches the
    # file locally so repeat calls avoid re-downloading.
    artifacts = load_artifacts()
    model = artifacts["model"]
    label_encoder = artifacts["label_encoder"]

    raw_text = extract_text_from_pdf(pdf_path)
    features = prepare_features_from_text(raw_text, artifacts)

    encoded = model.predict(features)
    predicted_label = label_encoder.inverse_transform(encoded)[0]

    proba_dict = None
    if hasattr(model, "predict_proba"):
        probabilities = model.predict_proba(features)[0]
        # NOTE(review): this assumes predict_proba's columns line up with
        # encoded labels 0..n-1 (i.e. model.classes_ == range(n)) — confirm
        # against the training code.
        class_names = label_encoder.inverse_transform(range(len(probabilities)))
        proba_dict = {name: float(p) for name, p in zip(class_names, probabilities)}

    return predicted_label, proba_dict, raw_text