# resume-parser-ml / inference.py
# (Hugging Face file-page residue, commented out so the module parses:)
# vvirothi's picture
# Update inference.py
# e172461 verified
import joblib
import numpy as np
import pandas as pd
import pdfplumber
import fitz
import re
from huggingface_hub import hf_hub_download
# =========================================================
# 1. COPY YOUR ORIGINAL FUNCTIONS HERE (VERY IMPORTANT)
# =========================================================
# ----- Replace these placeholder functions with your REAL code -----
def clean_text(text):
    """Normalize raw text: lowercase, strip HTML tags, keep letters only,
    and collapse whitespace. Missing values (None / NaN) become ""."""
    # pd.isna handles None, float('nan') and pandas NA sentinels.
    if pd.isna(text):
        return ""
    lowered = text.lower()
    # Drop HTML tags first, then every character that is not a letter
    # or whitespace (digits and punctuation are removed).
    no_markup = re.sub(r'<.*?>', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', no_markup)
    # split()/join collapses any run of whitespace to a single space.
    return ' '.join(letters_only.split())
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, trying several backends in order.

    Tries pdfplumber, then PyPDF2, then PyMuPDF (fitz), returning the first
    non-empty result. Each backend starts from a clean buffer so a partial
    extraction from a failed backend is not duplicated by the next one.

    Returns the extracted text, or the sentinel string
    "Unable to extract text from PDF" when every backend fails.
    """
    text = ""
    try:
        # Method 1: pdfplumber (layout-aware extraction).
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"pdfplumber failed for {pdf_path}: {e}")
    try:
        # Method 2: PyPDF2. Imported lazily because the module-level imports
        # do not include it; originally this raised a NameError that was
        # silently swallowed by the broad except, so this fallback never ran.
        import PyPDF2
        text = ""  # discard partial output from the previous method
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"PyPDF2 failed for {pdf_path}: {e}")
    try:
        # Method 3: PyMuPDF (fitz) - generally the most robust backend.
        text = ""  # start fresh, same reason as above
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                page_text = page.get_text()
                if page_text:
                    text += page_text + "\n"
        finally:
            # Close the document even if extraction raises mid-way.
            doc.close()
        if text.strip():
            return text
    except Exception as e:
        print(f"PyMuPDF failed for {pdf_path}: {e}")
    return text if text.strip() else "Unable to extract text from PDF"
def extract_skills(text):
    """Extract known skills from resume text.

    Each keyword is matched case-insensitively as a whole word/phrase, so
    e.g. 'java' is not reported for a resume that only says 'javascript',
    nor 'excel' for 'excellent' (the original substring check did both).

    Returns a comma-separated string of the skills found, in the fixed
    keyword-list order below.
    """
    skill_keywords = [
        'python', 'java', 'javascript', 'sql', 'html', 'css', 'react', 'angular',
        'machine learning', 'data analysis', 'excel', 'powerbi', 'tableau',
        'project management', 'communication', 'leadership', 'teamwork',
        'problem solving', 'analytical', 'creative', 'organizational',
        'aws', 'azure', 'docker', 'kubernetes', 'git', 'linux',
        'tensorflow', 'pytorch', 'pandas', 'numpy', 'sklearn'
    ]
    text_lower = text.lower()
    # \b anchors restrict each keyword to whole-word (or whole-phrase) hits.
    found_skills = [
        skill for skill in skill_keywords
        if re.search(r'\b' + re.escape(skill) + r'\b', text_lower)
    ]
    return ', '.join(found_skills)
def extract_experience_years(text):
    """Return the first number of years of experience found in the text.

    Patterns are tried in priority order ("N years of experience" first,
    then "N+ years", "over N years", "more than N years"); 0 when nothing
    matches.
    """
    patterns = (
        r'(\d+)\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)',
        r'(\d+)\+\s*(?:years?|yrs?)',
        r'over\s*(\d+)\s*(?:years?|yrs?)',
        r'more\s*than\s*(\d+)\s*(?:years?|yrs?)',
    )
    lowered = text.lower()
    for pattern in patterns:
        if (hit := re.search(pattern, lowered)) is not None:
            return int(hit.group(1))
    return 0
def extract_education_level(text):
    """Infer the highest education level mentioned in resume text.

    Returns an ordinal: 4 = doctorate, 3 = master's, 2 = bachelor's,
    1 = diploma/certificate, 0 = nothing detected.

    Keywords are matched as whole words (with an optional plural /
    possessive "s", so "masters" and "master's" still count). The original
    substring check made short abbreviations fire inside ordinary words —
    'ms' in "systems", 'ma' in "management", 'ba' in "database" — which
    inflated nearly every resume to level 3.
    """
    education_keywords = {
        'phd': 4, 'doctorate': 4, 'doctoral': 4,
        'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'degree': 2, 'ba': 2, 'bs': 2,
        'diploma': 1, 'certificate': 1
    }
    text_lower = text.lower()
    max_level = 0
    for keyword, level in education_keywords.items():
        # Whole-word match, allowing "keywords" / "keyword's" variants.
        if re.search(r'\b' + re.escape(keyword) + r"(?:'?s)?\b", text_lower):
            max_level = max(max_level, level)
    return max_level
def count_technical_terms(text):
    """Count occurrences of a fixed vocabulary of technical terms.

    Counting is case-insensitive and substring-based (e.g. 'system' also
    counts inside 'systems'); overlapping terms each contribute their own
    occurrence counts.
    """
    technical_terms = (
        'algorithm', 'database', 'software', 'development', 'programming',
        'analysis', 'system', 'design', 'implementation', 'optimization',
        'automation', 'testing', 'debugging', 'framework', 'api',
    )
    lowered = text.lower()
    return sum(lowered.count(term) for term in technical_terms)
# =========================================================
# 2. LOAD MODEL + PREPROCESSING ARTIFACTS
# =========================================================
# Hugging Face Hub repo holding the serialized pipeline artifacts.
HF_MODEL_REPO = "vvirothi/resume-parser-ml-model"  # <- change to your real repo
# joblib dump containing the model + preprocessing objects (see load_artifacts).
MODEL_FILENAME = "resume_parser_pipeline.joblib"
def load_artifacts():
    """Fetch the model pipeline from the Hugging Face Hub and deserialize it.

    Returns the object stored in the joblib file — used elsewhere in this
    module as a dict with keys "model", "tfidf", "scaler", "label_encoder"
    and "numerical_cols".
    """
    # hf_hub_download caches the file locally and returns its filesystem path.
    cached_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=MODEL_FILENAME)
    return joblib.load(cached_path)
# =========================================================
# 3. FEATURE PREPARATION
# =========================================================
def prepare_features_from_text(raw_text: str, artifacts: dict):
    """Turn raw resume text into the exact feature matrix used at training.

    Concatenates dense TF-IDF features over the cleaned text with scaled
    numerical features, in the training-time column order given by
    artifacts["numerical_cols"].

    Args:
        raw_text: resume text as extracted from the PDF.
        artifacts: dict with "tfidf" (fitted vectorizer), "scaler"
            (fitted scaler) and "numerical_cols" (column-order list).

    Returns:
        A (1, n_features) numpy array: [tfidf | scaled numericals].

    Note: the original version also computed extract_skills(raw_text) into
    a local that was never used — that dead work has been removed.
    """
    tfidf = artifacts["tfidf"]
    scaler = artifacts["scaler"]
    numerical_cols = artifacts["numerical_cols"]

    cleaned = clean_text(raw_text)

    # Hand-crafted numerical features, keyed by their training column names.
    numeric_features = {
        'Experience_Years': extract_experience_years(raw_text),
        'Education_Level': extract_education_level(raw_text),
        'Technical_Terms_Count': count_technical_terms(raw_text),
        'Resume_Length': len(raw_text),
        'Word_Count': len(raw_text.split()),
    }

    # Dense TF-IDF vector for the cleaned text.
    text_vec = tfidf.transform([cleaned]).toarray()

    # Numerical features in the exact training order, then scaled.
    num_vec_raw = np.array([[numeric_features[col] for col in numerical_cols]])
    num_vec_scaled = scaler.transform(num_vec_raw)

    # Final layout must match training: text features first, then numericals.
    return np.hstack([text_vec, num_vec_scaled])
# =========================================================
# 4. MAIN PREDICTION FUNCTION
# =========================================================
def predict_from_pdf(pdf_path: str):
    """Run the full pipeline on a PDF resume.

    Returns a tuple (predicted_label, proba_dict_or_None, raw_text), where
    proba_dict maps class label -> probability when the model supports
    predict_proba, and is None otherwise.
    """
    # Artifacts are re-loaded on every call; hf_hub_download caches the
    # file locally so repeat calls avoid re-downloading.
    artifacts = load_artifacts()
    model = artifacts["model"]
    label_encoder = artifacts["label_encoder"]

    raw_text = extract_text_from_pdf(pdf_path)
    features = prepare_features_from_text(raw_text, artifacts)

    encoded = model.predict(features)
    predicted_label = label_encoder.inverse_transform(encoded)[0]

    proba_dict = None
    if hasattr(model, "predict_proba"):
        probabilities = model.predict_proba(features)[0]
        # NOTE(review): this assumes predict_proba's columns line up with
        # encoded labels 0..n-1 (i.e. model.classes_ == range(n)) — confirm
        # against the training code.
        class_names = label_encoder.inverse_transform(range(len(probabilities)))
        proba_dict = {name: float(p) for name, p in zip(class_names, probabilities)}

    return predicted_label, proba_dict, raw_text