import re

import joblib
import numpy as np
import pandas as pd
import pdfplumber
import fitz  # PyMuPDF
from huggingface_hub import hf_hub_download

# =========================================================
# 1. TEXT CLEANING AND FEATURE-EXTRACTION HELPERS
#    (must stay byte-for-byte consistent with the training pipeline,
#    otherwise inference features will not match the fitted model)
# =========================================================


def clean_text(text):
    """Lowercase, strip HTML tags, drop non-letters, collapse whitespace.

    Args:
        text: Raw text; NaN/None is treated as empty.

    Returns:
        Cleaned string containing only lowercase letters and single spaces.
    """
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags (non-greedy so adjacent tags are removed separately)
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and digits (keep letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())
    return text


def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file, trying several backends for robustness.

    Order of attempts: pdfplumber -> PyPDF2 (if installed) -> PyMuPDF.
    The first backend that yields non-empty text wins; each failure is
    printed and the next backend is tried.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        Extracted text, or the sentinel string
        "Unable to extract text from PDF" when every backend fails.
    """
    text = ""
    try:
        # Method 1: pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"pdfplumber failed for {pdf_path}: {e}")

    try:
        # Method 2: PyPDF2.
        # BUG FIX: PyPDF2 was referenced but never imported, so this branch
        # always died with NameError. Import it lazily here: if the package
        # is missing, the ImportError is caught by the handler below and we
        # fall through to PyMuPDF, preserving the best-effort behaviour.
        import PyPDF2
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"PyPDF2 failed for {pdf_path}: {e}")

    try:
        # Method 3: PyMuPDF (fitz) - most robust
        doc = fitz.open(pdf_path)
        for page in doc:
            page_text = page.get_text()
            if page_text:
                text += page_text + "\n"
        doc.close()
        if text.strip():
            return text
    except Exception as e:
        print(f"PyMuPDF failed for {pdf_path}: {e}")

    return text if text.strip() else "Unable to extract text from PDF"


def extract_skills(text):
    """Return a comma-separated list of known skill keywords found in text.

    NOTE(review): matching is plain substring, so e.g. 'java' also matches
    inside 'javascript'. Left as-is deliberately — the fitted model was
    trained on features produced by this exact logic, and changing it here
    would cause train/serve skew. Confirm against the training code before
    tightening.
    """
    skill_keywords = [
        'python', 'java', 'javascript', 'sql', 'html', 'css', 'react', 'angular',
        'machine learning', 'data analysis', 'excel', 'powerbi', 'tableau',
        'project management', 'communication', 'leadership', 'teamwork',
        'problem solving', 'analytical', 'creative', 'organizational',
        'aws', 'azure', 'docker', 'kubernetes', 'git', 'linux',
        'tensorflow', 'pytorch', 'pandas', 'numpy', 'sklearn'
    ]
    text_lower = text.lower()
    found_skills = []
    for skill in skill_keywords:
        if skill in text_lower:
            found_skills.append(skill)
    return ', '.join(found_skills)


def extract_experience_years(text):
    """Extract the first stated number of years of experience, or 0.

    Recognizes phrasings like "5 years of experience", "5+ yrs",
    "over 5 years", "more than 5 years" (case-insensitive).
    """
    experience_patterns = [
        r'(\d+)\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)',
        r'(\d+)\+\s*(?:years?|yrs?)',
        r'over\s*(\d+)\s*(?:years?|yrs?)',
        r'more\s*than\s*(\d+)\s*(?:years?|yrs?)'
    ]
    text_lower = text.lower()
    for pattern in experience_patterns:
        match = re.search(pattern, text_lower)
        if match:
            return int(match.group(1))
    return 0


def extract_education_level(text):
    """Return the highest education level mentioned, as an ordinal 0-4.

    4 = PhD, 3 = Master's, 2 = Bachelor's, 1 = diploma/certificate,
    0 = nothing recognized.

    NOTE(review): substring matching means short keys like 'ms', 'ma',
    'ba', 'bs' fire inside ordinary words ('systems', 'teams', ...).
    Left unchanged to stay consistent with the training features —
    confirm against the training code before adding word boundaries.
    """
    education_keywords = {
        'phd': 4, 'doctorate': 4, 'doctoral': 4,
        'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'degree': 2, 'ba': 2, 'bs': 2,
        'diploma': 1, 'certificate': 1
    }
    text_lower = text.lower()
    max_level = 0
    for keyword, level in education_keywords.items():
        if keyword in text_lower:
            max_level = max(max_level, level)
    return max_level


def count_technical_terms(text):
    """Count total occurrences (with repeats) of known technical terms."""
    technical_terms = [
        'algorithm', 'database', 'software', 'development', 'programming',
        'analysis', 'system', 'design', 'implementation', 'optimization',
        'automation', 'testing', 'debugging', 'framework', 'api'
    ]
    text_lower = text.lower()
    count = 0
    for term in technical_terms:
        # str.count tallies overlapping mentions across the whole text
        count += text_lower.count(term)
    return count
# =========================================================
# 2. LOAD MODEL + PREPROCESSING ARTIFACTS
# =========================================================

HF_MODEL_REPO = "vvirothi/resume-parser-ml-model"  # <- change to your real repo
MODEL_FILENAME = "resume_parser_pipeline.joblib"


def load_artifacts():
    """
    Download the latest model pipeline from Hugging Face Hub
    and load it with joblib.

    Returns:
        The unpickled artifacts dict; the rest of this module reads the
        keys "model", "label_encoder", "tfidf", "scaler", "numerical_cols".
    """
    local_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME
    )
    # NOTE(review): joblib.load unpickles arbitrary code — only point
    # HF_MODEL_REPO at a repository you trust.
    artifacts = joblib.load(local_path)
    return artifacts


# =========================================================
# 3. FEATURE PREPARATION
# =========================================================

def prepare_features_from_text(raw_text: str, artifacts: dict):
    """
    Converts raw resume text into the exact same feature vector
    used during training.

    TF-IDF features are computed on the cleaned text; numerical features
    are computed on the raw text. NOTE(review): assumes this mirrors the
    training pipeline — confirm against the training code.

    Args:
        raw_text: Resume text as extracted from the PDF.
        artifacts: Dict holding the fitted "tfidf" vectorizer, "scaler",
            and "numerical_cols" (the feature order used at fit time).

    Returns:
        2-D numpy array of shape (1, n_text_features + n_numeric_features).
    """
    tfidf = artifacts["tfidf"]
    scaler = artifacts["scaler"]
    numerical_cols = artifacts["numerical_cols"]

    # 1. Clean text for the TF-IDF vectorizer
    clean = clean_text(raw_text)

    # 2. Extract numerical features.
    #    (A dead `skills = extract_skills(raw_text)` call was removed here:
    #    its result was never used in the feature vector.)
    num_data = {
        'Experience_Years': extract_experience_years(raw_text),
        'Education_Level': extract_education_level(raw_text),
        'Technical_Terms_Count': count_technical_terms(raw_text),
        'Resume_Length': len(raw_text),
        'Word_Count': len(raw_text.split()),
    }

    # 3. TF-IDF text features (densified so they can be hstacked below)
    text_vec = tfidf.transform([clean]).toarray()

    # 4. Numerical features, scaled in the exact training column order
    num_vec_raw = np.array([[num_data[col] for col in numerical_cols]])
    num_vec_scaled = scaler.transform(num_vec_raw)

    # 5. Combine text and numeric parts into the final input row
    X_final = np.hstack([text_vec, num_vec_scaled])
    return X_final
# =========================================================
# 4. MAIN PREDICTION FUNCTION
# =========================================================

def predict_from_pdf(pdf_path: str):
    """Predict the resume category for a single PDF.

    Args:
        pdf_path: Path to the resume PDF file.

    Returns:
        Tuple of (predicted_label, proba_dict, raw_text):
          - predicted_label: decoded class label for the resume;
          - proba_dict: mapping of class label -> probability, or None
            when the model does not implement predict_proba;
          - raw_text: the text extracted from the PDF.
    """
    # Load artifacts from HuggingFace model repo (re-downloaded each call
    # so the latest published model is always used)
    artifacts = load_artifacts()
    model = artifacts["model"]
    label_encoder = artifacts["label_encoder"]

    # Extract text
    raw_text = extract_text_from_pdf(pdf_path)

    # Prepare features
    X_input = prepare_features_from_text(raw_text, artifacts)

    # Predict
    y_pred_encoded = model.predict(X_input)
    predicted_label = label_encoder.inverse_transform(y_pred_encoded)[0]

    # Predict probabilities (if supported)
    proba_dict = None
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_input)[0]
        # BUG FIX: predict_proba columns are ordered by model.classes_,
        # which equals range(n) only if the model saw every encoded class
        # during fit. Map through model.classes_ so each probability is
        # paired with the correct decoded label.
        classes = label_encoder.inverse_transform(model.classes_)
        proba_dict = {cls: float(prob) for cls, prob in zip(classes, probs)}

    return predicted_label, proba_dict, raw_text