# NOTE(review): the three lines below were a Hugging Face Spaces page header
# ("Spaces: Sleeping") captured during extraction — not part of the program.
import re

import joblib
import numpy as np
import pandas as pd
import pdfplumber
import fitz
from huggingface_hub import hf_hub_download

# =========================================================
# 1. COPY YOUR ORIGINAL FUNCTIONS HERE (VERY IMPORTANT)
# =========================================================
# ----- Replace these placeholder functions with your REAL code -----
def clean_text(text):
    """Normalize raw text for TF-IDF vectorization.

    Lowercases, strips HTML tags, removes every character that is not a
    letter or whitespace, and collapses runs of whitespace into single
    spaces. Missing values (None/NaN) become the empty string.
    """
    if pd.isna(text):
        return ""
    lowered = text.lower()
    # Strip HTML tags, then everything that is not a letter or whitespace.
    no_tags = re.sub(r'<.*?>', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', no_tags)
    # split()/join collapses all whitespace runs and trims the ends.
    return ' '.join(letters_only.split())
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF, trying three backends in order.

    Order: pdfplumber, PyPDF2, PyMuPDF (fitz). The first backend that
    yields non-blank text wins; each failure is printed and the next
    backend is tried.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file on disk.

    Returns
    -------
    str
        The extracted text, or the sentinel string
        "Unable to extract text from PDF" if every backend failed.
    """
    text = ""
    try:
        # Method 1: Try with pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"pdfplumber failed for {pdf_path}: {e}")
    try:
        # Method 2: Try with PyPDF2.
        # BUGFIX: PyPDF2 was never imported at file top, so this branch
        # always failed with NameError. Import it locally; if the package
        # is absent, the ImportError is caught below and we fall through
        # to the next backend, preserving the original best-effort flow.
        import PyPDF2
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if text.strip():
            return text
    except Exception as e:
        print(f"PyPDF2 failed for {pdf_path}: {e}")
    try:
        # Method 3: Try with PyMuPDF (fitz) - most robust
        doc = fitz.open(pdf_path)
        for page in doc:
            page_text = page.get_text()
            if page_text:
                text += page_text + "\n"
        doc.close()
        if text.strip():
            return text
    except Exception as e:
        print(f"PyMuPDF failed for {pdf_path}: {e}")
    return text if text.strip() else "Unable to extract text from PDF"
def extract_skills(text):
    """Return a comma-separated string of known skill keywords found in *text*.

    Matching is case-insensitive substring search against a fixed keyword
    list; the result preserves the list's order, not the order of
    appearance in the text.
    """
    skill_keywords = [
        'python', 'java', 'javascript', 'sql', 'html', 'css', 'react', 'angular',
        'machine learning', 'data analysis', 'excel', 'powerbi', 'tableau',
        'project management', 'communication', 'leadership', 'teamwork',
        'problem solving', 'analytical', 'creative', 'organizational',
        'aws', 'azure', 'docker', 'kubernetes', 'git', 'linux',
        'tensorflow', 'pytorch', 'pandas', 'numpy', 'sklearn'
    ]
    haystack = text.lower()
    matched = [skill for skill in skill_keywords if skill in haystack]
    return ', '.join(matched)
def extract_experience_years(text):
    """Parse a years-of-experience figure out of resume text.

    Tries several regex patterns in turn ("N years of experience",
    "N+ years", "over N years", "more than N years") and returns the
    first match as an int; 0 when no pattern matches.
    """
    patterns = (
        r'(\d+)\s*(?:years?|yrs?)\s*(?:of\s*)?(?:experience|exp)',
        r'(\d+)\+\s*(?:years?|yrs?)',
        r'over\s*(\d+)\s*(?:years?|yrs?)',
        r'more\s*than\s*(\d+)\s*(?:years?|yrs?)',
    )
    lowered = text.lower()
    for pattern in patterns:
        hit = re.search(pattern, lowered)
        if hit is not None:
            return int(hit.group(1))
    return 0
def extract_education_level(text):
    """Score the highest education level mentioned in resume text.

    Levels: 4 = doctorate, 3 = master's, 2 = bachelor's/degree,
    1 = diploma/certificate, 0 = nothing found.

    BUGFIX: the original used bare substring search, so the 'ba' in
    "database", 'ms' in "systems", or 'ma' in "management" falsely
    registered as degrees. Short acronym keywords (<= 3 chars) now match
    whole words only; longer keywords match on a leading word boundary so
    variants like "masters" / "bachelor's" still count, as before.
    """
    education_keywords = {
        'phd': 4, 'doctorate': 4, 'doctoral': 4,
        'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'degree': 2, 'ba': 2, 'bs': 2,
        'diploma': 1, 'certificate': 1
    }
    text_lower = text.lower()
    max_level = 0
    for keyword, level in education_keywords.items():
        if len(keyword) <= 3:
            # Acronyms must stand alone as a word.
            pattern = r'\b' + re.escape(keyword) + r'\b'
        else:
            # Longer keywords may be a prefix ("masters", "bachelor's").
            pattern = r'\b' + re.escape(keyword)
        if re.search(pattern, text_lower):
            max_level = max(max_level, level)
    return max_level
def count_technical_terms(text):
    """Count occurrences of a fixed technical vocabulary in *text*.

    Case-insensitive substring counting via str.count, so every
    occurrence of every listed term contributes to the total.
    """
    technical_terms = [
        'algorithm', 'database', 'software', 'development', 'programming',
        'analysis', 'system', 'design', 'implementation', 'optimization',
        'automation', 'testing', 'debugging', 'framework', 'api'
    ]
    lowered = text.lower()
    return sum(lowered.count(term) for term in technical_terms)
# =========================================================
# 2. LOAD MODEL + PREPROCESSING ARTIFACTS
# =========================================================
# Hugging Face Hub location of the trained pipeline artifacts.
HF_MODEL_REPO = "vvirothi/resume-parser-ml-model"  # <- change to your real repo
# File inside the repo; downloaded and loaded with joblib by load_artifacts().
MODEL_FILENAME = "resume_parser_pipeline.joblib"
def load_artifacts():
    """Fetch the trained pipeline file from the Hugging Face Hub and unpickle it.

    Downloads (or reuses the local Hub cache of) MODEL_FILENAME from
    HF_MODEL_REPO, then loads it with joblib. The returned bundle is
    accessed elsewhere in this file via the keys "model", "tfidf",
    "scaler", "label_encoder" and "numerical_cols".
    """
    cached_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME,
    )
    return joblib.load(cached_path)
# =========================================================
# 3. FEATURE PREPARATION
# =========================================================
def prepare_features_from_text(raw_text: str, artifacts: dict):
    """Build the model's input vector from raw resume text.

    Reproduces the training-time feature pipeline: TF-IDF over the
    cleaned text, concatenated with scaled numerical features in the
    exact column order stored in artifacts["numerical_cols"].

    Parameters
    ----------
    raw_text : str
        Resume text as extracted from the PDF.
    artifacts : dict
        Bundle produced at training time; must contain "tfidf",
        "scaler" and "numerical_cols".

    Returns
    -------
    numpy.ndarray
        A single-row feature matrix: [tfidf features | scaled numerics].
    """
    tfidf = artifacts["tfidf"]
    scaler = artifacts["scaler"]
    numerical_cols = artifacts["numerical_cols"]

    # Text branch: clean, then TF-IDF into a dense row vector.
    cleaned = clean_text(raw_text)
    text_vec = tfidf.transform([cleaned]).toarray()

    # Numerical branch, keyed by training column name.
    # (The original also called extract_skills() here, but the result was
    # never used in the feature vector — dead work, removed.)
    num_data = {
        'Experience_Years': extract_experience_years(raw_text),
        'Education_Level': extract_education_level(raw_text),
        'Technical_Terms_Count': count_technical_terms(raw_text),
        'Resume_Length': len(raw_text),
        'Word_Count': len(raw_text.split()),
    }
    # Scale the numerics in the same column order used during training.
    num_vec_raw = np.array([[num_data[col] for col in numerical_cols]])
    num_vec_scaled = scaler.transform(num_vec_raw)

    # Combine both branches into the final input row.
    return np.hstack([text_vec, num_vec_scaled])
# =========================================================
# 4. MAIN PREDICTION FUNCTION
# =========================================================
def predict_from_pdf(pdf_path: str):
    """End-to-end prediction: PDF path -> (label, class probabilities, text).

    Parameters
    ----------
    pdf_path : str
        Path to the resume PDF.

    Returns
    -------
    tuple
        (predicted_label, proba_dict, raw_text) where proba_dict maps
        each decoded class label to its probability, or is None when the
        model does not implement predict_proba.
    """
    # Load model + preprocessing artifacts from the HuggingFace model repo.
    artifacts = load_artifacts()
    model = artifacts["model"]
    label_encoder = artifacts["label_encoder"]

    # Extract text and rebuild the training-time feature vector.
    raw_text = extract_text_from_pdf(pdf_path)
    X_input = prepare_features_from_text(raw_text, artifacts)

    # Predict and decode back to the original string label.
    y_pred_encoded = model.predict(X_input)
    predicted_label = label_encoder.inverse_transform(y_pred_encoded)[0]

    # Class probabilities, when supported. BUGFIX: sklearn documents the
    # columns of predict_proba as following model.classes_, so decode
    # those instead of assuming the encoded classes are exactly 0..n-1
    # (which breaks if the model saw only a subset of labels).
    proba_dict = None
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_input)[0]
        class_labels = label_encoder.inverse_transform(model.classes_)
        proba_dict = {cls: float(prob) for cls, prob in zip(class_labels, probs)}
    return predicted_label, proba_dict, raw_text