"""Helpers for extracting text from uploaded resumes and scoring them with an LLM."""

import re

import docx2txt
import PyPDF2
import spacy
from transformers import pipeline

_PDF_MIME = "application/pdf"
_DOCX_MIME = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)

# Entity labels treated as "skills".
# NOTE(review): the stock en_core_web_sm label scheme does not appear to
# include "SKILL"; it only has an effect with a custom NER component.
_SKILL_LABELS = frozenset({"SKILL", "WORK_OF_ART", "ORG"})

# Degree keywords, anchored on word boundaries so that substrings of
# unrelated words ("Mumbai" -> MBA, "subscribe" -> BSc, "Mastercard" ->
# Master) are not reported.  An optional plural/possessive suffix keeps
# "Masters" and "Master's" matching.  The tail consumes the remainder of
# the line, so at most one degree is reported per line.
_DEGREE_RE = re.compile(
    r"\b(Bachelor|Master|PhD|MBA|BSc|MSc|B\.Tech|M\.Tech)(?:['\u2019]?s)?\b"
    r"[^\n\r]*",
    re.IGNORECASE,
)

# A standalone integer of up to three digits that is not part of a longer
# number or a decimal fraction (so "0.85" is not read as 0 or 85).
_SCORE_RE = re.compile(r"(?<![\d.])\d{1,3}(?!\d|\.\d)")


def parse_resume(file):
    """Return the plain text of an uploaded resume.

    Args:
        file: Upload object exposing a ``type`` attribute holding the MIME
            type, and accepted by ``PyPDF2.PdfReader`` / ``docx2txt.process``
            (presumably a Streamlit ``UploadedFile``; verify against caller).

    Returns:
        The extracted text for PDF and DOCX uploads, or an empty string for
        any other MIME type.
    """
    if file.type == _PDF_MIME:
        reader = PyPDF2.PdfReader(file)
        # Guard against pages that yield no text so one empty page cannot
        # break the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
    if file.type == _DOCX_MIME:
        return docx2txt.process(file)
    return ""


def load_models():
    """Load the spaCy pipeline and the text-generation LLM.

    Returns:
        A ``(nlp_model, llm_model)`` tuple: the ``en_core_web_sm`` spaCy
        pipeline and a ``transformers`` text-generation pipeline for
        ``microsoft/phi-2``.
    """
    nlp_model = spacy.load("en_core_web_sm")
    llm_model = pipeline("text-generation", model="microsoft/phi-2")
    return nlp_model, llm_model


def _parse_score(completion):
    """Return the first standalone integer in 0-100 from *completion*, else 0."""
    for match in _SCORE_RE.finditer(completion):
        value = int(match.group())
        if value <= 100:
            return value
    return 0


def get_recommendations(text, nlp_model, llm_model):
    """Derive a quality score, job domain, skills and education from a resume.

    Args:
        text: Resume text.
        nlp_model: Callable returning a document whose ``ents`` have
            ``text`` and ``label_`` attributes (e.g. a spaCy pipeline).
        llm_model: Callable invoked as ``llm_model(prompt, max_new_tokens=100)``
            and returning a list whose first item has a ``"generated_text"``
            key (e.g. a ``transformers`` text-generation pipeline).

    Returns:
        A ``(score, domain, skills, education)`` tuple where ``score`` is an
        int in 0-100 (0 when the model output contains no usable number),
        ``domain`` is ``"Engineering"`` or ``"General"``, ``skills`` is a
        sorted list of unique lower-cased entity texts, and ``education`` is
        the string produced by ``extract_education``.
    """
    doc = nlp_model(text)
    # Sorted + de-duplicated once so the prompt and the returned list are
    # deterministic across runs.
    skills = sorted(
        {ent.text.lower() for ent in doc.ents if ent.label_ in _SKILL_LABELS}
    )
    education = extract_education(text)

    prompt = (
        f"Given these skills: {', '.join(skills)}, "
        "classify the most likely job field and rate CV quality (0-100):"
    )
    response = llm_model(prompt, max_new_tokens=100)[0]["generated_text"]

    # Text-generation pipelines echo the prompt by default.  Drop it so the
    # "(0-100)" digits and the skill names in the prompt cannot leak into
    # the parsed score or the domain check.
    completion = response[len(prompt):] if response.startswith(prompt) else response

    domain = "Engineering" if "engineer" in completion.lower() else "General"
    score = _parse_score(completion)
    return score, domain, skills, education


def extract_education(text):
    """Return the degree keywords mentioned in *text*.

    Matching is case-insensitive and restricted to whole words.  Results are
    de-duplicated case-insensitively, keep the spelling and order of their
    first occurrence, and are joined with ``", "``.

    Returns:
        The joined degree keywords, or ``"Not detected"`` when none is found.
    """
    first_seen = {}
    for match in _DEGREE_RE.finditer(text):
        degree = match.group(1)
        first_seen.setdefault(degree.casefold(), degree)
    return ", ".join(first_seen.values()) if first_seen else "Not detected"