Spaces:

Bur3hani
/

cview

Sleeping

App Files Files Community

Bur3hani committed on Jul 2, 2025

Commit

cfc4901

verified ·

1 Parent(s): 214f5a5

Create app.py

Browse files

Files changed (1) hide show

app.py +372 -0

app.py ADDED Viewed

	@@ -0,0 +1,372 @@

+import streamlit as st
+import os
+import io
+import re
+from docx import Document
+from PyPDF2 import PdfReader # PyPDF2 is used. For more robust PDF parsing, consider 'PyMuPDF' (fitz)
+import pandas as pd
+import spacy
+from collections import Counter
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+# --- SpaCy Model Loading ---
+# Use st.cache_resource to load the model only once and reuse it across sessions.
@st.cache_resource
def load_spacy_model():
    """
    Load and return the 'en_core_web_lg' spaCy pipeline.

    On any loading failure an error is shown in the Streamlit UI and the
    script run is halted with st.stop(), so callers never receive a
    half-initialised model.
    """
    try:
        return spacy.load("en_core_web_lg")
    except Exception as e:
        st.error(f"Error loading spaCy model: {e}. Please ensure 'en_core_web_lg' is correctly installed via requirements.txt.")
        # Nothing below can work without the model, so end this script run.
        st.stop()


# Shared pipeline used by every text-processing helper in this module.
nlp = load_spacy_model()
# Plain print so the message shows up in the Space's container logs.
print("SpaCy model loaded successfully.")
+# --- Global Predefined Skills (could be loaded from a file for larger lists) ---
# Lower-case skill phrases (plus a few common job titles) that are searched
# for verbatim in CVs and job descriptions. Despite the name this is a set.
predefined_skills_list = {
    # Languages, libraries and tooling
    "python", "tensorflow", "pytorch", "scikit-learn", "numpy", "pandas",
    "docker", "kubernetes", "aws", "git", "sql", "java", "r", "tableau",
    "jupyter", "vscode", "bert", "spacy", "nltk", "opencv", "cnns",
    "xgboost", "spark", "hadoop", "azure", "gcp",
    # Practices and problem domains
    "mlops", "agile", "feature engineering", "model deployment",
    "machine learning", "deep learning", "nlp", "computer vision",
    "data analysis", "predictive modeling", "fraud detection",
    "recommendation system", "sentiment analysis", "ab testing",
    "ai", "artificial intelligence", "data science", "big data",
    "software development", "web development", "mobile development",
    "databases", "cloud computing", "networking", "cybersecurity",
    # Soft skills
    "project management", "communication", "teamwork", "leadership",
    "problem solving", "critical thinking", "creativity",
    # Common job titles and role keywords
    "machine learning engineer", "data scientist", "ai engineer",
    "deep learning engineer", "senior machine learning engineer",
    "junior data scientist", "data engineer", "software engineer",
    "full stack", "frontend", "backend",
}
+# --- Text Extraction Functions (Adjusted for Streamlit file_object) ---
def extract_text_from_pdf(file_object):
    """
    Return the concatenated text of every page of a PDF file-like object.

    Pages are read with PyPDF2; a page that yields no text contributes an
    empty string. On any read error the problem is reported through
    st.error and an empty string is returned.
    """
    try:
        page_texts = [
            page.extract_text() or ""
            for page in PdfReader(file_object).pages
        ]
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""
    return "".join(page_texts)
def extract_text_from_docx(file_object):
    """
    Return the paragraph text of a DOCX file-like object, one paragraph per line.

    Only document.paragraphs is read. On any read error the problem is
    reported through st.error and an empty string is returned.
    """
    try:
        paragraphs = Document(file_object).paragraphs
        return "\n".join(para.text for para in paragraphs)
    except Exception as e:
        st.error(f"Error reading DOCX: {e}")
        return ""
+# --- Text Preprocessing Functions ---
def preprocess_text(text):
    """
    Normalise text for vectorisation.

    The text is lower-cased, whitespace runs are collapsed to single spaces,
    and the result is passed through the module-level spaCy pipeline. Stop
    words, punctuation and whitespace tokens are dropped and the remaining
    tokens are replaced by their lemmas, joined with single spaces.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    normalized = re.sub(r'\s+', ' ', text.lower()).strip()
    lemmas = []
    for token in nlp(normalized):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)
+# --- Information Extraction (NER & Keyword Extraction) ---
# Maps the spaCy entity labels we keep to the key used in the entities dict.
_ENTITY_BUCKETS = {
    "ORG": "organizations",
    "GPE": "locations",
    "DATE": "dates",
    "PERSON": "people",
}


def extract_skills(text_doc, skill_keywords=None):
    """
    Find known skills in a document and group its named entities.

    Args:
        text_doc: Object exposing ``.text`` (str) and ``.ents`` (iterable of
            entities with ``.label_`` and ``.text``), e.g. a spaCy Doc.
        skill_keywords: Optional iterable of skill phrases to look for.
            Matching is case-insensitive and on whole phrases only.
    Returns:
        tuple: ``(skills, entities)`` where ``skills`` is a sorted list of
        the keywords found (as given by the caller) and ``entities`` is a
        dict with any of the keys "organizations", "locations", "dates",
        "people", each mapping to the entity texts in document order.
    """
    doc_text = text_doc.text.lower()
    found = set()
    for skill in skill_keywords or ():
        if not skill:
            continue
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so it can never match skills such as "c++", "c#" or ".net".
        pattern = r'(?<!\w)' + re.escape(skill.lower()) + r'(?!\w)'
        if re.search(pattern, doc_text):
            found.add(skill)

    entities = {}
    for ent in text_doc.ents:
        bucket = _ENTITY_BUCKETS.get(ent.label_)
        if bucket is not None:
            entities.setdefault(bucket, []).append(ent.text)

    # Sorted so the result is stable across runs (set order is not).
    return sorted(found), entities
# "<n> [+|plus] years|yrs [of] exp..." -- the number is captured for every
# spelling, so abbreviated forms like "5 yrs exp" are counted too.
_EXPERIENCE_RE = re.compile(
    r'(\d+)\s*(?:\+|plus)?\s*(?:years?|yrs?)(?:\s+of)?\s+exp'
)

# Checked in order, highest qualification first. Word boundaries keep short
# abbreviations from matching inside unrelated words ("bsc" in
# "subscription", "master" in "mastered").
_EDUCATION_PATTERNS = (
    ("Ph.D.", re.compile(r"\bph\.?\s?d\b|\bdoctorate\b")),
    ("Master's", re.compile(r"\bmaster(?:['’]?s)?\b|\bm\.s\.|\bmsc\b")),
    ("Bachelor's", re.compile(r"\bbachelor(?:['’]?s)?\b|\bb\.s\.|\bbsc\b")),
    ("Associate's", re.compile(r"\bassociate(?:['’]?s)?\b")),
)


def extract_experience_and_education(text):
    """
    Estimate years of experience and highest education level from free text.

    This is a heuristic: experience is the largest number found in phrases
    such as "5+ years of experience" or "3 yrs exp"; education is the
    highest degree keyword present anywhere in the text.

    Args:
        text (str): Raw CV or job-description text.
    Returns:
        tuple: ``(years_experience, education_level)`` where
        ``years_experience`` is an int (0 when nothing is found) and
        ``education_level`` is one of "Ph.D.", "Master's", "Bachelor's",
        "Associate's" or "Not Specified".
    """
    text_lower = text.lower()

    years_experience = max(
        (int(number) for number in _EXPERIENCE_RE.findall(text_lower)),
        default=0,
    )

    education_level = "Not Specified"
    for level, pattern in _EDUCATION_PATTERNS:
        if pattern.search(text_lower):
            education_level = level
            break

    return years_experience, education_level
+# --- Feature Engineering ---
def get_text_embeddings(text):
    """
    Return a document vector for text using the module-level spaCy pipeline.

    Empty text maps to a zero vector whose length is the width of the
    pipeline's vector table. Otherwise the Doc's own vector is returned when
    available; failing that, the mean of the vectors of tokens that have
    one, or a zero vector if no token does.
    """
    if not text:
        return np.zeros(nlp.vocab.vectors.shape[1])

    doc = nlp(text)
    if doc.has_vector:
        return doc.vector

    # Fallback path: average whatever token vectors exist.
    token_vectors = [token.vector for token in doc if token.has_vector]
    if token_vectors:
        return np.mean(token_vectors, axis=0)
    return np.zeros(nlp.vocab.vectors.shape[1])
def calculate_cosine_similarity(vec1, vec2):
    """
    Return the cosine similarity of two 1-D vectors.

    If either vector is entirely zero the similarity is defined as 0.0,
    which avoids dividing by a zero norm.
    """
    for vector in (vec1, vec2):
        if np.all(vector == 0):
            return 0.0
    # cosine_similarity expects 2-D inputs: one row per sample.
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]
+# --- Main Processing Pipeline for a Document (CV or Job Description) ---
def analyze_document(doc_text):
    """
    Run the full per-document pipeline on a CV or job description.

    Returns a dict with the raw text, its preprocessed form, the spaCy Doc
    of the raw text, the predefined skills found, grouped named entities,
    the estimated years of experience and education level, and an embedding
    of the preprocessed text.
    """
    parsed = nlp(doc_text)
    normalized = preprocess_text(doc_text)
    skills, entities = extract_skills(parsed, skill_keywords=predefined_skills_list)
    experience_years, education = extract_experience_and_education(doc_text)

    analysis = dict(
        raw_text=doc_text,
        cleaned_text=normalized,
        spacy_doc=parsed,
        extracted_skills=skills,
        general_entities=entities,
        years_experience=experience_years,
        education_level=education,
    )
    # The embedding is computed from the preprocessed text, not the raw text.
    analysis["text_embedding"] = get_text_embeddings(normalized)
    return analysis
+# --- Matching and Scoring Logic ---
def _rank_tfidf_terms(tfidf_row, feature_names, limit=15):
    """Return up to *limit* terms of one TF-IDF row, highest weight first."""
    weights = {feature_names[i]: tfidf_row[0, i] for i in tfidf_row.nonzero()[1]}
    ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)
    return [term for term, _ in ranked[:limit]]


def calculate_match_scores(cv_data, jd_data):
    """
    Compare an analysed CV with an analysed job description.

    Args:
        cv_data (dict): Result of analyze_document for the CV.
        jd_data (dict): Result of analyze_document for the job description.
    Returns:
        dict: Match metrics with the keys
        "overall_match_score", "matched_skills", "missing_skills",
        "extra_skills_in_cv", "skill_match_percentage", "top_cv_keywords",
        "top_jd_keywords", "common_keywords", "cv_years_experience",
        "jd_years_experience", "experience_match_status",
        "cv_education_level", "jd_education_level" and
        "education_match_status". Skill and common-keyword lists are sorted
        alphabetically so results are stable between runs.
    """
    results = {}

    # 1. Overall semantic similarity between the two document embeddings.
    overall_similarity = calculate_cosine_similarity(
        cv_data["text_embedding"],
        jd_data["text_embedding"]
    )
    results["overall_match_score"] = round(overall_similarity * 100, 2)

    # 2. Skill matching, relative to what the job description asks for.
    cv_skills = set(cv_data["extracted_skills"])
    jd_skills = set(jd_data["extracted_skills"])
    matched_skills = sorted(cv_skills & jd_skills)
    results["matched_skills"] = matched_skills
    results["missing_skills"] = sorted(jd_skills - cv_skills)
    results["extra_skills_in_cv"] = sorted(cv_skills - jd_skills)
    if jd_skills:
        skill_match_percentage = len(matched_skills) / len(jd_skills) * 100
    else:
        skill_match_percentage = 0.0
    results["skill_match_percentage"] = round(skill_match_percentage, 2)

    # 3. Keyword overlap via TF-IDF over the two cleaned texts.
    corpus = [cv_data["cleaned_text"], jd_data["cleaned_text"]]
    tfidf_vectorizer = TfidfVectorizer(max_features=100)
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    except ValueError:
        # Raised for an empty vocabulary, e.g. both texts are empty or
        # contain only stop words; report no keywords instead of crashing.
        top_cv_keywords, top_jd_keywords = [], []
    else:
        feature_names = tfidf_vectorizer.get_feature_names_out()
        top_cv_keywords = _rank_tfidf_terms(tfidf_matrix[0], feature_names)
        top_jd_keywords = _rank_tfidf_terms(tfidf_matrix[1], feature_names)
    results["top_cv_keywords"] = top_cv_keywords
    results["top_jd_keywords"] = top_jd_keywords
    results["common_keywords"] = sorted(set(top_cv_keywords) & set(top_jd_keywords))

    # 4. Experience: only judged when the job states a requirement.
    cv_exp_years = cv_data["years_experience"]
    jd_exp_years = jd_data["years_experience"]
    results["cv_years_experience"] = cv_exp_years
    results["jd_years_experience"] = jd_exp_years
    exp_status = "Not specified by Job"
    if jd_exp_years > 0:
        if cv_exp_years >= jd_exp_years:
            exp_status = "Meets or Exceeds Requirement"
        else:
            exp_status = f"Below Requirement (Needs {jd_exp_years - cv_exp_years} more years)"
    results["experience_match_status"] = exp_status

    # 5. Education: ordinal comparison, only when the job states a level.
    cv_edu = cv_data["education_level"]
    jd_edu = jd_data["education_level"]
    results["cv_education_level"] = cv_edu
    results["jd_education_level"] = jd_edu
    edu_match_status = "Not Specified by Job"
    if jd_edu != "Not Specified":
        # Levels missing from the table (e.g. "Not Specified") rank as 0.
        edu_order = {"Associate's": 1, "Bachelor's": 2, "Master's": 3, "Ph.D.": 4}
        if edu_order.get(cv_edu, 0) >= edu_order.get(jd_edu, 0):
            edu_match_status = "Meets or Exceeds Requirement"
        else:
            edu_match_status = "Below Requirement"
    results["education_match_status"] = edu_match_status

    return results
+# --- Overall Analysis Orchestrator ---
def perform_cv_job_analysis(cv_text, job_desc_text):
    """
    Analyse a CV and a job description and return their match results.

    Both raw texts are run through analyze_document (CV first), and the two
    analyses are then compared with calculate_match_scores.
    """
    cv_analysis = analyze_document(cv_text)
    jd_analysis = analyze_document(job_desc_text)
    return calculate_match_scores(cv_analysis, jd_analysis)
+# --- Visualization Functions (Adjusted for Streamlit) ---
+# Each visualization function now returns a matplotlib figure object
+# and Streamlit's st.pyplot() is used to display it, then figure is closed.
def create_overall_match_plot(score):
    """
    Build a single horizontal bar showing the overall match percentage.

    Args:
        score: Overall match score on a 0-100 scale.
    Returns:
        matplotlib.figure.Figure: The figure, ready for st.pyplot().
    """
    # Style must be set before the axes are created to affect this figure.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(6, 2))
    ax.barh(["Overall Match"], [score], color='skyblue')
    ax.set_xlim(0, 100)
    ax.text(score + 2, 0, f'{score}%', va='center', color='black', fontsize=12)
    ax.set_title("Overall CV-Job Description Match Score", fontsize=14)
    ax.set_xlabel("Match Percentage", fontsize=12)
    ax.get_yaxis().set_visible(False)
    # Lay out this figure explicitly; plt.tight_layout() would act on
    # pyplot's global "current figure", which is shared process-wide.
    fig.tight_layout()
    return fig
def create_skill_match_plot(matched_skills, missing_skills):
    """
    Build a pie chart of matched versus missing skills.

    Args:
        matched_skills: Skills present in both CV and job description.
        missing_skills: Skills the job description lists but the CV lacks.
    Returns:
        matplotlib.figure.Figure, or None when both collections are empty
        (there is nothing to plot).
    """
    sizes = [len(matched_skills), len(missing_skills)]
    if sum(sizes) == 0:
        return None

    labels = ['Matched Skills', 'Missing Skills']
    colors = ['#66b3ff', '#ff9999']
    # Only pull the wedges apart when both wedges actually exist.
    explode = (0.05, 0.05) if all(sizes) else (0, 0)

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
           shadow=True, startangle=90, textprops={'fontsize': 12})
    ax.axis('equal')
    ax.set_title("Skill Match Breakdown", fontsize=14)
    # Lay out this figure explicitly; plt.tight_layout() would act on
    # pyplot's global "current figure", which is shared process-wide.
    fig.tight_layout()
    return fig
def create_top_keywords_plot(cv_keywords, jd_keywords):
    """
    Build side-by-side bar charts of the top keywords in the CV and the job.

    Each panel shows at most the 10 most common entries of its keyword list,
    counted with collections.Counter. A panel whose list is empty is left
    blank.

    Args:
        cv_keywords: Keywords extracted from the CV.
        jd_keywords: Keywords extracted from the job description.
    Returns:
        matplotlib.figure.Figure: The two-panel figure.
    """
    # Style must be set before the axes are created to affect this figure.
    sns.set_style("whitegrid")
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    panels = (
        (axes[0], cv_keywords, 'Top Keywords in CV', 'viridis'),
        (axes[1], jd_keywords, 'Top Keywords in Job Description', 'plasma'),
    )
    for ax, keywords, title, palette in panels:
        counts = pd.DataFrame(Counter(keywords).most_common(10), columns=['Keyword', 'Count'])
        if counts.empty:
            continue
        sns.barplot(x='Count', y='Keyword', data=counts, ax=ax, palette=palette)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel('Frequency/Importance', fontsize=12)
        ax.set_ylabel('')

    # Lay out this figure explicitly; plt.tight_layout() would act on
    # pyplot's global "current figure", which is shared process-wide.
    fig.tight_layout()
    return fig
+# --- Streamlit Application Layout ---
# The browser-tab icon keyword is `page_icon`; `icon` is not accepted by
# set_page_config and makes the call fail with a TypeError at startup.
st.set_page_config(page_title="CV-Job Match Analyzer", layout="wide", page_icon="👨‍💼")
st.title("👨‍💼 CV-Job Match Analyzer 📈")
st.markdown("""
Welcome! This tool helps you understand how well a CV matches a job description.
Upload a CV (PDF, DOCX, TXT) and paste the job description text to get an instant analysis.
""")
+# Input for CV
+st.header("1. Upload Your CV")
+uploaded_cv_file = st.file_uploader("Choose a CV file (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"],