Spaces:

chirag1121
/

Resume_Screening_Model

Sleeping

App Files Files Community

chirag1121 committed on Apr 16

Commit

59d43ff

verified ·

1 Parent(s): 8913fce

Update utils/scorer.py

Browse files

Files changed (1) hide show

utils/scorer.py +142 -81

utils/scorer.py CHANGED Viewed

@@ -1,108 +1,169 @@
 """
-parser.py — Resume file parsing module.
-Handles text extraction from PDF and DOCX files.
-Uses PyMuPDF for PDFs and python-docx for Word documents.
 """
-import io
-import fitz  # PyMuPDF
-from docx import Document
-def extract_text_from_pdf(file_bytes: bytes) -> str:
     """
-    Extract all text from a PDF file given its raw bytes.
     Args:
-        file_bytes: Raw bytes of the PDF file.
     Returns:
-        Extracted text as a single string, or empty string on failure.
-    """
-    try:
-        pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
-        text_parts = []
-        for page_num in range(len(pdf_doc)):
-            page = pdf_doc[page_num]
-            text_parts.append(page.get_text("text"))
-        pdf_doc.close()
-        return "\n".join(text_parts).strip()
-    except Exception as e:
-        print(f"[parser] PDF extraction error: {e}")
-        return ""
-def extract_text_from_docx(file_bytes: bytes) -> str:
     """
-    Extract all text from a DOCX file given its raw bytes.
-    Args:
-        file_bytes: Raw bytes of the DOCX file.
-    Returns:
-        Extracted text as a single string, or empty string on failure.
     """
-    try:
-        doc = Document(io.BytesIO(file_bytes))
-        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
-        # Also extract text from tables
-        for table in doc.tables:
-            for row in table.rows:
-                for cell in row.cells:
-                    if cell.text.strip():
-                        paragraphs.append(cell.text.strip())
-        return "\n".join(paragraphs).strip()
-    except Exception as e:
-        print(f"[parser] DOCX extraction error: {e}")
-        return ""
-def parse_resume(uploaded_file) -> dict:
-    """
-    Main entry point: parse an uploaded Streamlit file object.
-    Detects file type and routes to the correct extractor.
     Args:
-        uploaded_file: Streamlit UploadedFile object.
     Returns:
-        dict with keys:
-            - 'text'     : extracted resume text (str)
-            - 'filename' : original file name (str)
-            - 'file_type': 'pdf' | 'docx' | 'unknown'
-            - 'error'    : error message if extraction failed (str | None)
     """
-    result = {
-        "text": "",
-        "filename": uploaded_file.name,
-        "file_type": "unknown",
-        "error": None,
-    }
-    file_bytes = uploaded_file.read()
-    if not file_bytes:
-        result["error"] = "Uploaded file is empty."
-        return result
-    filename_lower = uploaded_file.name.lower()
-    if filename_lower.endswith(".pdf"):
-        result["file_type"] = "pdf"
-        result["text"] = extract_text_from_pdf(file_bytes)
-    elif filename_lower.endswith(".docx"):
-        result["file_type"] = "docx"
-        result["text"] = extract_text_from_docx(file_bytes)
-    else:
-        result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
-        return result
-    if not result["text"]:
-        result["error"] = (
-            "Could not extract text from the file. "
-            "The file may be image-based or corrupted."
         )
-    return result

 """
+scorer.py — Resume scoring module.
+Computes:
+  1. Resume Base Score  (0–100) based on resume content analysis
+  2. ATS Score          (0–100) combining base score + job match similarity
+Scoring rubric (Base Score):
+  - Skills richness   : up to 20 pts
+  - Experience section: up to 30 pts
+  - Projects section  : up to 20 pts
+  - Education section : up to 10 pts
+  - Resume length     : up to 10 pts
+  - Skill diversity   : up to 10 pts
+  TOTAL               : 100 pts
 """
+import math
def compute_base_score(
    text: str,
    sections: dict,
    skills: dict,
) -> dict:
    """
    Compute the resume base score from its content.

    The score is the sum of six independently capped categories:
    Skills (20), Experience (30), Projects (20), Education (10),
    Length (10) and Diversity (10).

    Args:
        text    : full resume text
        sections: output of nlp_utils.detect_sections(); only the truthiness
                  of the 'experience', 'projects', 'education' and 'summary'
                  entries is used
        skills  : output of nlp_utils.extract_skills(); the 'technical' and
                  'soft' entries are read as sized collections (missing or
                  None entries count as empty)

    Returns:
        dict with:
            'total'      : overall score (0–100)
            'breakdown'  : per-category score dict
    """
    breakdown = {}
    # Treat a missing key and an explicit None the same way: no skills.
    technical_skills = skills.get("technical") or []
    soft_skills = skills.get("soft") or []

    # ── 1. Skills richness (0–20) ─────────────────────────────────────────
    tech_count = len(technical_skills)
    # 0 skills → 0, 5 skills → 10, 10+ skills → 20
    breakdown["Skills"] = min(20, int((tech_count / 10) * 20))

    # ── 2. Experience section (0–30) ──────────────────────────────────────
    if sections.get("experience"):
        # More experience-related content = higher score
        exp_text = _extract_section_text(
            text, ["experience", "employment", "work history"]
        )
        exp_words = len(exp_text.split())
        # 0 words = 0, 100+ words = 30; a detected section never scores
        # below 10 even when its body could not be extracted.
        exp_score = max(10, min(30, int((exp_words / 100) * 30)))
    else:
        exp_score = 0
    breakdown["Experience"] = exp_score

    # ── 3. Projects section (0–20) ────────────────────────────────────────
    if sections.get("projects"):
        proj_text = _extract_section_text(text, ["project"])
        proj_words = len(proj_text.split())
        # 60+ words = 20; a detected section never scores below 8.
        proj_score = max(8, min(20, int((proj_words / 60) * 20)))
    else:
        proj_score = 0
    breakdown["Projects"] = proj_score

    # ── 4. Education section (0–10) ───────────────────────────────────────
    breakdown["Education"] = 10 if sections.get("education") else 0

    # ── 5. Resume length (0–10) ───────────────────────────────────────────
    word_count = len(text.split())
    # Under 100 words scores 0; the score then ramps up and reaches the
    # full 10 points at 700 words (longer resumes are not penalised).
    if word_count >= 700:
        length_score = 10
    elif word_count >= 300:
        length_score = int(5 + ((word_count - 300) / 400) * 5)
    elif word_count >= 100:
        length_score = int((word_count / 300) * 5)
    else:
        length_score = 0
    breakdown["Length"] = length_score

    # ── 6. Skill diversity (0–10) ─────────────────────────────────────────
    # Reward having both technical AND soft skills, plus a summary section.
    has_tech = len(technical_skills) >= 3
    has_soft = len(soft_skills) >= 1
    # Coerce to bool: the section entry may be None, a string or a count,
    # and multiplying it directly would raise or skew the score.
    has_summary = bool(sections.get("summary"))
    diversity_score = (5 if has_tech else 0) + (3 if has_soft else 0) + (2 if has_summary else 0)
    breakdown["Diversity"] = min(10, diversity_score)

    total = sum(breakdown.values())
    return {
        "total": min(100, total),
        "breakdown": breakdown,
    }
def compute_ats_score(base_score: float, job_match_score: float) -> float:
    """
    Compute final ATS score.

    Formula: ATS = 0.6 × base_score + 0.4 × job_match_score
    The result is clamped to the range 0–100 so that out-of-range inputs
    (e.g. a negative match score) cannot produce an out-of-range ATS score.

    Args:
        base_score     : resume base score (0–100)
        job_match_score: job description match percentage (0–100)

    Returns:
        ATS score as a float (0–100), rounded to 1 decimal place.
    """
    ats = (0.6 * base_score) + (0.4 * job_match_score)
    # Enforce both ends of the documented range before rounding.
    clamped = max(0.0, min(100.0, ats))
    return round(clamped, 1)
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+def _extract_section_text(text: str, keywords: list) -> str:
+    """
+    Attempt to extract the content under a section heading.
+    Searches for lines containing any of the keywords and returns
+    all text until the next section-like heading.
+    Args:
+        text    : full resume text
+        keywords: list of lowercase keywords to identify the section heading
+    Returns:
+        Extracted section text (may be empty string).
+    """
+    lines = text.splitlines()
+    in_section = False
+    collected = []
+    # Common heading indicators (short, possibly title-cased lines)
+    def _is_heading(line: str) -> bool:
+        stripped = line.strip()
+        return (
+            len(stripped) < 60
+            and stripped
+            and stripped == stripped.upper()
+            or any(
+                kw in stripped.lower()
+                for kw in [
+                    "skills", "education", "experience", "project",
+                    "certification", "summary", "objective", "awards",
+                    "contact", "languages", "interests",
+                ]
+            )
         )
+    for line in lines:
+        line_lower = line.lower().strip()
+        if any(kw in line_lower for kw in keywords) and len(line.strip()) < 60:
+            in_section = True
+            continue
+        if in_section:
+            # Stop collecting at the next major heading
+            if _is_heading(line) and not any(kw in line.lower() for kw in keywords):
+                break
+            collected.append(line)
+    return " ".join(collected)