chirag1121 committed on
Commit
59d43ff
·
verified ·
1 Parent(s): 8913fce

Update utils/scorer.py

Browse files
Files changed (1) hide show
  1. utils/scorer.py +142 -81
utils/scorer.py CHANGED
@@ -1,108 +1,169 @@
1
  """
2
- parser.py — Resume file parsing module.
3
-
4
- Handles text extraction from PDF and DOCX files.
5
- Uses PyMuPDF for PDFs and python-docx for Word documents.
 
 
 
 
 
 
 
 
 
 
6
  """
7
 
8
- import io
9
- import fitz # PyMuPDF
10
- from docx import Document
11
 
12
 
13
- def extract_text_from_pdf(file_bytes: bytes) -> str:
 
 
 
 
14
  """
15
- Extract all text from a PDF file given its raw bytes.
16
 
17
  Args:
18
- file_bytes: Raw bytes of the PDF file.
 
 
19
 
20
  Returns:
21
- Extracted text as a single string, or empty string on failure.
22
- """
23
- try:
24
- pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
25
- text_parts = []
26
- for page_num in range(len(pdf_doc)):
27
- page = pdf_doc[page_num]
28
- text_parts.append(page.get_text("text"))
29
- pdf_doc.close()
30
- return "\n".join(text_parts).strip()
31
- except Exception as e:
32
- print(f"[parser] PDF extraction error: {e}")
33
- return ""
34
-
35
-
36
- def extract_text_from_docx(file_bytes: bytes) -> str:
37
  """
38
- Extract all text from a DOCX file given its raw bytes.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- Args:
41
- file_bytes: Raw bytes of the DOCX file.
42
 
43
- Returns:
44
- Extracted text as a single string, or empty string on failure.
45
  """
46
- try:
47
- doc = Document(io.BytesIO(file_bytes))
48
- paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]
49
- # Also extract text from tables
50
- for table in doc.tables:
51
- for row in table.rows:
52
- for cell in row.cells:
53
- if cell.text.strip():
54
- paragraphs.append(cell.text.strip())
55
- return "\n".join(paragraphs).strip()
56
- except Exception as e:
57
- print(f"[parser] DOCX extraction error: {e}")
58
- return ""
59
-
60
-
61
- def parse_resume(uploaded_file) -> dict:
62
- """
63
- Main entry point: parse an uploaded Streamlit file object.
64
 
65
- Detects file type and routes to the correct extractor.
 
66
 
67
  Args:
68
- uploaded_file: Streamlit UploadedFile object.
 
69
 
70
  Returns:
71
- dict with keys:
72
- - 'text' : extracted resume text (str)
73
- - 'filename' : original file name (str)
74
- - 'file_type': 'pdf' | 'docx' | 'unknown'
75
- - 'error' : error message if extraction failed (str | None)
76
  """
77
- result = {
78
- "text": "",
79
- "filename": uploaded_file.name,
80
- "file_type": "unknown",
81
- "error": None,
82
- }
83
 
84
- file_bytes = uploaded_file.read()
85
 
86
- if not file_bytes:
87
- result["error"] = "Uploaded file is empty."
88
- return result
89
 
90
- filename_lower = uploaded_file.name.lower()
 
 
91
 
92
- if filename_lower.endswith(".pdf"):
93
- result["file_type"] = "pdf"
94
- result["text"] = extract_text_from_pdf(file_bytes)
95
- elif filename_lower.endswith(".docx"):
96
- result["file_type"] = "docx"
97
- result["text"] = extract_text_from_docx(file_bytes)
98
- else:
99
- result["error"] = "Unsupported file type. Please upload a PDF or DOCX."
100
- return result
101
 
102
- if not result["text"]:
103
- result["error"] = (
104
- "Could not extract text from the file. "
105
- "The file may be image-based or corrupted."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  )
107
 
108
- return result
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ scorer.py — Resume scoring module.
3
+
4
+ Computes:
5
+ 1. Resume Base Score (0–100) based on resume content analysis
6
+ 2. ATS Score (0–100) combining base score + job match similarity
7
+
8
+ Scoring rubric (Base Score):
9
+ - Skills richness : up to 20 pts
10
+ - Experience section: up to 30 pts
11
+ - Projects section : up to 20 pts
12
+ - Education section : up to 10 pts
13
+ - Resume length : up to 10 pts
14
+ - Skill diversity : up to 10 pts
15
+ TOTAL : 100 pts
16
  """
17
 
18
+ import math
 
 
19
 
20
 
21
def compute_base_score(
    text: str,
    sections: dict,
    skills: dict,
) -> dict:
    """
    Compute the resume base score from its content.

    Six categories are scored independently and summed:
    Skills (0-20), Experience (0-30), Projects (0-20), Education (0-10),
    Length (0-10) and Diversity (0-10).

    Args:
        text    : full resume text
        sections: output of nlp_utils.detect_sections(); only the
                  truthiness of the "experience", "projects", "education"
                  and "summary" entries is used here
        skills  : output of nlp_utils.extract_skills(); the "technical"
                  and "soft" entries are read as lists (missing or None
                  is treated as empty)

    Returns:
        dict with:
            'total'     : overall score (0-100)
            'breakdown' : per-category score dict with the keys "Skills",
                          "Experience", "Projects", "Education", "Length"
                          and "Diversity"
    """
    technical = skills.get("technical") or []
    soft = skills.get("soft") or []
    breakdown = {}

    # 1. Skills richness (0-20): 0 skills -> 0, 5 skills -> 10, 10+ -> 20.
    breakdown["Skills"] = min(20, int((len(technical) / 10) * 20))

    # 2. Experience section (0-30): scales with the number of words found
    #    under the heading (100+ words -> 30). A detected section never
    #    scores below 10, even if no text could be extracted for it.
    if sections.get("experience"):
        exp_text = _extract_section_text(
            text, ["experience", "employment", "work history"]
        )
        exp_words = len(exp_text.split())
        breakdown["Experience"] = max(10, min(30, int((exp_words / 100) * 30)))
    else:
        breakdown["Experience"] = 0

    # 3. Projects section (0-20): 60+ words -> 20, with a floor of 8 once
    #    the section has been detected.
    if sections.get("projects"):
        proj_text = _extract_section_text(text, ["project"])
        proj_words = len(proj_text.split())
        breakdown["Projects"] = max(8, min(20, int((proj_words / 60) * 20)))
    else:
        breakdown["Projects"] = 0

    # 4. Education section (0-10): all-or-nothing on presence.
    breakdown["Education"] = 10 if sections.get("education") else 0

    # 5. Resume length (0-10): below 100 words -> 0, 100-299 words ramps
    #    1..4, 300-699 words ramps 5..9, 700+ words -> 10.
    word_count = len(text.split())
    if word_count >= 700:
        length_score = 10
    elif word_count >= 300:
        length_score = int(5 + ((word_count - 300) / 400) * 5)
    elif word_count >= 100:
        length_score = int((word_count / 300) * 5)
    else:
        length_score = 0
    breakdown["Length"] = length_score

    # 6. Skill diversity (0-10): rewards technical AND soft skills plus a
    #    summary section. The summary flag is coerced to bool so that a
    #    non-boolean section value (e.g. the section text) cannot leak into
    #    the arithmetic below.
    has_tech = len(technical) >= 3
    has_soft = len(soft) >= 1
    has_summary = bool(sections.get("summary"))
    diversity_score = (
        (5 if has_tech else 0)
        + (3 if has_soft else 0)
        + (2 if has_summary else 0)
    )
    breakdown["Diversity"] = min(10, diversity_score)

    total = sum(breakdown.values())

    return {
        "total": min(100, total),
        "breakdown": breakdown,
    }
99
 
 
 
100
 
101
def compute_ats_score(base_score: float, job_match_score: float) -> float:
    """
    Compute final ATS score.

    Formula: ATS = 0.6 × base_score + 0.4 × job_match_score
    The result is clamped to the range 0-100, so out-of-range inputs can
    never produce a score outside the documented bounds.

    Args:
        base_score     : resume base score (0-100)
        job_match_score: job description match percentage (0-100)

    Returns:
        ATS score as a float (0-100), rounded to 1 decimal place.
    """
    ats = (0.6 * base_score) + (0.4 * job_match_score)
    # Clamp on both sides: the upper bound alone let negative inputs through.
    return round(max(0.0, min(100.0, ats)), 1)
 
 
 
 
117
 
 
118
 
119
+ # ---------------------------------------------------------------------------
120
+ # Internal helpers
121
+ # ---------------------------------------------------------------------------
122
 
123
def _extract_section_text(text: str, keywords: list) -> str:
    """
    Attempt to extract the content under a section heading.

    A short line (under 60 characters) containing any of the keywords opens
    the section. Following lines are collected until the next heading-like
    line that does not mention the keywords.

    Args:
        text    : full resume text
        keywords: list of lowercase keywords to identify the section heading

    Returns:
        Extracted section text with lines joined by single spaces
        (may be empty string).
    """
    max_heading_len = 60
    heading_markers = (
        "skills", "education", "experience", "project",
        "certification", "summary", "objective", "awards",
        "contact", "languages", "interests",
    )

    def _is_heading(line: str) -> bool:
        # A heading is a short, non-empty line that is either written in
        # capitals or mentions a well-known section name. The length limit
        # applies to BOTH alternatives; otherwise an ordinary sentence that
        # happens to contain e.g. "project" would end the section early.
        stripped = line.strip()
        if not stripped or len(stripped) >= max_heading_len:
            return False
        # isupper() needs at least one cased character, so letter-free lines
        # such as date ranges or phone numbers are not mistaken for
        # all-caps headings.
        if stripped.isupper():
            return True
        lowered = stripped.lower()
        return any(marker in lowered for marker in heading_markers)

    in_section = False
    collected = []

    for line in text.splitlines():
        stripped = line.strip()
        mentions_target = any(kw in stripped.lower() for kw in keywords)

        # A short line naming the target section (re)opens it; the heading
        # line itself is not part of the content.
        if mentions_target and len(stripped) < max_heading_len:
            in_section = True
            continue
        if not in_section:
            continue
        # Stop collecting at the next major heading.
        if _is_heading(line) and not mentions_target:
            break
        collected.append(line)

    return " ".join(collected)