Niketjain2002 committed on
Commit
cfd6cec
·
verified ·
1 Parent(s): 1ae96e6

Upload src/input_processor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/input_processor.py +186 -0
src/input_processor.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Input Processing Layer
3
+
4
+ Parses and normalizes raw inputs (job description, company context, resume)
5
+ into structured intermediate representations for feature extraction.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from dataclasses import dataclass, field, asdict
11
+ from typing import Optional
12
+
13
+
14
@dataclass
class CompanyContext:
    """Attributes of the hiring company that accompany a job description.

    The dataclass stores the values exactly as given; nothing is normalized
    or checked at construction time. Call :meth:`validate` to obtain a list
    of human-readable problems with the enumerated fields.
    """

    # Accepted (case-insensitively) by validate(): seed, series_a, series_b,
    # series_c, growth, public, enterprise, government, nonprofit.
    stage: str
    industry: str
    compensation_band: str  # e.g. "$150K-$200K" or "L5 band"
    location: str
    remote_type: str  # onsite, hybrid, remote, flexible
    headcount: Optional[int] = None
    glassdoor_rating: Optional[float] = None

    def validate(self) -> list[str]:
        """Return a message for every enumerated field holding an unknown value.

        Matching is case-insensitive. A value that is not a string (for
        example ``None``) is reported as unknown rather than raising, so a
        sparse context yields warnings instead of an ``AttributeError``.

        Returns:
            A list of error strings; empty when ``stage`` and ``remote_type``
            are both recognized.
        """
        valid_stages = {
            "seed",
            "series_a",
            "series_b",
            "series_c",
            "growth",
            "public",
            "enterprise",
            "government",
            "nonprofit",
        }
        valid_remote = {"onsite", "hybrid", "remote", "flexible"}

        errors = []
        if not (isinstance(self.stage, str) and self.stage.lower() in valid_stages):
            errors.append(f"Unknown company stage: {self.stage}")
        if not (
            isinstance(self.remote_type, str)
            and self.remote_type.lower() in valid_remote
        ):
            errors.append(f"Unknown remote type: {self.remote_type}")
        return errors
33
+
34
+
35
@dataclass
class ProcessedInput:
    """Bundle of cleaned inputs and the artefacts derived from them.

    Instances are built by ``InputProcessor.process``; the section dicts,
    quality score and warnings default to empty/zero when not supplied.
    """

    job_description: str
    company_context: CompanyContext
    resume_text: str
    jd_sections: dict = field(default_factory=dict)
    resume_sections: dict = field(default_factory=dict)
    data_quality_score: float = 0.0
    warnings: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the instance as a plain dict keyed by field name.

        ``company_context`` is converted with ``dataclasses.asdict``; every
        other value is placed in the result as-is (the same objects held by
        this instance, not copies).
        """
        ordered_names = (
            "job_description",
            "company_context",
            "resume_text",
            "jd_sections",
            "resume_sections",
            "data_quality_score",
            "warnings",
        )
        payload = {}
        for name in ordered_names:
            value = getattr(self, name)
            if name == "company_context":
                value = asdict(value)
            payload[name] = value
        return payload
55
+
56
+
57
class InputProcessor:
    """Processes raw inputs into structured representations.

    The public entry point is :meth:`process`. The remaining methods are
    regex/length heuristics: they never raise on unrecognized text, they just
    leave the corresponding section empty or the corresponding signal unset.
    """

    # Section headers are matched case-insensitively anywhere in the text and
    # "." must span newlines so a section body can cover several lines.
    _SECTION_FLAGS = re.IGNORECASE | re.DOTALL

    # header alternatives -> body captured lazily up to the next known header
    _JD_PATTERNS = {
        "responsibilities": r"(?:responsibilities|what you.?ll do|the role|job duties)[:\s]*\n(.*?)(?=\n(?:requirements|qualifications|what we|preferred|benefits|about)|$)",
        "requirements": r"(?:requirements|qualifications|what we.?re looking for|must have|minimum)[:\s]*\n(.*?)(?=\n(?:preferred|nice to have|benefits|about|responsibilities)|$)",
        "preferred": r"(?:preferred|nice to have|bonus|ideal)[:\s]*\n(.*?)(?=\n(?:benefits|about|responsibilities|requirements)|$)",
        "benefits": r"(?:benefits|perks|what we offer|compensation)[:\s]*\n(.*?)(?=\n(?:about|responsibilities|requirements)|$)",
    }

    _RESUME_PATTERNS = {
        "summary": r"(?:summary|profile|objective|about)[:\s]*\n(.*?)(?=\n(?:experience|education|skills|projects|work)|$)",
        "experience": r"(?:experience|work history|employment|professional background)[:\s]*\n(.*?)(?=\n(?:education|skills|projects|certifications)|$)",
        "education": r"(?:education|academic|degrees?)[:\s]*\n(.*?)(?=\n(?:skills|projects|certifications|experience)|$)",
        "skills": r"(?:skills|technical skills|technologies|competencies)[:\s]*\n(.*?)(?=\n(?:projects|certifications|education|experience)|$)",
    }

    # A standalone 19xx/20xx number; the digit lookarounds keep longer digit
    # runs (phone numbers, IDs, postal codes) from counting as a year.
    _YEAR_RE = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")

    # Percentages, dollar amounts, or counts of people/users.
    _METRIC_RE = re.compile(
        r"\d+%|\$\d+|\d+\s*(users|customers|team|engineers)", re.IGNORECASE
    )

    def process(
        self,
        job_description: str,
        company_context: dict,
        resume_text: str,
    ) -> "ProcessedInput":
        """Clean, segment and score the raw inputs.

        Args:
            job_description: Raw job-description text.
            company_context: Keyword arguments for ``CompanyContext``.
            resume_text: Raw resume text.

        Returns:
            A ``ProcessedInput`` holding the cleaned texts, the segmented
            sections, the data-quality score and any warnings. Warnings are
            the ``CompanyContext.validate()`` messages, plus a
            ``LOW_DATA_QUALITY`` entry when the score is below 0.3.

        Raises:
            TypeError: If ``company_context`` is missing a required
                ``CompanyContext`` field or contains an unexpected key.
        """
        ctx = CompanyContext(**company_context)
        warnings = ctx.validate()

        jd_clean = self._clean_text(job_description)
        resume_clean = self._clean_text(resume_text)

        jd_sections = self._segment_jd(jd_clean)
        resume_sections = self._segment_resume(resume_clean)

        data_quality = self._assess_data_quality(jd_clean, resume_clean, ctx)
        if data_quality < 0.3:
            warnings.append(
                "LOW_DATA_QUALITY: Inputs may be too sparse for reliable scoring"
            )

        return ProcessedInput(
            job_description=jd_clean,
            company_context=ctx,
            resume_text=resume_clean,
            jd_sections=jd_sections,
            resume_sections=resume_sections,
            data_quality_score=data_quality,
            warnings=warnings,
        )

    def _clean_text(self, text: str) -> str:
        """Normalize line endings and whitespace.

        ``\\r\\n`` and lone ``\\r`` become ``\\n``; runs of spaces/tabs become
        one space; trailing spaces are dropped from each line; three or more
        consecutive newlines become exactly two; the result is stripped.
        ``None`` or empty input yields ``""``.
        """
        if not text:
            return ""
        text = re.sub(r"\r\n?", "\n", text)
        text = re.sub(r"[ \t]+", " ", text)
        # After the collapse above a line ends in at most one space. Removing
        # it turns whitespace-only lines into truly empty ones, which the
        # blank-line collapse below would otherwise fail to see.
        text = text.replace(" \n", "\n")
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    def _extract_sections(self, text: str, patterns: dict, sections: dict) -> dict:
        """Fill ``sections`` with the first match of each pattern in ``text``."""
        for name, pattern in patterns.items():
            found = re.search(pattern, text, self._SECTION_FLAGS)
            if found:
                sections[name] = found.group(1).strip()
        return sections

    def _segment_jd(self, jd: str) -> dict:
        """Heuristic segmentation of job description into sections.

        Returns a dict with keys ``title``, ``responsibilities``,
        ``requirements``, ``preferred``, ``benefits``, ``about`` and
        ``full_text``. Sections that are not found stay ``""``. No pattern
        targets ``about``, so it is always ``""``.
        """
        sections = {
            "title": "",
            "responsibilities": "",
            "requirements": "",
            "preferred": "",
            "benefits": "",
            "about": "",
            "full_text": jd,
        }
        self._extract_sections(jd, self._JD_PATTERNS, sections)

        # The title is taken to be the first non-empty line.
        sections["title"] = next(
            (line.strip() for line in jd.split("\n") if line.strip()), ""
        )
        return sections

    def _segment_resume(self, resume: str) -> dict:
        """Heuristic segmentation of resume.

        Returns a dict with keys ``contact``, ``summary``, ``experience``,
        ``education``, ``skills``, ``projects``, ``certifications`` and
        ``full_text``. Only ``summary``, ``experience``, ``education`` and
        ``skills`` have patterns; the other sections are always ``""``.
        """
        sections = {
            "contact": "",
            "summary": "",
            "experience": "",
            "education": "",
            "skills": "",
            "projects": "",
            "certifications": "",
            "full_text": resume,
        }
        return self._extract_sections(resume, self._RESUME_PATTERNS, sections)

    @staticmethod
    def _is_known(value) -> bool:
        """Return True when ``value`` is non-blank and not "unknown" (any case)."""
        if not value:
            return False
        text = str(value).strip().lower()
        return bool(text) and text != "unknown"

    def _assess_data_quality(self, jd: str, resume: str, ctx: "CompanyContext") -> float:
        """Score 0-1 representing input completeness and richness.

        Ten equally weighted yes/no signals are checked: three on the job
        description, four on the resume and three on the company context.
        The score is the fraction of signals present.
        """
        jd_lower = jd.lower()
        signals = (
            # JD quality
            len(jd) > 200,
            len(jd) > 500,
            any(
                kw in jd_lower
                for kw in ("requirements", "qualifications", "responsibilities")
            ),
            # Resume quality
            len(resume) > 300,
            len(resume) > 800,
            self._YEAR_RE.search(resume) is not None,  # contains years
            self._METRIC_RE.search(resume) is not None,  # quantified achievements
            # Company context quality ("unknown" compared case-insensitively,
            # matching how CompanyContext.validate lower-cases its inputs)
            self._is_known(ctx.compensation_band),
            self._is_known(ctx.industry),
            self._is_known(ctx.stage),
        )
        return sum(signals) / len(signals)