"""
Input Processing Layer

Parses and normalizes raw inputs (job description, company context, resume)
into structured intermediate representations for feature extraction.
"""
| |
|
| | import json |
| | import re |
| | from dataclasses import dataclass, field, asdict |
| | from typing import Optional |
| |
|
| |
|
@dataclass
class CompanyContext:
    """Structured company-side context supplied alongside a job description.

    Only ``stage`` and ``remote_type`` are checked by :meth:`validate`; the
    remaining fields are stored as given.
    """

    # Allowed values, compared after trimming and lower-casing. These are
    # deliberately un-annotated so the dataclass does not treat them as fields.
    _VALID_STAGES = frozenset({
        "seed", "series_a", "series_b", "series_c", "growth",
        "public", "enterprise", "government", "nonprofit",
    })
    _VALID_REMOTE_TYPES = frozenset({"onsite", "hybrid", "remote", "flexible"})

    stage: str
    industry: str
    compensation_band: str
    location: str
    remote_type: str
    headcount: Optional[int] = None
    glassdoor_rating: Optional[float] = None

    @staticmethod
    def _normalize(value: object) -> str:
        """Return *value* trimmed and lower-cased, or ``""`` if it is not a string."""
        return value.strip().lower() if isinstance(value, str) else ""

    def validate(self) -> list[str]:
        """Return human-readable problems found in this context.

        The comparison ignores case and surrounding whitespace. A non-string
        value (for example ``None``) is reported as unknown instead of raising.

        Returns:
            A new list of error messages; empty when ``stage`` and
            ``remote_type`` are both recognised.
        """
        errors: list[str] = []
        if self._normalize(self.stage) not in self._VALID_STAGES:
            errors.append(f"Unknown company stage: {self.stage}")
        if self._normalize(self.remote_type) not in self._VALID_REMOTE_TYPES:
            errors.append(f"Unknown remote type: {self.remote_type}")
        return errors
| |
|
| |
|
@dataclass
class ProcessedInput:
    """Cleaned inputs plus the intermediate structures derived from them."""

    job_description: str
    company_context: "CompanyContext"
    resume_text: str
    # Section name -> extracted section text.
    jd_sections: dict[str, str] = field(default_factory=dict)
    resume_sections: dict[str, str] = field(default_factory=dict)
    data_quality_score: float = 0.0
    warnings: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of this instance.

        ``asdict`` walks the fields in declaration order and recursively copies
        nested dataclasses, dicts and lists. The result has the keys
        ``job_description``, ``company_context``, ``resume_text``,
        ``jd_sections``, ``resume_sections``, ``data_quality_score`` and
        ``warnings``, and shares no mutable state with this object.
        """
        return asdict(self)
| |
|
| |
|
class InputProcessor:
    """Processes raw inputs into structured representations."""

    # Each pattern captures the text that follows a header line, up to the
    # next line starting with one of the listed follow-on headers (or the end
    # of the text). Compiled once here instead of on every call.
    # NOTE(review): header keywords are not anchored to the start of a line,
    # so a sentence that merely ends in a keyword (e.g. "... care about")
    # can be mistaken for a header.
    _JD_PATTERNS = {
        name: re.compile(pattern, re.IGNORECASE | re.DOTALL)
        for name, pattern in {
            "responsibilities": r"(?:responsibilities|what you.?ll do|the role|job duties)[:\s]*\n(.*?)(?=\n(?:requirements|qualifications|what we|preferred|benefits|about)|$)",
            "requirements": r"(?:requirements|qualifications|what we.?re looking for|must have|minimum)[:\s]*\n(.*?)(?=\n(?:preferred|nice to have|benefits|about|responsibilities)|$)",
            "preferred": r"(?:preferred|nice to have|bonus|ideal)[:\s]*\n(.*?)(?=\n(?:benefits|about|responsibilities|requirements)|$)",
            "benefits": r"(?:benefits|perks|what we offer|compensation)[:\s]*\n(.*?)(?=\n(?:about|responsibilities|requirements)|$)",
        }.items()
    }
    _RESUME_PATTERNS = {
        name: re.compile(pattern, re.IGNORECASE | re.DOTALL)
        for name, pattern in {
            "summary": r"(?:summary|profile|objective|about)[:\s]*\n(.*?)(?=\n(?:experience|education|skills|projects|work)|$)",
            "experience": r"(?:experience|work history|employment|professional background)[:\s]*\n(.*?)(?=\n(?:education|skills|projects|certifications)|$)",
            "education": r"(?:education|academic|degrees?)[:\s]*\n(.*?)(?=\n(?:skills|projects|certifications|experience)|$)",
            "skills": r"(?:skills|technical skills|technologies|competencies)[:\s]*\n(.*?)(?=\n(?:projects|certifications|education|experience)|$)",
        }.items()
    }

    def process(
        self,
        job_description: str,
        company_context: dict,
        resume_text: str,
    ) -> "ProcessedInput":
        """Clean, segment and score the raw inputs.

        Args:
            job_description: Raw job-description text.
            company_context: Keyword arguments for ``CompanyContext``.
            resume_text: Raw resume text.

        Returns:
            A ``ProcessedInput`` holding the cleaned texts, their sections,
            the data-quality score and any validation warnings.

        Raises:
            TypeError: If ``company_context`` has missing or unexpected keys
                for the ``CompanyContext`` constructor.
        """
        ctx = CompanyContext(**company_context)
        warnings = ctx.validate()

        jd_clean = self._clean_text(job_description)
        resume_clean = self._clean_text(resume_text)

        data_quality = self._assess_data_quality(jd_clean, resume_clean, ctx)
        if data_quality < 0.3:
            warnings.append("LOW_DATA_QUALITY: Inputs may be too sparse for reliable scoring")

        return ProcessedInput(
            job_description=jd_clean,
            company_context=ctx,
            resume_text=resume_clean,
            jd_sections=self._segment_jd(jd_clean),
            resume_sections=self._segment_resume(resume_clean),
            data_quality_score=data_quality,
            warnings=warnings,
        )

    def _clean_text(self, text: str) -> str:
        """Normalize line endings and whitespace.

        Converts ``\\r\\n`` and lone ``\\r`` to ``\\n``, drops trailing
        spaces/tabs on each line, collapses runs of spaces/tabs to one space,
        limits consecutive newlines to two, and strips both ends. ``None`` or
        an empty string yields ``""``.
        """
        if not text:
            return ""
        text = re.sub(r"\r\n?", "\n", text)
        # Trailing whitespace must go before the blank-line collapse:
        # otherwise lines holding only spaces keep the newlines apart and
        # the \n{3,} rule never fires.
        text = re.sub(r"[ \t]+\n", "\n", text)
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    @staticmethod
    def _fill_sections(sections: dict, text: str, patterns: dict) -> dict:
        """Store the stripped first capture of each matching pattern in *sections*."""
        for name, pattern in patterns.items():
            match = pattern.search(text)
            if match:
                sections[name] = match.group(1).strip()
        return sections

    def _segment_jd(self, jd: str) -> dict:
        """Heuristic segmentation of job description into sections.

        Returns a dict with keys ``title``, ``responsibilities``,
        ``requirements``, ``preferred``, ``benefits``, ``about`` and
        ``full_text``. Sections that are not found stay ``""``; ``about`` is
        never populated by this method. ``title`` is the first non-blank line.
        """
        sections = {
            "title": "",
            "responsibilities": "",
            "requirements": "",
            "preferred": "",
            "benefits": "",
            "about": "",
            "full_text": jd,
        }
        self._fill_sections(sections, jd, self._JD_PATTERNS)

        first_line = next((ln.strip() for ln in jd.split("\n") if ln.strip()), None)
        if first_line is not None:
            sections["title"] = first_line

        return sections

    def _segment_resume(self, resume: str) -> dict:
        """Heuristic segmentation of resume.

        Returns a dict with keys ``contact``, ``summary``, ``experience``,
        ``education``, ``skills``, ``projects``, ``certifications`` and
        ``full_text``. Only ``summary``, ``experience``, ``education`` and
        ``skills`` are extracted; the other section keys stay ``""``.
        """
        sections = {
            "contact": "",
            "summary": "",
            "experience": "",
            "education": "",
            "skills": "",
            "projects": "",
            "certifications": "",
            "full_text": resume,
        }
        return self._fill_sections(sections, resume, self._RESUME_PATTERNS)

    @staticmethod
    def _is_known(value: object) -> bool:
        """Return True when *value* is non-empty and not the "unknown" placeholder.

        The comparison ignores case and surrounding whitespace.
        """
        text = str(value).strip().lower() if value else ""
        return text not in ("", "unknown")

    def _assess_data_quality(self, jd: str, resume: str, ctx: "CompanyContext") -> float:
        """Score 0-1 representing input completeness and richness.

        The score is the fraction of the checks below that pass, so the
        denominator always matches the number of checks.
        """
        jd_lower = jd.lower()
        checks = [
            # Job description: length tiers and presence of a section keyword.
            len(jd) > 200,
            len(jd) > 500,
            any(kw in jd_lower for kw in ("requirements", "qualifications", "responsibilities")),
            # Resume: length tiers, any four-digit run (presumably years -
            # confirm), and a quantified claim (percentage, dollar amount, or
            # a count of users/customers/team/engineers).
            len(resume) > 300,
            len(resume) > 800,
            re.search(r"\d{4}", resume) is not None,
            re.search(r"\d+%|\$\d+|\d+\s*(users|customers|team|engineers)", resume, re.IGNORECASE) is not None,
            # Company context: fields that carry a real value.
            self._is_known(ctx.compensation_band),
            self._is_known(ctx.industry),
            self._is_known(ctx.stage),
        ]
        return sum(checks) / len(checks)
| |
|