Spaces:

Niketjain2002
/

recruitment-intelligence

Sleeping

App Files Files Community

Niketjain2002 committed 4 days ago

Commit

cfd6cec

verified ·

1 Parent(s): 1ae96e6

Upload src/input_processor.py with huggingface_hub

Browse files

Files changed (1) hide show

src/input_processor.py +186 -0

src/input_processor.py ADDED Viewed

	@@ -0,0 +1,186 @@

+"""
+Input Processing Layer
+Parses and normalizes raw inputs (job description, company context, resume)
+into structured intermediate representations for feature extraction.
+"""
+import json
+import re
+from dataclasses import dataclass, field, asdict
+from typing import Optional
@dataclass
class CompanyContext:
    """Attributes of the hiring company supplied alongside a job description.

    All values are stored exactly as given. ``validate`` reports problems as
    messages instead of raising, so callers can surface them as warnings.
    """

    # One of: seed, series_a, series_b, series_c, growth, public, enterprise,
    # government, nonprofit (compared case-insensitively by ``validate``).
    stage: str
    industry: str
    compensation_band: str  # e.g. "$150K-$200K" or "L5 band"
    location: str
    remote_type: str  # onsite, hybrid, remote, flexible
    headcount: Optional[int] = None
    glassdoor_rating: Optional[float] = None

    @staticmethod
    def _normalized(value) -> str:
        """Return *value* as a trimmed, lower-cased string ("" for None)."""
        # Values often come straight from parsed JSON, where a field may be
        # null or padded; neither should crash validation.
        if value is None:
            return ""
        return str(value).strip().lower()

    def validate(self) -> list[str]:
        """Check ``stage`` and ``remote_type`` against their vocabularies.

        Comparison ignores case and surrounding whitespace. A missing
        (``None``) or non-string value is reported like any other unknown
        value rather than raising ``AttributeError``.

        Returns:
            A list of human-readable messages, empty when both fields are
            recognised. The stage message, if any, comes first.
        """
        valid_stages = {
            "seed",
            "series_a",
            "series_b",
            "series_c",
            "growth",
            "public",
            "enterprise",
            "government",
            "nonprofit",
        }
        valid_remote = {"onsite", "hybrid", "remote", "flexible"}

        errors = []
        if self._normalized(self.stage) not in valid_stages:
            errors.append(f"Unknown company stage: {self.stage}")
        if self._normalized(self.remote_type) not in valid_remote:
            errors.append(f"Unknown remote type: {self.remote_type}")
        return errors
@dataclass
class ProcessedInput:
    """Cleaned inputs plus the structures derived from them.

    Holds the cleaned job description and resume text, the company context,
    the per-section splits of both documents, a data quality score and any
    warnings gathered while processing.
    """

    job_description: str
    company_context: CompanyContext
    resume_text: str
    jd_sections: dict = field(default_factory=dict)
    resume_sections: dict = field(default_factory=dict)
    data_quality_score: float = 0.0
    warnings: list = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of this instance.

        Keys are the field names in declaration order, and
        ``company_context`` is expanded into a nested dict. The section dicts
        and the warnings list are copied, so mutating the returned structure
        never alters this instance.
        """
        # asdict() recurses into nested dataclasses and copies dict/list
        # containers, which is what detaches the snapshot from the instance.
        return asdict(self)
class InputProcessor:
    """Processes raw inputs into structured representations.

    Text is whitespace-normalised, split into sections with regex heuristics,
    and scored for completeness. Segmentation is best-effort: a section that
    cannot be located is left as an empty string.
    """

    # Scores strictly below this value add a LOW_DATA_QUALITY warning.
    _LOW_QUALITY_THRESHOLD = 0.3

    _SECTION_FLAGS = re.IGNORECASE | re.DOTALL

    # Each pattern reads: a header keyword ending its line, the section body
    # (group 1), then a lookahead for a line that starts with another
    # section's header word. The trailing \b stops ordinary words that merely
    # begin with a header word ("Experienced", "Worked") from cutting a
    # section short.
    _JD_PATTERNS = {
        "responsibilities": re.compile(
            r"(?:responsibilities|what you.?ll do|the role|job duties)[:\s]*\n(.*?)"
            r"(?=\n(?:requirements|qualifications|what we|preferred|benefits|about)\b|$)",
            _SECTION_FLAGS,
        ),
        "requirements": re.compile(
            r"(?:requirements|qualifications|what we.?re looking for|must have|minimum)[:\s]*\n(.*?)"
            r"(?=\n(?:preferred|nice to have|benefits|about|responsibilities)\b|$)",
            _SECTION_FLAGS,
        ),
        "preferred": re.compile(
            r"(?:preferred|nice to have|bonus|ideal)[:\s]*\n(.*?)"
            r"(?=\n(?:benefits|about|responsibilities|requirements)\b|$)",
            _SECTION_FLAGS,
        ),
        "benefits": re.compile(
            r"(?:benefits|perks|what we offer|compensation)[:\s]*\n(.*?)"
            r"(?=\n(?:about|responsibilities|requirements)\b|$)",
            _SECTION_FLAGS,
        ),
    }

    _RESUME_PATTERNS = {
        "summary": re.compile(
            r"(?:summary|profile|objective|about)[:\s]*\n(.*?)"
            r"(?=\n(?:experience|education|skills|projects|work)\b|$)",
            _SECTION_FLAGS,
        ),
        "experience": re.compile(
            r"(?:experience|work history|employment|professional background)[:\s]*\n(.*?)"
            r"(?=\n(?:education|skills|projects|certifications)\b|$)",
            _SECTION_FLAGS,
        ),
        "education": re.compile(
            r"(?:education|academic|degrees?)[:\s]*\n(.*?)"
            r"(?=\n(?:skills|projects|certifications|experience)\b|$)",
            _SECTION_FLAGS,
        ),
        "skills": re.compile(
            r"(?:skills|technical skills|technologies|competencies)[:\s]*\n(.*?)"
            r"(?=\n(?:projects|certifications|education|experience)\b|$)",
            _SECTION_FLAGS,
        ),
    }

    _JD_STRUCTURE_KEYWORDS = ("requirements", "qualifications", "responsibilities")

    # A standalone 19xx/20xx number; a bare \d{4} would also fire on phone
    # numbers and zip codes.
    _YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b")

    # Percentages, dollar amounts, or a count followed by a scale noun.
    _QUANTIFIED_RE = re.compile(
        r"\d+%|\$\d+|\d+\s*(?:users|customers|team|engineers)",
        re.IGNORECASE,
    )

    def process(
        self,
        job_description: str,
        company_context: dict,
        resume_text: str,
    ) -> "ProcessedInput":
        """Clean, segment and score one (job description, company, resume) triple.

        Args:
            job_description: Raw job description text.
            company_context: Keyword arguments for ``CompanyContext``.
            resume_text: Raw resume text.

        Returns:
            A ``ProcessedInput`` holding the cleaned texts, their section
            splits, the data quality score, and the warnings from context
            validation plus a LOW_DATA_QUALITY entry when the score is below
            the threshold.

        Raises:
            TypeError: If ``company_context`` has keys that are not
                ``CompanyContext`` fields or lacks a required one.
        """
        ctx = CompanyContext(**company_context)
        warnings = ctx.validate()

        jd_clean = self._clean_text(job_description)
        resume_clean = self._clean_text(resume_text)

        data_quality = self._assess_data_quality(jd_clean, resume_clean, ctx)
        if data_quality < self._LOW_QUALITY_THRESHOLD:
            warnings.append("LOW_DATA_QUALITY: Inputs may be too sparse for reliable scoring")

        return ProcessedInput(
            job_description=jd_clean,
            company_context=ctx,
            resume_text=resume_clean,
            jd_sections=self._segment_jd(jd_clean),
            resume_sections=self._segment_resume(resume_clean),
            data_quality_score=data_quality,
            warnings=warnings,
        )

    def _clean_text(self, text: str) -> str:
        """Normalise line endings and whitespace; ``None`` becomes ``""``.

        CRLF and bare CR become LF, every run of non-newline whitespace
        (including tabs and non-breaking spaces) becomes one space, trailing
        spaces are dropped from each line, runs of three or more newlines
        shrink to a single blank line, and the result is stripped.
        """
        if text is None:
            return ""
        text = re.sub(r"\r\n?", "\n", text)
        text = re.sub(r"[^\S\n]+", " ", text)
        # Runs are already single spaces, so this empties whitespace-only
        # lines; otherwise "\n \n \n" would survive the collapse below.
        text = text.replace(" \n", "\n")
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()

    @staticmethod
    def _match_sections(text: str, patterns: dict) -> dict:
        """Return ``{section: stripped body}`` for each pattern found in *text*."""
        found = {}
        for name, pattern in patterns.items():
            match = pattern.search(text)
            if match:
                found[name] = match.group(1).strip()
        return found

    def _segment_jd(self, jd: str) -> dict:
        """Heuristic segmentation of job description into sections.

        Returns a dict with keys ``title``, ``responsibilities``,
        ``requirements``, ``preferred``, ``benefits``, ``about`` and
        ``full_text``. ``title`` is the first non-empty line, ``full_text``
        is *jd* unchanged, and sections that are not found stay ``""``
        (``about`` is never populated by this method).
        """
        sections = {
            "title": "",
            "responsibilities": "",
            "requirements": "",
            "preferred": "",
            "benefits": "",
            "about": "",
            "full_text": jd,
        }
        sections.update(self._match_sections(jd, self._JD_PATTERNS))

        # Extract title from first non-empty line
        stripped_lines = (line.strip() for line in jd.split("\n"))
        sections["title"] = next((line for line in stripped_lines if line), "")
        return sections

    def _segment_resume(self, resume: str) -> dict:
        """Heuristic segmentation of resume.

        Returns a dict with keys ``contact``, ``summary``, ``experience``,
        ``education``, ``skills``, ``projects``, ``certifications`` and
        ``full_text``. Only ``summary``, ``experience``, ``education`` and
        ``skills`` are extracted; the remaining section keys stay ``""`` and
        ``full_text`` is *resume* unchanged.
        """
        sections = {
            "contact": "",
            "summary": "",
            "experience": "",
            "education": "",
            "skills": "",
            "projects": "",
            "certifications": "",
            "full_text": resume,
        }
        sections.update(self._match_sections(resume, self._RESUME_PATTERNS))
        return sections

    @staticmethod
    def _is_known(value) -> bool:
        """Return True when *value* is neither blank nor "unknown" (any case)."""
        if value is None:
            return False
        normalized = str(value).strip().lower()
        return bool(normalized) and normalized != "unknown"

    def _assess_data_quality(self, jd: str, resume: str, ctx: "CompanyContext") -> float:
        """Score 0-1 representing input completeness and richness.

        The score is the fraction of ten equally weighted signals that hold:
        three for the job description (two length thresholds, a structural
        keyword), four for the resume (two length thresholds, a year, a
        quantified achievement) and three for the company context
        (compensation band, industry and stage each present and not
        "unknown").
        """
        jd_lower = jd.lower()
        checks = (
            # JD quality
            len(jd) > 200,
            len(jd) > 500,
            any(keyword in jd_lower for keyword in self._JD_STRUCTURE_KEYWORDS),
            # Resume quality
            len(resume) > 300,
            len(resume) > 800,
            self._YEAR_RE.search(resume) is not None,
            self._QUANTIFIED_RE.search(resume) is not None,
            # Company context quality
            self._is_known(ctx.compensation_band),
            self._is_known(ctx.industry),
            self._is_known(ctx.stage),
        )
        return sum(checks) / len(checks)