Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import random | |
| import uuid | |
| import logging | |
| from typing import Optional | |
| try: | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| except ImportError: | |
| pytesseract = None | |
| from models import Resume | |
| logger = logging.getLogger("OCR_Parser") | |
def perform_ocr(pdf_path: str) -> str:
    """Extract raw text from a PDF by converting it to images and running OCR.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR text of every converted image, each followed by a newline.
        If the OCR libraries could not be imported, a fixed simulated string
        is returned instead. If conversion or OCR raises an ``Exception``,
        the error is logged with its traceback and a fixed fallback string
        is returned; the exception is not propagated.

    NOTE(review): both placeholder strings contain resume-like content
    (skills, years of experience), so a caller that parses the result cannot
    distinguish a failed OCR run from a real document. Consider signalling
    failure explicitly once callers are able to handle it.
    """
    if pytesseract is None:
        logger.warning("OCR libraries not found. Simulating OCR text extraction.")
        return "Simulated OCR Text: Developer with 5 years experience in Python, AWS. BSc in Computer Science."

    try:
        # Convert the PDF to a list of images, then OCR each one.
        images = convert_from_path(pdf_path)
        # Build the result with a single join instead of growing a string
        # image by image; the output is identical (text + "\n" per image).
        return "".join(pytesseract.image_to_string(img) + "\n" for img in images)
    except Exception as e:
        # Deliberate best-effort boundary: keep the pipeline running, but
        # record the full traceback so the failure can be diagnosed.
        logger.exception("OCR Failure: %s", e)
        # Fallback raw payload
        return "Fallback text due to OCR error: Python, Java, 10 years experience."
def _skill_pattern(skill):
    """Compile a case-insensitive pattern matching *skill* as a standalone term."""
    # Multi-word skills may be split across lines by OCR, so accept any run
    # of whitespace between the words.
    body = r"\s+".join(re.escape(word) for word in skill.split())
    # Reject matches glued to other letters ("Java" in "JavaScript", "AWS" in
    # "laws", "Git" in "digital") while still allowing digits/punctuation
    # around the term ("Python3", "C++11", "SQL,").
    return re.compile(rf"(?<![A-Za-z]){body}(?![A-Za-z])", re.IGNORECASE)


# Skills searched for in the OCR text, in the order they are reported.
_DETECTABLE_SKILLS = (
    "Python", "Java", "C++", "SQL", "Machine Learning", "Data Analysis",
    "Project Management", "React", "AWS", "Docker", "Git", "Kubernetes",
    "FastAPI",
)
_SKILL_PATTERNS = tuple((skill, _skill_pattern(skill)) for skill in _DETECTABLE_SKILLS)

# Highest degree first: the first pattern that matches wins.
# Word boundaries keep short abbreviations from matching inside ordinary
# words ("ms" in "systems", "ba" in "database", "bs" in "jobs").
_EDUCATION_PATTERNS = (
    ("PhD", re.compile(r"\b(?:ph\.?\s?d|doctorate)\b", re.IGNORECASE)),
    ("Master's", re.compile(
        r"\b(?:master(?:['\u2019]?s)?|m\.?sc|mba|m\.?s)\b", re.IGNORECASE)),
    ("Bachelor's", re.compile(
        r"\b(?:bachelor(?:['\u2019]?s)?|b\.?sc|b\.?a|b\.?s)\b", re.IGNORECASE)),
)

_EXPERIENCE_PATTERN = re.compile(
    r'(\d+)\s*[-+]*\s*years?(?:\s*of)?\s*experience', re.IGNORECASE)


def _find_skills(raw_text):
    """Return the known skills that occur in *raw_text* as standalone terms."""
    return [skill for skill, pattern in _SKILL_PATTERNS if pattern.search(raw_text)]


def _detect_education(raw_text):
    """Return the highest degree mentioned in *raw_text*, else "High School"."""
    for label, pattern in _EDUCATION_PATTERNS:
        if pattern.search(raw_text):
            return label
    return "High School"


def structure_resume_from_text(raw_text: str, *, reference_year: int = 2020) -> Resume:
    """Map raw OCR text onto a ``Resume`` using regexes and keyword heuristics.

    Args:
        raw_text: Text extracted from the resume. ``None`` is treated as an
            empty string.
        reference_year: Year from which ``graduation_year`` is derived as
            ``reference_year - experience_years``. Defaults to 2020, the
            value that was previously hard-coded.

    Returns:
        A ``Resume`` populated as follows:

        * ``experience_years``: the number in the first "<N> years [of]
          experience" phrase, or a random integer in 1..10 when no such
          phrase is found.
        * ``skills``: the known skills found as standalone terms, or
          ``["Communication", "Problem Solving"]`` when none is found.
        * ``education``: "PhD", "Master's", "Bachelor's" or "High School"
          (the default), taking the highest degree mentioned.
        * ``candidate_id``, ``name``, ``email``: placeholders built from a
          random 6-character id.
        * ``previous_roles``, ``name_gender_proxy``,
          ``name_ethnicity_proxy``: fixed placeholder values.
    """
    raw_text = raw_text or ""

    # 1. Experience years.
    # NOTE(review): the random fallback fabricates a nondeterministic value
    # when the text states no experience; kept because downstream behavior
    # may rely on it, but it should probably become an explicit "unknown".
    exp_match = _EXPERIENCE_PATTERN.search(raw_text)
    exp_years = int(exp_match.group(1)) if exp_match else random.randint(1, 10)

    # 2. Skills (fallback keeps the list non-empty).
    found_skills = _find_skills(raw_text) or ["Communication", "Problem Solving"]

    # 3. Education.
    education = _detect_education(raw_text)

    # 4. Proxies: OCR alone cannot provide demographic metadata, so neutral
    # placeholder values are used.
    candidate_id = f"OCR_{str(uuid.uuid4())[:6].upper()}"

    return Resume(
        candidate_id=candidate_id,
        name=f"Applicant {candidate_id}",
        email=f"applicant.{candidate_id}@domain.com",
        skills=found_skills,
        experience_years=exp_years,
        education=education,
        previous_roles=["Extracted Role"],
        name_gender_proxy="N",  # Neutral
        name_ethnicity_proxy="Other",
        graduation_year=reference_year - exp_years,
    )
def process_pdf_to_resume(pdf_path: str) -> Resume:
    """Run the complete pipeline for one PDF: OCR first, then structuring.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The ``Resume`` produced by ``structure_resume_from_text`` from the
        text that ``perform_ocr`` returned for ``pdf_path``.
    """
    logger.info(f"Starting OCR fetch on {pdf_path}")

    # Feed the OCR output straight into the structuring step.
    candidate = structure_resume_from_text(perform_ocr(pdf_path))

    skill_count = len(candidate.skills)
    logger.info(f"Mapped Candidate: {candidate.candidate_id} with {skill_count} skills.")
    return candidate