# HR-Resume-Analyzer / text_processing.py
# DreamStream-1
# Create text_processing.py
# d052038 (verified)
import re
from datetime import datetime
def preprocess_text(text):
    """Normalize raw text for downstream matching.

    The text is lower-cased, every character that is not a word character,
    whitespace, or a hyphen is replaced by a space, and runs of whitespace
    are collapsed to a single space. Leading and trailing whitespace is
    removed from the result.
    """
    lowered = text.lower()
    # Hyphens survive on purpose: they separate the two ends of a date range.
    depunctuated = re.sub(r'[^\w\s-]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', depunctuated)
    return collapsed.strip()
def _is_month_year(value):
    """Return True if *value* parses as '<month name> <year>' (full or abbreviated)."""
    for fmt in ('%B %Y', '%b %Y'):
        try:
            datetime.strptime(value, fmt)
            return True
        except ValueError:
            continue
    return False


def extract_dates(text):
    """Extract ``(start, end)`` date-range strings from *text*.

    Three range shapes are recognised, tried from most to least specific:

    * ``YYYY/MM - YYYY/MM``
    * ``<month name> YYYY - <month name> YYYY``
    * ``YYYY - YYYY``

    In every shape the end may also be the word ``present`` or ``current``.
    Matching is case-insensitive and the captured strings are returned
    exactly as they appear in *text*.

    Args:
        text: The text to scan.

    Returns:
        A list of ``(start, end)`` string tuples, grouped by pattern in the
        order listed above. A stretch of text is reported at most once: if a
        more specific pattern already accepted a range, a less specific
        pattern matching the same characters is ignored.
    """
    # Each entry is (regex, whether the captured ends must name a real month).
    date_patterns = [
        (r'(\d{4}/\d{2})\s*-\s*(\d{4}/\d{2}|present|current)', False),
        (r'(\w+\s+\d{4})\s*-\s*(\w+\s+\d{4}|present|current)', True),
        (r'(\d{4})\s*-\s*(\d{4}|present|current)', False),
    ]
    open_ended = ('present', 'current')

    dates = []
    claimed_spans = []
    for pattern, needs_month in date_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            start, end = match.group(1), match.group(2)

            # ``\w+`` accepts any word before a year ("engineer 2019"), so
            # only keep candidates whose word really is a month name.
            # Rejected candidates do not claim their span, which lets the
            # year-only pattern still pick up the range.
            if needs_month:
                end_ok = end.lower() in open_ended or _is_month_year(end)
                if not (_is_month_year(start) and end_ok):
                    continue

            # Skip text already reported by a more specific pattern, e.g.
            # the "2019 - present" inside "jan 2019 - present".
            lo, hi = match.span()
            if any(lo < c_hi and c_lo < hi for c_lo, c_hi in claimed_spans):
                continue

            claimed_spans.append((lo, hi))
            dates.append((start, end))
    return dates
def parse_date(date_str):
    """Parse a date string in one of the supported resume formats.

    Supported formats are ``YYYY/MM``, ``<full month name> YYYY``,
    ``<abbreviated month name> YYYY`` and ``YYYY``. Whitespace around the
    value is ignored.

    Args:
        date_str: The date text to parse. A falsy value (``None``, ``""``)
            or the words ``present`` / ``current`` (any case) mean "now".

    Returns:
        A naive ``datetime`` for the parsed date (day, and month when absent,
        default to 1), the current local time for falsy / present / current
        input, or ``None`` when the value is not a string or matches none of
        the supported formats.
    """
    if not date_str:
        return datetime.now()
    # A non-string cannot match any format; report it as unparseable rather
    # than letting the ``.strip()`` / ``.lower()`` calls below raise.
    if not isinstance(date_str, str):
        return None

    cleaned = date_str.strip()
    if cleaned.lower() in ('present', 'current'):
        return datetime.now()

    # Try each format in turn; the first one that parses wins.
    for fmt in ('%Y/%m', '%B %Y', '%b %Y', '%Y'):
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    return None