Spaces:
Sleeping
Sleeping
File size: 3,485 Bytes
4dbd292 7e2ee1c b0185e1 7e2ee1c 4dbd292 b0185e1 4dbd292 b0185e1 4dbd292 b0185e1 4dbd292 b0185e1 4dbd292 b0185e1 4dbd292 b0185e1 4dbd292 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
from datetime import datetime
import re
# Matches one HTML tag. "[^>]" is used instead of "." because "." does not
# match newlines by default, which would leave tags whose attributes wrap
# across several lines in the output.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def clean_html(raw_html):
    """Remove HTML tags from a string.

    Args:
        raw_html: The text to clean. Any falsy or non-``str`` value is
            treated as "no text".

    Returns:
        str: ``raw_html`` with every ``<...>`` span removed (including tags
        that span multiple lines) and surrounding whitespace stripped, or
        ``""`` when the input is empty or not a string.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ""
    return _HTML_TAG_RE.sub('', raw_html).strip()
def _format_epoch_date(value):
    """Format a POSIX timestamp as 'YYYY-MM-DD' (local time), or 'N/A' if unusable."""
    try:
        timestamp = int(value)
        if timestamp > 0:
            return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError, OSError):
        # Missing, non-numeric or out-of-range timestamps are deliberately
        # reported as "unknown" instead of aborting the normalization.
        pass
    return 'N/A'


def _stub_id(prefix, title):
    """Build the synthetic '<prefix>_<title_with_underscores>' id for stub sources."""
    text = '' if title is None else str(title)
    return f"{prefix}_{text.replace(' ', '_')}"


def _nested_value(job_data, outer_key, inner_key, default):
    """Read job_data[outer_key][inner_key], falling back to default if either level is absent."""
    nested = job_data.get(outer_key)
    if isinstance(nested, dict):
        return nested.get(inner_key, default)
    return default


def normalize_job_data(job_data, source):
    """
    Unifies job data from different sources into a common schema.

    Recognized sources are "RemoteOK", "LinkedIn (Stub)", "Upwork (Stub)" and
    "Freelancer (Stub)"; any other value uses a generic field mapping.

    Args:
        job_data (dict): The raw job data from an API.
        source (str): The name of the job source (e.g., "RemoteOK", "LinkedIn (Stub)").

    Returns:
        dict: A dictionary with the keys 'id', 'title', 'company', 'location',
        'description' (HTML tags removed), 'url', 'date_posted' and 'source'.
        'date_posted' is 'N/A' whenever the source provides no usable date;
        numeric timestamps are rendered as 'YYYY-MM-DD' in local time.
    """
    if source == "RemoteOK":
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('position', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': _format_epoch_date(job_data.get('date', 0)),
            'source': source
        }
    elif source == "LinkedIn (Stub)":
        return {
            'id': _stub_id('linkedin', job_data.get('title', '')),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company_name', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('job_url', ''),
            'date_posted': job_data.get('posted_at', 'N/A'),
            'source': source
        }
    elif source == "Upwork (Stub)":
        # Keep only the part before 'T'; a missing or non-string value must
        # not crash on .split().
        published_on = job_data.get('published_on', 'N/A')
        if isinstance(published_on, str):
            date_posted = published_on.split('T')[0]
        else:
            date_posted = 'N/A'
        return {
            'id': _stub_id('upwork', job_data.get('title', '')),
            'title': job_data.get('title', 'N/A'),
            'company': 'Upwork Client',
            'location': _nested_value(job_data, 'client', 'country', 'Remote'),
            'description': clean_html(job_data.get('snippet', '')),
            'url': job_data.get('url', ''),
            'date_posted': date_posted,
            'source': source
        }
    elif source == "Freelancer (Stub)":
        return {
            'id': _stub_id('freelancer', job_data.get('title', '')),
            'title': job_data.get('title', 'N/A'),
            'company': 'Freelancer Client',
            'location': _nested_value(job_data, 'job_details', 'country', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('link', ''),
            # Same guarded conversion as RemoteOK: a missing or invalid
            # 'submitdate' yields 'N/A' rather than the epoch date or an error.
            'date_posted': _format_epoch_date(job_data.get('submitdate', 0)),
            'source': source
        }
    else:
        # Generic fallback
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': 'N/A',
            'source': source
        }
|