Spaces:
Sleeping
Sleeping
Create normalizer.py
Browse files- backend/agents/normalizer.py +78 -0
backend/agents/normalizer.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Compiled once at import time so repeated calls don't pay for pattern setup;
# non-greedy so "<b>x</b>" strips each tag individually, not the whole span.
_TAG_RE = re.compile(r"<.*?>")


def clean_html(raw_html):
    """Strip HTML tags from a string and trim surrounding whitespace.

    Args:
        raw_html: The text to clean. Non-string or falsy values are
            tolerated rather than raising.

    Returns:
        str: The input with every ``<...>`` tag removed and leading/trailing
        whitespace stripped; ``""`` for empty or non-string input.

    NOTE: this is a lightweight regex strip, not a real HTML parser — it does
    not decode entities (``&amp;``) and can be confused by a literal ``<``
    inside attribute values. Sufficient for short job descriptions.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ""
    return _TAG_RE.sub("", raw_html).strip()
|
| 11 |
+
|
| 12 |
+
def _stub_id(prefix, job_data):
    """Synthesize a stable id for stub sources that lack a native job id."""
    return f"{prefix}_{(job_data.get('title') or '').replace(' ', '_')}"


def _epoch_to_date(ts):
    """Format a Unix timestamp as 'YYYY-MM-DD'.

    Returns 'N/A' when *ts* is missing or not a valid timestamp instead of
    raising — upstream APIs sometimes omit or null these fields, and the old
    default of 0 rendered a misleading epoch date ('1970-01-01').
    """
    try:
        return datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d')
    except (TypeError, ValueError, OverflowError, OSError):
        return 'N/A'


def _iso_date_part(value):
    """Return the date portion of an ISO-8601-ish string ('2024-01-02T...').

    Returns 'N/A' for None or non-string values (previously a present-but-None
    'published_on' raised AttributeError on .split).
    """
    if isinstance(value, str) and value:
        return value.split('T')[0]
    return 'N/A'


def normalize_job_data(job_data, source):
    """
    Unifies job data from different sources into a common schema.

    Args:
        job_data (dict): The raw job data from an API.
        source (str): The name of the job source (e.g., "RemoteOK", "LinkedIn").

    Returns:
        dict: Normalized job data with the keys 'id', 'title', 'company',
            'location', 'description', 'url', 'date_posted', 'source'.
            Unknown sources fall through to a generic best-effort mapping.
    """
    if source == "RemoteOK":
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('position', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            # RemoteOK supplies a Unix timestamp; missing/invalid -> 'N/A'.
            'date_posted': _epoch_to_date(job_data.get('date')),
            'source': source,
        }
    elif source == "LinkedIn (Stub)":
        return {
            'id': _stub_id('linkedin', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company_name', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('job_url', ''),
            # `or` also maps an explicit None/'' to 'N/A', not just a missing key.
            'date_posted': job_data.get('posted_at') or 'N/A',
            'source': source,
        }
    elif source == "Upwork (Stub)":
        return {
            'id': _stub_id('upwork', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Upwork Client',  # Upwork is client-based
            # `or {}` guards an explicit None client, which .get(..., {}) does not.
            'location': (job_data.get('client') or {}).get('country', 'Remote'),
            'description': clean_html(job_data.get('snippet', '')),
            'url': job_data.get('url', ''),
            'date_posted': _iso_date_part(job_data.get('published_on')),
            'source': source,
        }
    elif source == "Freelancer (Stub)":
        return {
            'id': _stub_id('freelancer', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Freelancer Client',
            'location': (job_data.get('job_details') or {}).get('country', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('link', ''),
            'date_posted': _epoch_to_date(job_data.get('submitdate')),
            'source': source,
        }
    else:
        # Generic fallback for sources without a dedicated mapping.
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': 'N/A',
            'source': source,
        }