Spaces:
Sleeping
Sleeping
Update backend/agents/normalizer.py
Browse files
backend/agents/normalizer.py
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
from datetime import datetime
|
| 2 |
-
from .normalizer import normalize_job_data
|
| 3 |
-
|
| 4 |
import re
|
| 5 |
|
| 6 |
def clean_html(raw_html):
|
|
@@ -25,12 +23,11 @@ def normalize_job_data(job_data, source):
|
|
| 25 |
if source == "RemoteOK":
|
| 26 |
date_posted_str = 'N/A'
|
| 27 |
try:
|
| 28 |
-
#
|
| 29 |
timestamp = int(job_data.get('date', 0))
|
| 30 |
if timestamp > 0:
|
| 31 |
date_posted_str = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
|
| 32 |
except (ValueError, TypeError, OSError):
|
| 33 |
-
# If the date is not a valid number or causes an error, it will default to 'N/A'.
|
| 34 |
pass
|
| 35 |
|
| 36 |
return {
|
|
@@ -40,9 +37,10 @@ def normalize_job_data(job_data, source):
|
|
| 40 |
'location': job_data.get('location', 'Remote'),
|
| 41 |
'description': clean_html(job_data.get('description', '')),
|
| 42 |
'url': job_data.get('url', ''),
|
| 43 |
-
'date_posted': date_posted_str,
|
| 44 |
'source': source
|
| 45 |
}
|
|
|
|
| 46 |
elif source == "LinkedIn (Stub)":
|
| 47 |
return {
|
| 48 |
'id': f"linkedin_{job_data.get('title', '').replace(' ', '_')}",
|
|
@@ -54,17 +52,19 @@ def normalize_job_data(job_data, source):
|
|
| 54 |
'date_posted': job_data.get('posted_at', 'N/A'),
|
| 55 |
'source': source
|
| 56 |
}
|
|
|
|
| 57 |
elif source == "Upwork (Stub)":
|
| 58 |
return {
|
| 59 |
'id': f"upwork_{job_data.get('title', '').replace(' ', '_')}",
|
| 60 |
'title': job_data.get('title', 'N/A'),
|
| 61 |
-
'company': 'Upwork Client',
|
| 62 |
'location': job_data.get('client', {}).get('country', 'Remote'),
|
| 63 |
'description': clean_html(job_data.get('snippet', '')),
|
| 64 |
'url': job_data.get('url', ''),
|
| 65 |
'date_posted': job_data.get('published_on', 'N/A').split('T')[0],
|
| 66 |
'source': source
|
| 67 |
}
|
|
|
|
| 68 |
elif source == "Freelancer (Stub)":
|
| 69 |
return {
|
| 70 |
'id': f"freelancer_{job_data.get('title', '').replace(' ', '_')}",
|
|
@@ -76,6 +76,7 @@ def normalize_job_data(job_data, source):
|
|
| 76 |
'date_posted': datetime.fromtimestamp(job_data.get('submitdate', 0)).strftime('%Y-%m-%d'),
|
| 77 |
'source': source
|
| 78 |
}
|
|
|
|
| 79 |
else:
|
| 80 |
# Generic fallback
|
| 81 |
return {
|
|
@@ -88,4 +89,3 @@ def normalize_job_data(job_data, source):
|
|
| 88 |
'date_posted': 'N/A',
|
| 89 |
'source': source
|
| 90 |
}
|
| 91 |
-
|
|
|
|
| 1 |
from datetime import datetime
|
|
|
|
|
|
|
| 2 |
import re
|
| 3 |
|
| 4 |
def clean_html(raw_html):
|
|
|
|
| 23 |
if source == "RemoteOK":
|
| 24 |
date_posted_str = 'N/A'
|
| 25 |
try:
|
| 26 |
+
# Safely convert the Unix timestamp; a missing, non-numeric, or out-of-range value leaves date_posted_str as 'N/A'
|
| 27 |
timestamp = int(job_data.get('date', 0))
|
| 28 |
if timestamp > 0:
|
| 29 |
date_posted_str = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
|
| 30 |
except (ValueError, TypeError, OSError):
|
|
|
|
| 31 |
pass
|
| 32 |
|
| 33 |
return {
|
|
|
|
| 37 |
'location': job_data.get('location', 'Remote'),
|
| 38 |
'description': clean_html(job_data.get('description', '')),
|
| 39 |
'url': job_data.get('url', ''),
|
| 40 |
+
'date_posted': date_posted_str,
|
| 41 |
'source': source
|
| 42 |
}
|
| 43 |
+
|
| 44 |
elif source == "LinkedIn (Stub)":
|
| 45 |
return {
|
| 46 |
'id': f"linkedin_{job_data.get('title', '').replace(' ', '_')}",
|
|
|
|
| 52 |
'date_posted': job_data.get('posted_at', 'N/A'),
|
| 53 |
'source': source
|
| 54 |
}
|
| 55 |
+
|
| 56 |
elif source == "Upwork (Stub)":
|
| 57 |
return {
|
| 58 |
'id': f"upwork_{job_data.get('title', '').replace(' ', '_')}",
|
| 59 |
'title': job_data.get('title', 'N/A'),
|
| 60 |
+
'company': 'Upwork Client',
|
| 61 |
'location': job_data.get('client', {}).get('country', 'Remote'),
|
| 62 |
'description': clean_html(job_data.get('snippet', '')),
|
| 63 |
'url': job_data.get('url', ''),
|
| 64 |
'date_posted': job_data.get('published_on', 'N/A').split('T')[0],
|
| 65 |
'source': source
|
| 66 |
}
|
| 67 |
+
|
| 68 |
elif source == "Freelancer (Stub)":
|
| 69 |
return {
|
| 70 |
'id': f"freelancer_{job_data.get('title', '').replace(' ', '_')}",
|
|
|
|
| 76 |
'date_posted': datetime.fromtimestamp(job_data.get('submitdate', 0)).strftime('%Y-%m-%d'),
|
| 77 |
'source': source
|
| 78 |
}
|
| 79 |
+
|
| 80 |
else:
|
| 81 |
# Generic fallback for any source not matched by the branches above
|
| 82 |
return {
|
|
|
|
| 89 |
'date_posted': 'N/A',
|
| 90 |
'source': source
|
| 91 |
}
|
|
|