Alpha108 committed on
Commit
4dbd292
·
verified ·
1 Parent(s): b373d74

Create normalizer.py

Browse files
Files changed (1) hide show
  1. backend/agents/normalizer.py +78 -0
backend/agents/normalizer.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+ import re
3
+
4
def clean_html(raw_html):
    """Remove HTML tags from a string.

    Args:
        raw_html: Text that may contain HTML markup. Anything that is not a
            non-empty ``str`` is treated as having no content.

    Returns:
        str: The text with every ``<...>`` span removed and surrounding
        whitespace stripped, or ``""`` for empty / non-string input.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ""
    # ``[^>]*`` (unlike ``.*?`` without re.DOTALL) also crosses newlines, so
    # tags whose attributes wrap onto several lines are removed as well.
    return re.sub(r'<[^>]*>', '', raw_html).strip()
11
+
12
def _format_posted_date(value):
    """Turn an epoch timestamp (or date-like string) into ``YYYY-MM-DD``."""
    from datetime import timezone  # local import: file only imports `datetime`

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return 'N/A'
        try:
            value = float(text)
        except ValueError:
            # Not numeric: treat as an ISO-like string and keep the date part,
            # the same way the Upwork branch handles 'published_on'.
            return text.split('T')[0]
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 'N/A'
    try:
        # Convert in UTC so the resulting date does not depend on the
        # timezone of the machine running the normalizer.
        moment = datetime.fromtimestamp(value, tz=timezone.utc)
    except (OverflowError, OSError, ValueError):
        # Out-of-range or NaN timestamps.
        return 'N/A'
    return moment.strftime('%Y-%m-%d')


def _title_based_id(prefix, job_data):
    """Build ``<prefix>_<title with spaces replaced by underscores>``."""
    title = job_data.get('title', '')
    if title is None:
        title = ''
    return f"{prefix}_{str(title).replace(' ', '_')}"


def _nested_country(job_data, key):
    """Read ``job_data[key]['country']``, defaulting to ``'Remote'``."""
    nested = job_data.get(key)
    if not isinstance(nested, dict):
        # Key missing, or present but null / not a mapping.
        return 'Remote'
    return nested.get('country', 'Remote')


def normalize_job_data(job_data, source):
    """
    Unifies job data from different sources into a common schema.

    Args:
        job_data (dict): The raw job data from an API.
        source (str): The name of the job source. Recognised values are
            "RemoteOK", "LinkedIn (Stub)", "Upwork (Stub)" and
            "Freelancer (Stub)"; anything else uses the generic fallback.

    Returns:
        dict: A dictionary with the keys 'id', 'title', 'company',
        'location', 'description', 'url', 'date_posted' and 'source'.
        'date_posted' is 'N/A' when the source provides no usable date.
        For the epoch-based sources (RemoteOK, Freelancer) the date is
        computed in UTC.
    """
    if source == "RemoteOK":
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('position', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': _format_posted_date(job_data.get('date')),
            'source': source,
        }
    elif source == "LinkedIn (Stub)":
        return {
            'id': _title_based_id('linkedin', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company_name', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('job_url', ''),
            'date_posted': job_data.get('posted_at', 'N/A'),
            'source': source,
        }
    elif source == "Upwork (Stub)":
        published = job_data.get('published_on', 'N/A')
        return {
            'id': _title_based_id('upwork', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Upwork Client',  # Upwork is client-based
            'location': _nested_country(job_data, 'client'),
            'description': clean_html(job_data.get('snippet', '')),
            'url': job_data.get('url', ''),
            # Keep only the date part of an ISO-like "date T time" string;
            # a null / non-string value is reported as unknown.
            'date_posted': (
                published.split('T')[0] if isinstance(published, str) else 'N/A'
            ),
            'source': source,
        }
    elif source == "Freelancer (Stub)":
        return {
            'id': _title_based_id('freelancer', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Freelancer Client',
            'location': _nested_country(job_data, 'job_details'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('link', ''),
            'date_posted': _format_posted_date(job_data.get('submitdate')),
            'source': source,
        }
    else:
        # Generic fallback
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': 'N/A',
            'source': source,
        }