File size: 3,485 Bytes
4dbd292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e2ee1c
 
b0185e1
7e2ee1c
 
 
 
 
 
4dbd292
 
 
 
 
 
 
b0185e1
4dbd292
 
b0185e1
4dbd292
 
 
 
 
 
 
 
 
 
 
b0185e1
4dbd292
 
 
 
b0185e1
4dbd292
 
 
 
 
 
b0185e1
4dbd292
 
 
 
 
 
 
 
 
 
 
b0185e1
4dbd292
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from datetime import datetime
import re

# Matches a single tag. ``[^>]*`` (unlike ``.*?``) also matches newlines, so
# tags whose attributes wrap across several lines are removed as well.
_HTML_TAG_RE = re.compile(r'<[^>]*>')

def clean_html(raw_html):
    """Remove HTML tags from a string.

    Args:
        raw_html: The text to clean. Anything that is not a non-empty
            ``str`` (``None``, numbers, ``""`` ...) is treated as empty.

    Returns:
        str: ``raw_html`` with every ``<...>`` tag removed and leading and
        trailing whitespace stripped, or ``""`` for unusable input.
        Character entities such as ``&amp;`` are left untouched.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ""
    return _HTML_TAG_RE.sub('', raw_html).strip()

def _format_timestamp(value):
    """Return a Unix timestamp as 'YYYY-MM-DD' (local time), or 'N/A' if unusable."""
    try:
        timestamp = int(value)
        if timestamp > 0:
            return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError, OSError):
        # Non-numeric, missing (None) or out-of-range values are treated as
        # "unknown" rather than aborting normalization of the whole job.
        pass
    return 'N/A'


def _title_based_id(prefix, job_data):
    """Build the '<prefix>_<title with underscores>' id used by the stub sources."""
    title = job_data.get('title') or ''
    return f"{prefix}_{str(title).replace(' ', '_')}"


def _nested_country(job_data, key):
    """Return job_data[key]['country'], or 'Remote' when it is not available."""
    details = job_data.get(key)
    if isinstance(details, dict):
        return details.get('country', 'Remote')
    return 'Remote'


def normalize_job_data(job_data, source):
    """
    Unifies job data from different sources into a common schema.

    Every branch returns the same eight keys: ``id``, ``title``, ``company``,
    ``location``, ``description``, ``url``, ``date_posted`` and ``source``.
    Descriptions are passed through ``clean_html``. Missing fields fall back
    to ``'N/A'``, ``'Remote'`` or ``''`` depending on the key.

    Args:
        job_data (dict): The raw job data from an API.
        source (str): The name of the job source. ``"RemoteOK"``,
            ``"LinkedIn (Stub)"``, ``"Upwork (Stub)"`` and
            ``"Freelancer (Stub)"`` have dedicated mappings; any other value
            uses a generic mapping with ``date_posted`` set to ``'N/A'``.

    Returns:
        dict: A dictionary with the normalized job data.
    """
    if source == "RemoteOK":
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('position', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': _format_timestamp(job_data.get('date', 0)),
            'source': source
        }

    elif source == "LinkedIn (Stub)":
        return {
            'id': _title_based_id('linkedin', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company_name', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('job_url', ''),
            'date_posted': job_data.get('posted_at', 'N/A'),
            'source': source
        }

    elif source == "Upwork (Stub)":
        published_on = job_data.get('published_on', 'N/A')
        if isinstance(published_on, str):
            # Keep only the date part of an ISO-style 'YYYY-MM-DDTHH:MM:SS' value.
            date_posted = published_on.split('T')[0]
        else:
            # Present but null / non-string: report it as unknown instead of crashing.
            date_posted = 'N/A'
        return {
            'id': _title_based_id('upwork', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Upwork Client',
            'location': _nested_country(job_data, 'client'),
            'description': clean_html(job_data.get('snippet', '')),
            'url': job_data.get('url', ''),
            'date_posted': date_posted,
            'source': source
        }

    elif source == "Freelancer (Stub)":
        return {
            'id': _title_based_id('freelancer', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Freelancer Client',
            'location': _nested_country(job_data, 'job_details'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('link', ''),
            # Same guarded conversion as RemoteOK: a missing or invalid
            # 'submitdate' yields 'N/A' instead of the epoch date or an exception.
            'date_posted': _format_timestamp(job_data.get('submitdate', 0)),
            'source': source
        }

    else:
        # Generic fallback
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': 'N/A',
            'source': source
        }