Spaces:
Sleeping
Sleeping
| from datetime import datetime | |
| import re | |
def clean_html(raw_html):
    """Remove HTML tags from a string.

    Args:
        raw_html: The text to clean. Anything that is not a non-empty
            ``str`` (``None``, numbers, empty string, ...) is treated as
            "no content".

    Returns:
        str: ``raw_html`` with every ``<...>`` span removed and leading and
        trailing whitespace stripped, or ``""`` when there is no content.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ""
    # `[^>]*` (rather than `.*?`) also crosses newlines, so a tag whose
    # attributes wrap onto a second line is stripped like any other tag.
    without_tags = re.sub(r'<[^>]*>', '', raw_html)
    return without_tags.strip()
def _format_epoch_date(raw_timestamp):
    """Format a Unix timestamp as 'YYYY-MM-DD', or return 'N/A' if unusable."""
    try:
        timestamp = int(raw_timestamp)
        if timestamp > 0:
            return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError, OSError):
        # Non-numeric, missing (None) or out-of-range values fall through.
        pass
    return 'N/A'


def _nested_country(job_data, key):
    """Return ``job_data[key]['country']``, or 'Remote' when it is unavailable."""
    details = job_data.get(key)
    if isinstance(details, dict):
        return details.get('country', 'Remote')
    return 'Remote'


def _title_slug(job_data):
    """Return the job title with spaces replaced by underscores ('' if absent)."""
    return str(job_data.get('title') or '').replace(' ', '_')


def normalize_job_data(job_data, source):
    """
    Unifies job data from different sources into a common schema.

    Args:
        job_data (dict): The raw job data from an API.
        source (str): The name of the job source. "RemoteOK",
            "LinkedIn (Stub)", "Upwork (Stub)" and "Freelancer (Stub)" have
            dedicated field mappings; any other value uses a generic mapping.

    Returns:
        dict: A dictionary with the keys 'id', 'title', 'company',
        'location', 'description', 'url', 'date_posted' and 'source'.
        Missing fields fall back to 'N/A', 'Remote' or '' depending on the
        key; 'description' has its HTML tags stripped; 'date_posted' is
        'N/A' whenever the source value is missing or cannot be interpreted.
    """
    if source == "RemoteOK":
        # NOTE(review): 'date' is interpreted as an integer epoch; any
        # non-numeric value yields 'N/A' - confirm against the API payload.
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('position', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': _format_epoch_date(job_data.get('date', 0)),
            'source': source,
        }

    if source == "LinkedIn (Stub)":
        return {
            # No id is read from the payload here; one is derived from the title.
            'id': f"linkedin_{_title_slug(job_data)}",
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company_name', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('job_url', ''),
            'date_posted': job_data.get('posted_at', 'N/A'),
            'source': source,
        }

    if source == "Upwork (Stub)":
        published_on = job_data.get('published_on', 'N/A')
        if isinstance(published_on, str):
            # Keep only the part before the first 'T'.
            date_posted = published_on.split('T')[0]
        else:
            date_posted = 'N/A'
        return {
            'id': f"upwork_{_title_slug(job_data)}",
            'title': job_data.get('title', 'N/A'),
            'company': 'Upwork Client',
            'location': _nested_country(job_data, 'client'),
            'description': clean_html(job_data.get('snippet', '')),
            'url': job_data.get('url', ''),
            'date_posted': date_posted,
            'source': source,
        }

    if source == "Freelancer (Stub)":
        return {
            'id': f"freelancer_{_title_slug(job_data)}",
            'title': job_data.get('title', 'N/A'),
            'company': 'Freelancer Client',
            'location': _nested_country(job_data, 'job_details'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('link', ''),
            # Guarded like the RemoteOK date: a missing or malformed
            # 'submitdate' gives 'N/A' rather than an epoch-zero date or an
            # exception.
            'date_posted': _format_epoch_date(job_data.get('submitdate', 0)),
            'source': source,
        }

    # Generic fallback
    return {
        'id': job_data.get('id', ''),
        'title': job_data.get('title', 'N/A'),
        'company': job_data.get('company', 'N/A'),
        'location': job_data.get('location', 'Remote'),
        'description': clean_html(job_data.get('description', '')),
        'url': job_data.get('url', ''),
        'date_posted': 'N/A',
        'source': source,
    }