Spaces:

Alpha108
/

MatchHive-ai

Sleeping

App Files Files Community

MatchHive-ai / backend /agents /normalizer.py

Alpha108

Update backend/agents/normalizer.py

b0185e1 verified 4 months ago

raw

history blame contribute delete

3.49 kB

	from datetime import datetime
	import re

	def clean_html(raw_html):
	    """Remove HTML tags from a string.

	    Args:
	        raw_html: Text that may contain HTML markup. Anything that is not
	            a non-empty ``str`` (``None``, numbers, ``''``) is treated as
	            "no content".

	    Returns:
	        str: The input with every ``<...>`` span removed and surrounding
	        whitespace stripped, or ``""`` when the input is empty or not a
	        string. HTML entities (e.g. ``&amp;``) are left untouched.
	    """
	    if not raw_html or not isinstance(raw_html, str):
	        return ""
	    # ``[^>]*`` (rather than ``.*?``) also matches newlines, so tags whose
	    # attributes wrap onto a following line are stripped as well.
	    without_tags = re.sub(r'<[^>]*>', '', raw_html)
	    return without_tags.strip()

	def _epoch_to_date(value):
	    """Format epoch seconds as 'YYYY-MM-DD'; return 'N/A' if unusable."""
	    try:
	        timestamp = int(value)
	        if timestamp > 0:
	            # Naive fromtimestamp() renders the date in the host's local
	            # timezone, matching the original RemoteOK behaviour.
	            return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d')
	    except (ValueError, TypeError, OverflowError, OSError):
	        # Non-numeric, None, or out-of-range timestamps fall through.
	        pass
	    return 'N/A'


	def _title_slug(job_data):
	    """Return the job title with spaces replaced by underscores ('' if absent)."""
	    return str(job_data.get('title') or '').replace(' ', '_')


	def _nested_country(job_data, key):
	    """Read ``job_data[key]['country']``, defaulting to 'Remote'."""
	    details = job_data.get(key)
	    if isinstance(details, dict):
	        return details.get('country', 'Remote')
	    return 'Remote'


	def normalize_job_data(job_data, source):
	    """
	    Unifies job data from different sources into a common schema.

	    Sources with dedicated field mappings are "RemoteOK",
	    "LinkedIn (Stub)", "Upwork (Stub)" and "Freelancer (Stub)"; any other
	    source goes through a generic mapping with ``date_posted`` set to
	    ``'N/A'``.

	    Args:
	        job_data (dict): The raw job data from an API.
	        source (str): The name of the job source (e.g., "RemoteOK").

	    Returns:
	        dict: A dictionary with the keys ``id``, ``title``, ``company``,
	        ``location``, ``description`` (HTML tags removed), ``url``,
	        ``date_posted`` and ``source``. Missing or malformed date and
	        nested fields fall back to ``'N/A'`` / ``'Remote'`` instead of
	        raising.
	    """
	    if source == "RemoteOK":
	        return {
	            'id': job_data.get('id', ''),
	            'title': job_data.get('position', 'N/A'),
	            'company': job_data.get('company', 'N/A'),
	            'location': job_data.get('location', 'Remote'),
	            'description': clean_html(job_data.get('description', '')),
	            'url': job_data.get('url', ''),
	            'date_posted': _epoch_to_date(job_data.get('date', 0)),
	            'source': source,
	        }

	    if source == "LinkedIn (Stub)":
	        return {
	            'id': f"linkedin_{_title_slug(job_data)}",
	            'title': job_data.get('title', 'N/A'),
	            'company': job_data.get('company_name', 'N/A'),
	            'location': job_data.get('location', 'Remote'),
	            'description': clean_html(job_data.get('description', '')),
	            'url': job_data.get('job_url', ''),
	            'date_posted': job_data.get('posted_at', 'N/A'),
	            'source': source,
	        }

	    if source == "Upwork (Stub)":
	        # Keep only the date part of a 'YYYY-MM-DDTHH:MM:SS'-style string;
	        # a missing or non-string value becomes 'N/A' rather than raising.
	        published = job_data.get('published_on', 'N/A')
	        if isinstance(published, str):
	            date_posted = published.split('T')[0]
	        else:
	            date_posted = 'N/A'
	        return {
	            'id': f"upwork_{_title_slug(job_data)}",
	            'title': job_data.get('title', 'N/A'),
	            'company': 'Upwork Client',
	            'location': _nested_country(job_data, 'client'),
	            'description': clean_html(job_data.get('snippet', '')),
	            'url': job_data.get('url', ''),
	            'date_posted': date_posted,
	            'source': source,
	        }

	    if source == "Freelancer (Stub)":
	        return {
	            'id': f"freelancer_{_title_slug(job_data)}",
	            'title': job_data.get('title', 'N/A'),
	            'company': 'Freelancer Client',
	            'location': _nested_country(job_data, 'job_details'),
	            'description': clean_html(job_data.get('description', '')),
	            'url': job_data.get('link', ''),
	            # Same guarded conversion as RemoteOK: a missing or invalid
	            # 'submitdate' yields 'N/A' instead of the epoch date or a crash.
	            'date_posted': _epoch_to_date(job_data.get('submitdate', 0)),
	            'source': source,
	        }

	    # Generic fallback
	    return {
	        'id': job_data.get('id', ''),
	        'title': job_data.get('title', 'N/A'),
	        'company': job_data.get('company', 'N/A'),
	        'location': job_data.get('location', 'Remote'),
	        'description': clean_html(job_data.get('description', '')),
	        'url': job_data.get('url', ''),
	        'date_posted': 'N/A',
	        'source': source,
	    }