Spaces:
Sleeping
Sleeping
Create normalizer.py
Browse files- backend/agents/normalizer.py +78 -0
backend/agents/normalizer.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datetime import datetime
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
# Compiled once at import time so repeated calls don't pay for pattern setup;
# non-greedy so "<b>x</b>" strips each tag individually, not the whole span.
_TAG_RE = re.compile(r"<.*?>")


def clean_html(raw_html):
    """Strip HTML tags from a string and trim surrounding whitespace.

    Args:
        raw_html: The text to clean. Non-string or falsy values are
            tolerated rather than raising.

    Returns:
        str: The input with every ``<...>`` tag removed and leading/trailing
        whitespace stripped; ``""`` for empty or non-string input.

    NOTE: this is a lightweight regex strip, not a real HTML parser — it does
    not decode entities (``&amp;``) and can be confused by a literal ``<``
    inside attribute values. Sufficient for short job descriptions.
    """
    if not raw_html or not isinstance(raw_html, str):
        return ""
    return _TAG_RE.sub("", raw_html).strip()
|
| 11 |
+
|
| 12 |
+
def _stub_id(prefix, job_data):
    """Synthesize a stable id for stub sources that lack a native job id."""
    return f"{prefix}_{(job_data.get('title') or '').replace(' ', '_')}"


def _epoch_to_date(ts):
    """Format a Unix timestamp as 'YYYY-MM-DD'.

    Returns 'N/A' when *ts* is missing or not a valid timestamp instead of
    raising — upstream APIs sometimes omit or null these fields, and the old
    default of 0 rendered a misleading epoch date ('1970-01-01').
    """
    try:
        return datetime.fromtimestamp(float(ts)).strftime('%Y-%m-%d')
    except (TypeError, ValueError, OverflowError, OSError):
        return 'N/A'


def _iso_date_part(value):
    """Return the date portion of an ISO-8601-ish string ('2024-01-02T...').

    Returns 'N/A' for None or non-string values (previously a present-but-None
    'published_on' raised AttributeError on .split).
    """
    if isinstance(value, str) and value:
        return value.split('T')[0]
    return 'N/A'


def normalize_job_data(job_data, source):
    """
    Unifies job data from different sources into a common schema.

    Args:
        job_data (dict): The raw job data from an API.
        source (str): The name of the job source (e.g., "RemoteOK", "LinkedIn").

    Returns:
        dict: Normalized job data with the keys 'id', 'title', 'company',
            'location', 'description', 'url', 'date_posted', 'source'.
            Unknown sources fall through to a generic best-effort mapping.
    """
    if source == "RemoteOK":
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('position', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            # RemoteOK supplies a Unix timestamp; missing/invalid -> 'N/A'.
            'date_posted': _epoch_to_date(job_data.get('date')),
            'source': source,
        }
    elif source == "LinkedIn (Stub)":
        return {
            'id': _stub_id('linkedin', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company_name', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('job_url', ''),
            # `or` also maps an explicit None/'' to 'N/A', not just a missing key.
            'date_posted': job_data.get('posted_at') or 'N/A',
            'source': source,
        }
    elif source == "Upwork (Stub)":
        return {
            'id': _stub_id('upwork', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Upwork Client',  # Upwork is client-based
            # `or {}` guards an explicit None client, which .get(..., {}) does not.
            'location': (job_data.get('client') or {}).get('country', 'Remote'),
            'description': clean_html(job_data.get('snippet', '')),
            'url': job_data.get('url', ''),
            'date_posted': _iso_date_part(job_data.get('published_on')),
            'source': source,
        }
    elif source == "Freelancer (Stub)":
        return {
            'id': _stub_id('freelancer', job_data),
            'title': job_data.get('title', 'N/A'),
            'company': 'Freelancer Client',
            'location': (job_data.get('job_details') or {}).get('country', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('link', ''),
            'date_posted': _epoch_to_date(job_data.get('submitdate')),
            'source': source,
        }
    else:
        # Generic fallback for sources without a dedicated mapping.
        return {
            'id': job_data.get('id', ''),
            'title': job_data.get('title', 'N/A'),
            'company': job_data.get('company', 'N/A'),
            'location': job_data.get('location', 'Remote'),
            'description': clean_html(job_data.get('description', '')),
            'url': job_data.get('url', ''),
            'date_posted': 'N/A',
            'source': source,
        }