"""Helpers for sending job postings to a processing API and storing them in Notion."""

import json
import logging
import re

import pandas as pd
import requests
from notion_client import Client

# Compiled once at import time instead of on every validation call.
# The TLD part accepts 2-63 letters so that long TLDs such as ".careers"
# or ".technology" are not rejected.
_URL_PATTERN = re.compile(
    r'^https?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,63}\.?|'
    r'localhost|'
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
    r'(?::\d+)?'
    r'(?:/?|[/?]\S+)$',
    re.IGNORECASE,
)

# Notion rejects a rich-text object whose ``text.content`` is longer than
# 2000 characters and a rich-text array with more than 100 objects.
_NOTION_TEXT_LIMIT = 2000
_NOTION_MAX_TEXT_OBJECTS = 100


def is_valid_url(url):
    """Return True if *url* is a well-formed http(s) URL.

    The scheme must be a lowercase ``http://`` or ``https://``; the host may
    be a domain name, ``localhost`` or a dotted IPv4-style address, optionally
    followed by a port and a path/query. Non-string input is reported as
    invalid instead of raising.
    """
    if not isinstance(url, str):
        return False
    return url.startswith(('http://', 'https://')) and bool(_URL_PATTERN.match(url))


def process_file(file_content, file_name, api_endpoint, timeout=None):
    """Upload a file to the API and return the normalized response.

    Args:
        file_content: File payload passed to ``requests`` as the upload body.
        file_name: Name sent along with the uploaded file.
        api_endpoint: URL the file is POSTed to.
        timeout: Optional ``requests`` timeout in seconds (or a
            ``(connect, read)`` tuple). ``None`` waits indefinitely.

    Returns:
        Whatever ``handle_api_response`` extracts from the response.

    Raises:
        Exception: If the request or the response handling fails; the
            original error is attached as ``__cause__``.
    """
    try:
        response = requests.post(
            api_endpoint,
            files={"file": (file_name, file_content)},
            timeout=timeout,
        )
        return handle_api_response(response)
    except Exception as e:
        raise Exception(f"File processing failed: {str(e)}") from e


def process_url(url, api_endpoint, timeout=None):
    """Send a website URL to the API and return the normalized response.

    Args:
        url: Job posting URL; validated with ``is_valid_url`` first.
        api_endpoint: URL the JSON payload is POSTed to.
        timeout: Optional ``requests`` timeout in seconds (or a
            ``(connect, read)`` tuple). ``None`` waits indefinitely.

    Returns:
        Whatever ``handle_api_response`` extracts from the response.

    Raises:
        ValueError: If *url* is not a valid http(s) URL.
        Exception: If the request or the response handling fails; the
            original error is attached as ``__cause__``.
    """
    if not is_valid_url(url):
        raise ValueError("Invalid URL format")

    try:
        response = requests.post(
            api_endpoint,
            json={"websiteUrl": url},
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        return handle_api_response(response)
    except Exception as e:
        raise Exception(f"URL processing failed: {str(e)}") from e


def handle_api_response(response):
    """Turn an API response into parsed JSON data or plain text.

    A 200 response whose body looks like JSON (starts with ``{`` or ``[``)
    is parsed. For a JSON object, the ``value`` entry is used when present
    (otherwise the whole object); if that entry is itself a JSON-looking
    string it is parsed once more. Anything that cannot be parsed is
    returned as text.

    Raises:
        Exception: For status 500 (reported as multiple job postings) and
            for any other non-200 status.
    """
    if response.status_code == 500:
        raise Exception("Multiple job postings detected. Please use the specific job posting URL.")
    if response.status_code != 200:
        raise Exception(f"API Error: {response.status_code}")

    response_text = response.text.strip()

    # Bodies that do not look like JSON are passed through as plain text.
    if not response_text.startswith(('{', '[')):
        return response_text

    try:
        response_data = json.loads(response_text)
    except json.JSONDecodeError:
        return response_text

    if not isinstance(response_data, dict):
        return response_data

    # Support both direct payloads and payloads wrapped in a 'value' field.
    content = response_data.get('value', response_data)

    # The wrapped value may itself be a JSON document serialized as a string.
    if isinstance(content, str) and content.strip().startswith(('{', '[')):
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            return content
    return content


def display_job_data(job_data):
    """Convert job data into a form suitable for display.

    Returns:
        A one-row ``DataFrame`` for a dict, the tuple ``('html', text)`` for
        a string containing both ``<`` and ``>``, the string itself for any
        other string, and ``str(job_data)`` for every other type.
    """
    if isinstance(job_data, dict):
        return pd.DataFrame([job_data])

    if isinstance(job_data, str):
        # The tuple form tells the caller that the text is HTML content.
        if '<' in job_data and '>' in job_data:
            return ('html', job_data)
        return job_data

    return str(job_data)


def process_text(text, api_endpoint, timeout=None):
    """Send raw text to the API and return the normalized response.

    Args:
        text: Text sent under the ``text`` key of the JSON payload.
        api_endpoint: URL the JSON payload is POSTed to.
        timeout: Optional ``requests`` timeout in seconds (or a
            ``(connect, read)`` tuple). ``None`` waits indefinitely.

    Returns:
        Whatever ``handle_api_response`` extracts from the response.

    Raises:
        Exception: If the request or the response handling fails; the
            original error is attached as ``__cause__``.
    """
    try:
        response = requests.post(
            api_endpoint,
            json={"text": text},
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
        return handle_api_response(response)
    except Exception as e:
        raise Exception(f"Text processing failed: {str(e)}") from e


def _is_blank(value):
    """Return True if *value* should count as a missing field value."""
    if pd.api.types.is_scalar(value):
        return bool(pd.isna(value)) or not str(value).strip()

    # pd.isna() returns an array for list-likes, which cannot be used in a
    # boolean context; treat a container as blank when it is empty or when
    # every element is blank.
    try:
        items = list(value)
    except TypeError:
        return not str(value).strip()
    return all(_is_blank(item) for item in items)


def validate_job_fields(data):
    """Validate required fields in job data.

    Args:
        data: Mapping-like object exposing ``get`` (e.g. a dict).

    Returns:
        tuple: ``(True, None)`` when every required field has a non-blank
        value, otherwise ``(False, error_message)`` listing the display
        names of the missing fields.
    """
    required_fields = {
        'Company': 'Empresa',
        'Job Title': 'Título del puesto',
        'apply_Url': 'URL de aplicación',
        'Remote': 'Modalidad',
    }

    missing = [
        display_name
        for field, display_name in required_fields.items()
        if _is_blank(data.get(field))
    ]

    if missing:
        return (False, f"Campos obligatorios faltantes: {', '.join(missing)}")
    return (True, None)


def _notion_rich_text(value):
    """Build a Notion rich-text array for *value*, split to fit API limits."""
    text = str(value)
    chunks = [
        text[start:start + _NOTION_TEXT_LIMIT]
        for start in range(0, len(text), _NOTION_TEXT_LIMIT)
    ] or [""]  # keep a single empty text object for empty input
    return [{"text": {"content": chunk}} for chunk in chunks[:_NOTION_MAX_TEXT_OBJECTS]]


def send_to_notion(data, database_id, token):
    """Create a Notion page for a job posting after validating it.

    Args:
        data: Mapping-like job data; must contain non-blank ``Company``,
            ``Job Title``, ``apply_Url`` and ``Remote`` values. Optional
            keys read are ``Description``, ``Location`` and ``file_Url``.
        database_id: ID of the Notion database that receives the page.
        token: Notion integration token used for authentication.

    Returns:
        The ``url`` entry of the page object returned by Notion.

    Raises:
        ValueError: If a required field is missing.
        Exception: If the Notion client call fails; the original error is
            logged and attached as ``__cause__``.
    """
    is_valid, error_msg = validate_job_fields(data)
    if not is_valid:
        raise ValueError(error_msg)

    try:
        notion = Client(auth=token)

        new_page = {
            "Role": {"title": _notion_rich_text(data["Job Title"])},
            "Startup": {"rich_text": _notion_rich_text(data["Company"])},
            "Apply URL": {"url": data["apply_Url"]},
            # Long descriptions are split so they do not exceed the
            # per-text-object length limit.
            "Summary": {"rich_text": _notion_rich_text(data.get("Description", ""))},
            "Location": {"rich_text": _notion_rich_text(data.get("Location", ""))},
            "Remote": {"select": {"name": str(data["Remote"])}},
        }

        if data.get("file_Url"):
            new_page["Original file"] = {"url": data["file_Url"]}

        created_page = notion.pages.create(
            parent={"database_id": database_id},
            properties=new_page,
        )
        return created_page["url"]

    except Exception as e:
        logging.error("Notion API error: %s", e)
        raise Exception(f"Error al enviar a Notion: {str(e)}") from e