"""Aggregate AI-training job platforms from a TechCrunch RSS scout and a curated list.

Writes the deduplicated result to ``training_data.json`` (and, when present,
to the frontend's ``web/public/training_data.json``).
"""

import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime

# Configuration
USER_AGENT = "Firstify AI Tracker (contact@example.com)"
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"

# AI Specific Keywords — matched case-insensitively against title/summary.
AI_KEYWORDS = ["ai training", "human feedback", "rlhf", "llm evaluation",
               "data annotation", "ai workforce", "turing", "prolific",
               "annotation"]

HEADERS = {'User-Agent': USER_AGENT}

# Seconds to wait for the RSS endpoint before giving up (was missing: an
# unresponsive server previously hung the script indefinitely).
REQUEST_TIMEOUT = 30


def slugify(text):
    """Return a URL-safe slug: lowercase, runs of non-alphanumerics -> '-'."""
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', '-', text)
    return text.strip('-')


def fetch_ai_training_from_news():
    """Scrape the TechCrunch startups RSS feed for AI-training-related entries.

    Returns a list of platform dicts (possibly empty). Network failures are
    treated as best-effort: an unreachable or non-200 feed yields [] so the
    curated list can still be published.
    """
    print("Scouting AI Training platforms from news...")
    try:
        response = requests.get(TC_RSS_URL, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Best-effort source: log and fall back to the curated list only.
        print(f"News fetch failed: {exc}")
        return []
    if response.status_code != 200:
        return []

    feed = feedparser.parse(response.text)
    platforms = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        # `summary` is optional on feed entries; default to "" when absent.
        summary = entry.summary if hasattr(entry, 'summary') else ""
        summary_lower = summary.lower()
        if any(keyword in title_lower or keyword in summary_lower
               for keyword in AI_KEYWORDS):
            # Extract company name (heuristic): strip common funding phrasing.
            company_name = (entry.title.split(" raises ")[0]
                            .split(" funding")[0]
                            .split(":")[0])
            platforms.append({
                "src": "AI Scout",
                "company_name": company_name,
                "slug": slugify(company_name),
                "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
                "link": entry.link,
                # Use the guarded `summary` (the original re-read
                # entry.summary here and could raise AttributeError).
                "summary": BeautifulSoup(summary, 'html.parser').text[:500] + "...",
                "type": "AI Training",
                "funding_amount": "N/A",
                "founders": []
            })
    return platforms


def get_curated_platforms():
    """Return the hand-maintained list of AI-training platforms as platform dicts."""
    # Focus: Platforms where people make money providing data for AI training
    curated = [
        {"name": "DataAnnotation.tech", "desc": "Highest paying AI training platform. Direct access to core tasks.", "url": "https://www.dataannotation.tech/"},
        {"name": "Outlier.ai", "desc": "Scale AI's platform for expert RLHF and data training.", "url": "https://app.outlier.ai/en/expert/opportunities"},
        {"name": "Remotasks", "desc": "Classic micro-tasking for AI. Direct sign-up for workers.", "url": "https://www.remotasks.com/en/signup"},
        {"name": "Prolific", "desc": "Best for researchers and high-quality data providers.", "url": "https://www.prolific.com/participants"},
        {"name": "Clickworker", "desc": "Join the Clickworker crowd for AI and data processing.", "url": "https://www.clickworker.com/clickworker/"},
        {"name": "Appen", "desc": "Flexible AI training jobs for the world's leading tech labs.", "url": "https://appen.com/jobs/"},
        {"name": "Telus Digital", "desc": "Massive AI community for data collection and evaluation.", "url": "https://jobs.telusdigital.com/en_US/careers/aicommunity?intcmp=telus-digital-site"},
        {"name": "Toloka", "desc": "Global crowd-force for AI training and RLHF.", "url": "https://toloka.ai/careers#job-list"},
        {"name": "Mindrift", "desc": "Generate high-quality data for AI as an expert freelancer.", "url": "https://mindrift.ai/apply"},
        {"name": "Turing", "desc": "Largest network for AI training and specialized engineering jobs.", "url": "https://www.turing.com/jobs"},
        {"name": "OneForma", "desc": "Centific's platform for AI data and language projects.", "url": "https://www.oneforma.com/jobs/"},
        {"name": "Neevo", "desc": "Defined.ai's platform for earning money through tasking.", "url": "https://neevo.defined.ai/"},
        {"name": "CloudFactory", "desc": "Meaningful work in data labeling and AI training.", "url": "https://www.cloudfactory.com/cloudworker-details"},
        {"name": "Micro1", "desc": "Join an elite network of AI-vetted remote talent.", "url": "https://www.micro1.ai/jobs"},
        {"name": "Rapidata", "desc": "High-speed human feedback tasks. Instant sign-up portal.", "url": "https://rapidata.ai/"},
        {"name": "Botpool", "desc": "AI training and data annotation marketplace.", "url": "https://www.botpool.ai/find-jobs"},
        {"name": "Amazon MTurk", "desc": "The original marketplace for human intelligence (HITs).", "url": "https://www.mturk.com/get-started"},
        {"name": "Invisible Technologies", "desc": "Work as an advanced AI training operator.", "url": "https://invisibletech.ai/join-us#open-roles"},
        {"name": "Sama", "desc": "Impact-driven data labeling and AI training roles.", "url": "https://www.sama.com/ai101"},
        {"name": "Mercor", "desc": "AI-driven recruiting for top-tier training and dev roles.", "url": "https://mercor.com/"},
        {"name": "Rex.zone", "desc": "Curated RLHF and data services for training experts.", "url": "https://www.rex.zone/open-opportunities"},
    ]
    platforms = []
    for p in curated:
        platforms.append({
            "src": "Curated",
            "company_name": p['name'],
            "slug": slugify(p['name']),
            "date": datetime.now().isoformat(),
            "link": p['url'],
            "summary": p['desc'],
            "type": "AI Training Jobs",
            "funding_amount": "Active",
            "founders": []
        })
    return platforms


def main():
    """Collect news + curated platforms, deduplicate by slug, write JSON output."""
    print("Starting Newly AI Training Scouting...")
    news_leads = fetch_ai_training_from_news()
    curated_leads = get_curated_platforms()
    all_data = news_leads + curated_leads

    # Deduplicate by slug; news leads come first so they win on collision.
    seen = set()
    deduped = []
    for d in all_data:
        if d['slug'] not in seen:
            deduped.append(d)
            seen.add(d['slug'])

    # Sync to Frontend: always write locally; mirror into web/public if it exists.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public",
                                        "training_data.json")
    paths_to_save = ["training_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        # Explicit UTF-8 so output is stable across platforms.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(deduped, f, indent=4)
        print(f"Success! Aggregated {len(deduped)} AI Training platforms into {path}")


if __name__ == "__main__":
    main()