| import feedparser |
| import json |
| import requests |
| import time |
| import os |
| import re |
| from bs4 import BeautifulSoup |
| from datetime import datetime |
|
|
| |
# User-Agent sent with every HTTP request; includes a contact address per polite-scraping convention.
USER_AGENT = "Firstify AI Tracker (contact@example.com)"
# TechCrunch "Startups" category RSS feed scanned for AI-training-related news.
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"

# Lowercase keywords matched (substring) against entry title/summary to flag AI-training stories.
AI_KEYWORDS = ["ai training", "human feedback", "rlhf", "llm evaluation", "data annotation", "ai workforce", "turing", "prolific", "annotation"]


# Shared request headers built from the User-Agent above.
HEADERS = {'User-Agent': USER_AGENT}
|
|
def slugify(text):
    """Return a URL-safe slug: lowercase, with runs of non-alphanumerics collapsed to single hyphens."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
|
|
def fetch_ai_training_from_news():
    """Scan the TechCrunch startups RSS feed for AI-training-related stories.

    Returns:
        list[dict]: one record per matching entry with the shared platform
        schema (src/company_name/slug/date/link/summary/type/funding_amount/
        founders). Returns [] on any network failure or non-200 response.
    """
    print("Scouting AI Training platforms from news...")
    try:
        # Timeout so an unresponsive server cannot hang the whole run.
        response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=30)
    except requests.RequestException:
        # Network errors degrade to "no leads", matching the non-200 path.
        return []
    if response.status_code != 200:
        return []

    feed = feedparser.parse(response.text)
    platforms = []
    for entry in feed.entries:
        title_lower = entry.title.lower()
        # Not every feed entry carries a summary; default to "" once and reuse
        # it below (the original crashed in BeautifulSoup when it was missing).
        summary = entry.summary if hasattr(entry, 'summary') else ""
        summary_lower = summary.lower()

        if any(keyword in title_lower or keyword in summary_lower for keyword in AI_KEYWORDS):
            # Crude company-name extraction from common headline patterns
            # ("Acme raises $10M", "Acme funding...", "Acme: ...").
            company_name = entry.title.split(" raises ")[0].split(" funding")[0].split(":")[0]
            platforms.append({
                "src": "AI Scout",
                "company_name": company_name,
                "slug": slugify(company_name),
                "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
                "link": entry.link,
                # Strip HTML from the summary and cap it at 500 chars.
                "summary": BeautifulSoup(summary, 'html.parser').text[:500] + "...",
                "type": "AI Training",
                "funding_amount": "N/A",
                "founders": []
            })
    return platforms
|
|
def get_curated_platforms():
    """Return the hand-maintained platform list, normalized to the shared record schema."""
    curated = [
        {"name": "DataAnnotation.tech", "desc": "Highest paying AI training platform. Direct access to core tasks.", "url": "https://www.dataannotation.tech/"},
        {"name": "Outlier.ai", "desc": "Scale AI's platform for expert RLHF and data training.", "url": "https://app.outlier.ai/en/expert/opportunities"},
        {"name": "Remotasks", "desc": "Classic micro-tasking for AI. Direct sign-up for workers.", "url": "https://www.remotasks.com/en/signup"},
        {"name": "Prolific", "desc": "Best for researchers and high-quality data providers.", "url": "https://www.prolific.com/participants"},
        {"name": "Clickworker", "desc": "Join the Clickworker crowd for AI and data processing.", "url": "https://www.clickworker.com/clickworker/"},
        {"name": "Appen", "desc": "Flexible AI training jobs for the world's leading tech labs.", "url": "https://appen.com/jobs/"},
        {"name": "Telus Digital", "desc": "Massive AI community for data collection and evaluation.", "url": "https://jobs.telusdigital.com/en_US/careers/aicommunity?intcmp=telus-digital-site"},
        {"name": "Toloka", "desc": "Global crowd-force for AI training and RLHF.", "url": "https://toloka.ai/careers#job-list"},
        {"name": "Mindrift", "desc": "Generate high-quality data for AI as an expert freelancer.", "url": "https://mindrift.ai/apply"},
        {"name": "Turing", "desc": "Largest network for AI training and specialized engineering jobs.", "url": "https://www.turing.com/jobs"},
        {"name": "OneForma", "desc": "Centific's platform for AI data and language projects.", "url": "https://www.oneforma.com/jobs/"},
        {"name": "Neevo", "desc": "Defined.ai's platform for earning money through tasking.", "url": "https://neevo.defined.ai/"},
        {"name": "CloudFactory", "desc": "Meaningful work in data labeling and AI training.", "url": "https://www.cloudfactory.com/cloudworker-details"},
        {"name": "Micro1", "desc": "Join an elite network of AI-vetted remote talent.", "url": "https://www.micro1.ai/jobs"},
        {"name": "Rapidata", "desc": "High-speed human feedback tasks. Instant sign-up portal.", "url": "https://rapidata.ai/"},
        {"name": "Botpool", "desc": "AI training and data annotation marketplace.", "url": "https://www.botpool.ai/find-jobs"},
        {"name": "Amazon MTurk", "desc": "The original marketplace for human intelligence (HITs).", "url": "https://www.mturk.com/get-started"},
        {"name": "Invisible Technologies", "desc": "Work as an advanced AI training operator.", "url": "https://invisibletech.ai/join-us#open-roles"},
        {"name": "Sama", "desc": "Impact-driven data labeling and AI training roles.", "url": "https://www.sama.com/ai101"},
        {"name": "Mercor", "desc": "AI-driven recruiting for top-tier training and dev roles.", "url": "https://mercor.com/"},
        {"name": "Rex.zone", "desc": "Curated RLHF and data services for training experts.", "url": "https://www.rex.zone/open-opportunities"}
    ]
    # Same record shape as the news scout, so both sources merge cleanly downstream.
    return [
        {
            "src": "Curated",
            "company_name": item['name'],
            "slug": slugify(item['name']),
            "date": datetime.now().isoformat(),
            "link": item['url'],
            "summary": item['desc'],
            "type": "AI Training Jobs",
            "funding_amount": "Active",
            "founders": []
        }
        for item in curated
    ]
|
|
def main():
    """Aggregate news-scouted and curated AI-training platforms into JSON output file(s).

    Writes `training_data.json` in the working directory and, when the sibling
    frontend directory exists, mirrors the file into `../web/public/`.
    """
    print("Starting Newly AI Training Scouting...")

    news_leads = fetch_ai_training_from_news()
    curated_leads = get_curated_platforms()

    # News leads first so a fresh article wins over the static curated entry
    # during deduplication below.
    all_data = news_leads + curated_leads

    # Deduplicate on slug, keeping the first occurrence (preserves order).
    seen = set()
    deduped = []
    for d in all_data:
        if d['slug'] not in seen:
            deduped.append(d)
            seen.add(d['slug'])

    # Mirror the output into the frontend's public dir when it exists
    # alongside this script's repo layout.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "training_data.json")

    paths_to_save = ["training_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        # Explicit UTF-8 so scraped non-ASCII summaries serialize identically
        # on every platform; ensure_ascii=False keeps them human-readable.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(deduped, f, indent=4, ensure_ascii=False)
        print(f"Success! Aggregated {len(deduped)} AI Training platforms into {path}")
|
|
| if __name__ == "__main__": |
| main() |
|
|