# startups/ai_training_scraper.py
# (provenance: uploaded by babaTEEpe — "Update ai_training_scraper.py", commit c528907, verified)
import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime
# Configuration
# Identify ourselves to feed servers; contact address lets admins reach us.
USER_AGENT = "Firstify AI Tracker (contact@example.com)"
# TechCrunch "Startups" category RSS feed — the only news source scraped.
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
# AI Specific Keywords
# An entry matches if ANY keyword appears in its title or summary (lowercased).
AI_KEYWORDS = ["ai training", "human feedback", "rlhf", "llm evaluation", "data annotation", "ai workforce", "turing", "prolific", "annotation"]
# Shared request headers for all outbound HTTP calls.
HEADERS = {'User-Agent': USER_AGENT}
def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated URL slug.

    Any run of characters outside [a-z0-9] collapses to a single hyphen,
    and leading/trailing hyphens are trimmed.
    """
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
def fetch_ai_training_from_news():
    """Scrape the TechCrunch startups RSS feed for AI-training-related entries.

    Returns:
        list[dict]: one record per matching entry, in the same shape as the
        curated records (src/company_name/slug/date/link/summary/type/
        funding_amount/founders). Returns [] on any network failure or
        non-200 response so the caller's aggregation keeps working offline.
    """
    print("Scouting AI Training platforms from news...")
    try:
        # Timeout so a hung feed server cannot stall the whole scrape.
        response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=15)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    platforms = []
    for entry in feed.entries:
        # Not every RSS entry carries a summary; default to "" so both the
        # keyword match and the BeautifulSoup parse below are safe.
        summary = getattr(entry, 'summary', "")
        title_lower = entry.title.lower()
        summary_lower = summary.lower()
        if not any(keyword in title_lower or keyword in summary_lower for keyword in AI_KEYWORDS):
            continue
        # Extract company name (heuristic): headline text before common
        # funding-news boilerplate ("X raises $...", "X funding...", "X: ...").
        company_name = entry.title.split(" raises ")[0].split(" funding")[0].split(":")[0]
        platforms.append({
            "src": "AI Scout",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": getattr(entry, 'published', None) or datetime.now().isoformat(),
            "link": entry.link,
            "summary": BeautifulSoup(summary, 'html.parser').text[:500] + "...",
            "type": "AI Training",
            "funding_amount": "N/A",
            "founders": []
        })
    return platforms
def get_curated_platforms():
    """Return the hand-maintained list of paid AI-training platforms.

    Each curated entry is normalized into the same record shape as the news
    scraper output so downstream dedup and serialization treat both alike.
    """
    # Focus: Platforms where people make money providing data for AI training
    curated = [
        {"name": "DataAnnotation.tech", "desc": "Highest paying AI training platform. Direct access to core tasks.", "url": "https://www.dataannotation.tech/"},
        {"name": "Outlier.ai", "desc": "Scale AI's platform for expert RLHF and data training.", "url": "https://app.outlier.ai/en/expert/opportunities"},
        {"name": "Remotasks", "desc": "Classic micro-tasking for AI. Direct sign-up for workers.", "url": "https://www.remotasks.com/en/signup"},
        {"name": "Prolific", "desc": "Best for researchers and high-quality data providers.", "url": "https://www.prolific.com/participants"},
        {"name": "Clickworker", "desc": "Join the Clickworker crowd for AI and data processing.", "url": "https://www.clickworker.com/clickworker/"},
        {"name": "Appen", "desc": "Flexible AI training jobs for the world's leading tech labs.", "url": "https://appen.com/jobs/"},
        {"name": "Telus Digital", "desc": "Massive AI community for data collection and evaluation.", "url": "https://jobs.telusdigital.com/en_US/careers/aicommunity?intcmp=telus-digital-site"},
        {"name": "Toloka", "desc": "Global crowd-force for AI training and RLHF.", "url": "https://toloka.ai/careers#job-list"},
        {"name": "Mindrift", "desc": "Generate high-quality data for AI as an expert freelancer.", "url": "https://mindrift.ai/apply"},
        {"name": "Turing", "desc": "Largest network for AI training and specialized engineering jobs.", "url": "https://www.turing.com/jobs"},
        {"name": "OneForma", "desc": "Centific's platform for AI data and language projects.", "url": "https://www.oneforma.com/jobs/"},
        {"name": "Neevo", "desc": "Defined.ai's platform for earning money through tasking.", "url": "https://neevo.defined.ai/"},
        {"name": "CloudFactory", "desc": "Meaningful work in data labeling and AI training.", "url": "https://www.cloudfactory.com/cloudworker-details"},
        {"name": "Micro1", "desc": "Join an elite network of AI-vetted remote talent.", "url": "https://www.micro1.ai/jobs"},
        {"name": "Rapidata", "desc": "High-speed human feedback tasks. Instant sign-up portal.", "url": "https://rapidata.ai/"},
        {"name": "Botpool", "desc": "AI training and data annotation marketplace.", "url": "https://www.botpool.ai/find-jobs"},
        {"name": "Amazon MTurk", "desc": "The original marketplace for human intelligence (HITs).", "url": "https://www.mturk.com/get-started"},
        {"name": "Invisible Technologies", "desc": "Work as an advanced AI training operator.", "url": "https://invisibletech.ai/join-us#open-roles"},
        {"name": "Sama", "desc": "Impact-driven data labeling and AI training roles.", "url": "https://www.sama.com/ai101"},
        {"name": "Mercor", "desc": "AI-driven recruiting for top-tier training and dev roles.", "url": "https://mercor.com/"},
        {"name": "Rex.zone", "desc": "Curated RLHF and data services for training experts.", "url": "https://www.rex.zone/open-opportunities"}
    ]
    # Normalize every curated entry into the shared record schema.
    return [
        {
            "src": "Curated",
            "company_name": item['name'],
            "slug": slugify(item['name']),
            "date": datetime.now().isoformat(),
            "link": item['url'],
            "summary": item['desc'],
            "type": "AI Training Jobs",
            "funding_amount": "Active",
            "founders": []
        }
        for item in curated
    ]
def main():
    """Aggregate news-scouted and curated AI-training platforms into JSON.

    Merges both sources, deduplicates by slug (first occurrence wins, so a
    news lead takes precedence over its curated duplicate), then writes
    training_data.json to the working directory and — when the sibling
    web/public directory exists — syncs a copy to the frontend.
    """
    print("Starting Newly AI Training Scouting...")
    news_leads = fetch_ai_training_from_news()
    curated_leads = get_curated_platforms()
    all_data = news_leads + curated_leads

    # Deduplicate by slug, preserving order of first appearance.
    seen = set()
    deduped = []
    for record in all_data:
        if record['slug'] not in seen:
            deduped.append(record)
            seen.add(record['slug'])

    # Sync to Frontend: only target web/public if that directory exists,
    # so the script also runs standalone outside the repo layout.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "training_data.json")
    paths_to_save = ["training_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        # Explicit utf-8 (platform-independent) and ensure_ascii=False so
        # non-ASCII company names are stored readably rather than escaped.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(deduped, f, indent=4, ensure_ascii=False)
        print(f"Success! Aggregated {len(deduped)} AI Training platforms into {path}")


if __name__ == "__main__":
    main()