# startups/ai_training_scraper.py
# (provenance: uploaded by babaTEEpe — "Update ai_training_scraper.py", commit c528907, verified)
import feedparser
import json
import requests
import time
import os
import re
from bs4 import BeautifulSoup
from datetime import datetime
# Configuration
# Identify ourselves to feed servers; contact address lets admins reach us.
USER_AGENT = "Firstify AI Tracker (contact@example.com)"
# TechCrunch "Startups" category RSS feed — the only news source scraped.
TC_RSS_URL = "https://techcrunch.com/category/startups/feed/"
# AI Specific Keywords
# An entry matches if ANY keyword appears in its title or summary (lowercased).
AI_KEYWORDS = ["ai training", "human feedback", "rlhf", "llm evaluation", "data annotation", "ai workforce", "turing", "prolific", "annotation"]
# Shared request headers for all outbound HTTP calls.
HEADERS = {'User-Agent': USER_AGENT}
def slugify(text):
    """Turn *text* into a lowercase, hyphen-separated URL slug.

    Any run of characters outside [a-z0-9] collapses to a single hyphen,
    and leading/trailing hyphens are trimmed.
    """
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
def fetch_ai_training_from_news():
    """Scrape the TechCrunch startups RSS feed for AI-training-related entries.

    Returns:
        list[dict]: one record per matching entry, in the same shape as the
        curated records (src/company_name/slug/date/link/summary/type/
        funding_amount/founders). Returns [] on any network failure or
        non-200 response so the caller's aggregation keeps working offline.
    """
    print("Scouting AI Training platforms from news...")
    try:
        # Timeout so a hung feed server cannot stall the whole scrape.
        response = requests.get(TC_RSS_URL, headers=HEADERS, timeout=15)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []
    feed = feedparser.parse(response.text)
    platforms = []
    for entry in feed.entries:
        # Not every RSS entry carries a summary; default to "" so both the
        # keyword match and the BeautifulSoup parse below are safe.
        summary = getattr(entry, 'summary', "")
        title_lower = entry.title.lower()
        summary_lower = summary.lower()
        if not any(keyword in title_lower or keyword in summary_lower for keyword in AI_KEYWORDS):
            continue
        # Extract company name (heuristic): headline text before common
        # funding-news boilerplate ("X raises $...", "X funding...", "X: ...").
        company_name = entry.title.split(" raises ")[0].split(" funding")[0].split(":")[0]
        platforms.append({
            "src": "AI Scout",
            "company_name": company_name,
            "slug": slugify(company_name),
            "date": getattr(entry, 'published', None) or datetime.now().isoformat(),
            "link": entry.link,
            "summary": BeautifulSoup(summary, 'html.parser').text[:500] + "...",
            "type": "AI Training",
            "funding_amount": "N/A",
            "founders": []
        })
    return platforms
def get_curated_platforms():
    """Return the hand-maintained list of paid AI-training platforms.

    Each curated entry is normalized into the same record shape as the news
    scraper output so downstream dedup and serialization treat both alike.
    """
    # Focus: Platforms where people make money providing data for AI training
    curated = [
        {"name": "DataAnnotation.tech", "desc": "Highest paying AI training platform. Direct access to core tasks.", "url": "https://www.dataannotation.tech/"},
        {"name": "Outlier.ai", "desc": "Scale AI's platform for expert RLHF and data training.", "url": "https://app.outlier.ai/en/expert/opportunities"},
        {"name": "Remotasks", "desc": "Classic micro-tasking for AI. Direct sign-up for workers.", "url": "https://www.remotasks.com/en/signup"},
        {"name": "Prolific", "desc": "Best for researchers and high-quality data providers.", "url": "https://www.prolific.com/participants"},
        {"name": "Clickworker", "desc": "Join the Clickworker crowd for AI and data processing.", "url": "https://www.clickworker.com/clickworker/"},
        {"name": "Appen", "desc": "Flexible AI training jobs for the world's leading tech labs.", "url": "https://appen.com/jobs/"},
        {"name": "Telus Digital", "desc": "Massive AI community for data collection and evaluation.", "url": "https://jobs.telusdigital.com/en_US/careers/aicommunity?intcmp=telus-digital-site"},
        {"name": "Toloka", "desc": "Global crowd-force for AI training and RLHF.", "url": "https://toloka.ai/careers#job-list"},
        {"name": "Mindrift", "desc": "Generate high-quality data for AI as an expert freelancer.", "url": "https://mindrift.ai/apply"},
        {"name": "Turing", "desc": "Largest network for AI training and specialized engineering jobs.", "url": "https://www.turing.com/jobs"},
        {"name": "OneForma", "desc": "Centific's platform for AI data and language projects.", "url": "https://www.oneforma.com/jobs/"},
        {"name": "Neevo", "desc": "Defined.ai's platform for earning money through tasking.", "url": "https://neevo.defined.ai/"},
        {"name": "CloudFactory", "desc": "Meaningful work in data labeling and AI training.", "url": "https://www.cloudfactory.com/cloudworker-details"},
        {"name": "Micro1", "desc": "Join an elite network of AI-vetted remote talent.", "url": "https://www.micro1.ai/jobs"},
        {"name": "Rapidata", "desc": "High-speed human feedback tasks. Instant sign-up portal.", "url": "https://rapidata.ai/"},
        {"name": "Botpool", "desc": "AI training and data annotation marketplace.", "url": "https://www.botpool.ai/find-jobs"},
        {"name": "Amazon MTurk", "desc": "The original marketplace for human intelligence (HITs).", "url": "https://www.mturk.com/get-started"},
        {"name": "Invisible Technologies", "desc": "Work as an advanced AI training operator.", "url": "https://invisibletech.ai/join-us#open-roles"},
        {"name": "Sama", "desc": "Impact-driven data labeling and AI training roles.", "url": "https://www.sama.com/ai101"},
        {"name": "Mercor", "desc": "AI-driven recruiting for top-tier training and dev roles.", "url": "https://mercor.com/"},
        {"name": "Rex.zone", "desc": "Curated RLHF and data services for training experts.", "url": "https://www.rex.zone/open-opportunities"}
    ]
    # Normalize every curated entry into the shared record schema.
    return [
        {
            "src": "Curated",
            "company_name": item['name'],
            "slug": slugify(item['name']),
            "date": datetime.now().isoformat(),
            "link": item['url'],
            "summary": item['desc'],
            "type": "AI Training Jobs",
            "funding_amount": "Active",
            "founders": []
        }
        for item in curated
    ]
def main():
    """Aggregate news-scouted and curated AI-training platforms into JSON.

    Merges both sources, deduplicates by slug (first occurrence wins, so a
    news lead takes precedence over its curated duplicate), then writes
    training_data.json to the working directory and — when the sibling
    web/public directory exists — syncs a copy to the frontend.
    """
    print("Starting Newly AI Training Scouting...")
    news_leads = fetch_ai_training_from_news()
    curated_leads = get_curated_platforms()
    all_data = news_leads + curated_leads

    # Deduplicate by slug, preserving order of first appearance.
    seen = set()
    deduped = []
    for record in all_data:
        if record['slug'] not in seen:
            deduped.append(record)
            seen.add(record['slug'])

    # Sync to Frontend: only target web/public if that directory exists,
    # so the script also runs standalone outside the repo layout.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "training_data.json")
    paths_to_save = ["training_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        # Explicit utf-8 (platform-independent) and ensure_ascii=False so
        # non-ASCII company names are stored readably rather than escaped.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(deduped, f, indent=4, ensure_ascii=False)
        print(f"Success! Aggregated {len(deduped)} AI Training platforms into {path}")


if __name__ == "__main__":
    main()