# startups/upwork_scraper.py
# Origin: Hugging Face Space file by babaTEEpe — commit 05d1446 ("Update upwork_scraper.py", verified)
import feedparser
import json
import requests
import time
import os
import re
from datetime import datetime
from bs4 import BeautifulSoup
# --- Configuration -------------------------------------------------------
# 💡 TIP: Go to Upwork, search for jobs, and click the 'RSS' button to get your unique URL!
# Paste your unique RSS link below to bypass Upwork's general restrictions.

# User-Agent string sent with every feed request (identifies this scraper).
USER_AGENT = "Firstify Upwork Bypasser (contact@example.com)"

# Optional personal RSS URL; when non-empty it is inserted at the front of
# UPWORK_FEEDS below and fetched before the default category feeds.
CUSTOM_RSS_URL = ""

# Default Upwork RSS searches — one entry per job category, sorted by recency.
# Each dict's "name" is used for the lead's "type"/"category" fields.
UPWORK_FEEDS = [
    {"name": "AI & Machine Learning", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=artificial+intelligence+machine+learning+nlp+llm&sort=recency"},
    {"name": "Web & Fullstack", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=nextjs+react+typescript+node+python+django+flask&sort=recency"},
    {"name": "Mobile Development", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=ios+android+flutter+react+native+mobile+app&sort=recency"},
    {"name": "DevOps & Cloud", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=aws+azure+gcp+docker+kubernetes+devops&sort=recency"},
    {"name": "Data Science & Python", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=data+science+analytics+python+sql&sort=recency"},
    {"name": "Cyber Security", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=cyber+security+penetration+testing+security+audit&sort=recency"},
    {"name": "UI/UX & Design", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=ui+ux+product+design+figma&sort=recency"},
    {"name": "Social Media & Marketing", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=social+media+marketing+content&sort=recency"},
    {"name": "Data Entry & Admin", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=data+entry+virtual+assistant&sort=recency"},
]

# Prepend the user's custom feed so it is queried first.
if CUSTOM_RSS_URL:
    UPWORK_FEEDS.insert(0, {"name": "Custom Feed", "url": CUSTOM_RSS_URL})

# Shared HTTP headers for every requests.get() call.
HEADERS = {'User-Agent': USER_AGENT}
def get_bootstrap_jobs():
    """Return two canned example leads.

    These illustrate what a successful scrape looks like (client name,
    company, and contact clues found) and are used as a fallback when no
    live jobs could be fetched. Dates are stamped at call time.
    """
    ai_lead = {
        "src": "Upwork",
        "company_name": "Senior AI Developer for Automation Project",
        "slug": "direct-lead-alex-vertex",
        "date": datetime.now().isoformat(),
        "link": "https://www.upwork.com/jobs/~01abc123456efg",
        "summary": "We are building a new AI platform. Contact Alex at alex.j@vertex-ai-labs.io if you have LangChain experience. Check our site: vertex-ai-labs.io",
        "type": "Upwork: AI & Dev",
        "funding_amount": "Direct Contact Found",
        "founders": [{"name": "Alex J.", "title": "Lead Client", "email": "alex.j@vertex-ai-labs.io"}],
        "clues": ["Email: alex.j@vertex-ai-labs.io", "Company: Vertex AI Labs", "Name: Alex J.", "Site: vertex-ai-labs.io"],
        "category": "Direct Outreach Available",
    }
    marketing_lead = {
        "src": "Upwork",
        "company_name": "Social Media Manager for Nexus Startup",
        "slug": "direct-lead-nexus-marketing",
        "date": datetime.now().isoformat(),
        "link": "https://www.upwork.com/jobs/~02xyz789101hij",
        "summary": "Need help with our X/LinkedIn. Found us at Nexus Marketing Group. Looking for Sarah Wilson's team.",
        "type": "Upwork: Marketing",
        "funding_amount": "Clues Found",
        "founders": [{"name": "Sarah Wilson", "title": "Hiring Manager"}],
        "clues": ["Name: Sarah Wilson", "Company: Nexus Marketing Group", "Channel: LinkedIn Search Sarah Wilson"],
        "category": "High Intent Clues",
    }
    return [ai_lead, marketing_lead]
def slugify(text):
    """Lowercase *text*, collapse every non-alphanumeric run into a single
    hyphen, and trim leading/trailing hyphens."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
def extract_clues(description):
    """
    Scan a plain-text job description for direct-contact clues.

    Looks for e-mail addresses, personal names after trigger phrases,
    company names with a corporate suffix, and the first non-Upwork URL.

    Parameters:
        description (str): job description text (HTML already stripped).

    Returns:
        list[str]: prefixed clue strings ("Email: ...", "Name: ...",
        "Company: ...", "Site: ..."), de-duplicated in discovery order,
        or a single fallback message when nothing was found.
    """
    clues = []

    # E-mail addresses anywhere in the text.
    for email in re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', description):
        clues.append(f"Email: {email}")

    # Name patterns: "my name is X", "contact X", "reach out to X", ...
    # The two-word form is tried first so full names win when present.
    # NOTE: re.IGNORECASE also relaxes the [A-Z][a-z]+ classes, so lowercase
    # words after a trigger phrase can match too (original behavior, kept).
    name_patterns = [
        r"(?:my name is|i am|reach out to|contact|ask for) ([A-Z][a-z]+ [A-Z][a-z]+)",
        r"(?:my name is|i am|reach out to|contact|ask for) ([A-Z][a-z]+)",
    ]
    for pattern in name_patterns:
        match = re.search(pattern, description, re.IGNORECASE)
        if match:
            clues.append(f"Name: {match.group(1)}")

    # Company names like "at Acme Labs" / "from Foo Inc" (case-sensitive).
    company_match = re.search(
        r"(?:at|from) ([A-Z][a-zA-Z0-9]+ (Corp|Inc|LLC|Solutions|Labs|Agency|Group))",
        description,
    )
    if company_match:
        clues.append(f"Company: {company_match.group(1)}")

    # Keep only the first URL that is not an Upwork link.
    for url in re.findall(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)', description):
        if "upwork.com" not in url:
            clues.append(f"Site: {url}")
            break

    # FIX: the original returned list(set(clues)), which shuffles clue order
    # nondeterministically across runs (hash randomization), so the exported
    # JSON differed between otherwise identical runs. dict.fromkeys
    # de-duplicates while preserving first-seen order.
    deduped = list(dict.fromkeys(clues))
    return deduped if deduped else ["No direct clues found. Check reviews!"]
def fetch_upwork_jobs():
    """
    Fetch every configured Upwork RSS feed and convert each entry into a
    normalized lead dict (same schema as get_bootstrap_jobs()).

    Returns:
        list[dict]: one dict per RSS entry across all feeds. Feeds that
        return a non-200 status or raise during parsing are skipped with
        a printed error; the run continues.
    """
    print("Scouting Upwork Jobs for bypass opportunities...")
    all_jobs = []
    for feed_info in UPWORK_FEEDS:
        try:
            response = requests.get(feed_info['url'], headers=HEADERS, timeout=10)
            # Upwork often blocks generic clients; silently skip non-200 feeds.
            if response.status_code != 200: continue
            feed = feedparser.parse(response.text)
            for entry in feed.entries:
                # Clean description: strip the HTML markup down to plain text.
                desc_html = entry.summary if hasattr(entry, 'summary') else ""
                soup = BeautifulSoup(desc_html, 'html.parser')
                description = soup.get_text()
                # Extract clues (potential client info) from the plain text.
                clues = extract_clues(description)
                # Upwork RSS titles usually look like "Job Title - Upwork";
                # keep only the part before the first " - ".
                company_placeholder = entry.title.split(" - ")[0]
                # Rebuild a canonical job URL from the GUID when possible.
                # NOTE(review): this regex only accepts job IDs starting with
                # "01"; the bootstrap data shows a "~02..." link exists, so
                # other prefixes fall through to entry.link — confirm intent.
                job_id_match = re.search(r'~(01[a-z0-9]+)', str(getattr(entry, 'guid', '')))
                if job_id_match:
                    job_link = f"https://www.upwork.com/jobs/~{job_id_match.group(1)}"
                else:
                    job_link = entry.link
                    # Drop tracking/query parameters from the raw link.
                    if "?" in job_link:
                        job_link = job_link.split("?")[0]
                all_jobs.append({
                    "src": "Upwork",
                    "company_name": company_placeholder,
                    # Suffix the slug with the last 4 chars of time.time() to
                    # keep slugs unique across identically-titled jobs.
                    "slug": slugify(company_placeholder + "-" + str(time.time())[-4:]),
                    "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
                    "link": job_link,
                    "summary": description[:1000],
                    "type": f"Upwork: {feed_info['name']}",
                    "funding_amount": "Budget-Based",
                    "founders": [{"name": "Analyze Clues", "title": "Potential Client"}] if clues else [],
                    "clues": clues,  # Custom field for Upwork
                    "category": feed_info['name']
                })
        except Exception as e:
            # Best-effort: one broken feed must not abort the whole run.
            print(f"Error fetching {feed_info['name']}: {e}")
    return all_jobs
def main():
    """Fetch Upwork leads, de-duplicate them by job link, and write the
    result as JSON next to the script (and into the frontend's public
    folder when it exists)."""
    print("Starting Upwork Bypasser Sync...")
    job_leads = fetch_upwork_jobs()

    # Keep only the first occurrence of each job link.
    seen_links = set()
    unique_leads = []
    for lead in job_leads:
        link = lead['link']
        if link in seen_links:
            continue
        seen_links.add(link)
        unique_leads.append(lead)

    # Fall back to canned examples so downstream consumers always get data.
    if not unique_leads:
        print("No live jobs found. Injecting bootstrap examples...")
        unique_leads = get_bootstrap_jobs()

    # Sync to Frontend: local copy always; frontend copy only when the
    # ../web/public directory actually exists.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "upwork_data.json")
    paths_to_save = ["upwork_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        with open(path, "w") as f:
            json.dump(unique_leads, f, indent=4)
        print(f"Success! Aggregated {len(unique_leads)} Upwork jobs into {path}")


if __name__ == "__main__":
    main()