# startups/upwork_scraper.py
# Origin: Hugging Face Space file by babaTEEpe — commit 05d1446 ("Update upwork_scraper.py", verified)
import feedparser
import json
import requests
import time
import os
import re
from datetime import datetime
from bs4 import BeautifulSoup
# --- Configuration -------------------------------------------------------
# 💡 TIP: Go to Upwork, search for jobs, and click the 'RSS' button to get your unique URL!
# Paste your unique RSS link below to bypass Upwork's general restrictions.

# User-Agent string sent with every feed request (identifies this scraper).
USER_AGENT = "Firstify Upwork Bypasser (contact@example.com)"

# Optional personal RSS URL; when non-empty it is inserted at the front of
# UPWORK_FEEDS below and fetched before the default category feeds.
CUSTOM_RSS_URL = ""

# Default Upwork RSS searches — one entry per job category, sorted by recency.
# Each dict's "name" is used for the lead's "type"/"category" fields.
UPWORK_FEEDS = [
    {"name": "AI & Machine Learning", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=artificial+intelligence+machine+learning+nlp+llm&sort=recency"},
    {"name": "Web & Fullstack", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=nextjs+react+typescript+node+python+django+flask&sort=recency"},
    {"name": "Mobile Development", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=ios+android+flutter+react+native+mobile+app&sort=recency"},
    {"name": "DevOps & Cloud", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=aws+azure+gcp+docker+kubernetes+devops&sort=recency"},
    {"name": "Data Science & Python", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=data+science+analytics+python+sql&sort=recency"},
    {"name": "Cyber Security", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=cyber+security+penetration+testing+security+audit&sort=recency"},
    {"name": "UI/UX & Design", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=ui+ux+product+design+figma&sort=recency"},
    {"name": "Social Media & Marketing", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=social+media+marketing+content&sort=recency"},
    {"name": "Data Entry & Admin", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=data+entry+virtual+assistant&sort=recency"},
]

# Prepend the user's custom feed so it is queried first.
if CUSTOM_RSS_URL:
    UPWORK_FEEDS.insert(0, {"name": "Custom Feed", "url": CUSTOM_RSS_URL})

# Shared HTTP headers for every requests.get() call.
HEADERS = {'User-Agent': USER_AGENT}
def get_bootstrap_jobs():
    """Return two canned example leads.

    These illustrate what a successful scrape looks like (client name,
    company, and contact clues found) and are used as a fallback when no
    live jobs could be fetched. Dates are stamped at call time.
    """
    ai_lead = {
        "src": "Upwork",
        "company_name": "Senior AI Developer for Automation Project",
        "slug": "direct-lead-alex-vertex",
        "date": datetime.now().isoformat(),
        "link": "https://www.upwork.com/jobs/~01abc123456efg",
        "summary": "We are building a new AI platform. Contact Alex at alex.j@vertex-ai-labs.io if you have LangChain experience. Check our site: vertex-ai-labs.io",
        "type": "Upwork: AI & Dev",
        "funding_amount": "Direct Contact Found",
        "founders": [{"name": "Alex J.", "title": "Lead Client", "email": "alex.j@vertex-ai-labs.io"}],
        "clues": ["Email: alex.j@vertex-ai-labs.io", "Company: Vertex AI Labs", "Name: Alex J.", "Site: vertex-ai-labs.io"],
        "category": "Direct Outreach Available",
    }
    marketing_lead = {
        "src": "Upwork",
        "company_name": "Social Media Manager for Nexus Startup",
        "slug": "direct-lead-nexus-marketing",
        "date": datetime.now().isoformat(),
        "link": "https://www.upwork.com/jobs/~02xyz789101hij",
        "summary": "Need help with our X/LinkedIn. Found us at Nexus Marketing Group. Looking for Sarah Wilson's team.",
        "type": "Upwork: Marketing",
        "funding_amount": "Clues Found",
        "founders": [{"name": "Sarah Wilson", "title": "Hiring Manager"}],
        "clues": ["Name: Sarah Wilson", "Company: Nexus Marketing Group", "Channel: LinkedIn Search Sarah Wilson"],
        "category": "High Intent Clues",
    }
    return [ai_lead, marketing_lead]
def slugify(text):
    """Lowercase *text*, collapse every non-alphanumeric run into a single
    hyphen, and trim leading/trailing hyphens."""
    hyphenated = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return hyphenated.strip('-')
def extract_clues(description):
    """
    Scan a plain-text job description for direct-contact clues.

    Looks for e-mail addresses, personal names after trigger phrases,
    company names with a corporate suffix, and the first non-Upwork URL.

    Parameters:
        description (str): job description text (HTML already stripped).

    Returns:
        list[str]: prefixed clue strings ("Email: ...", "Name: ...",
        "Company: ...", "Site: ..."), de-duplicated in discovery order,
        or a single fallback message when nothing was found.
    """
    clues = []

    # E-mail addresses anywhere in the text.
    for email in re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', description):
        clues.append(f"Email: {email}")

    # Name patterns: "my name is X", "contact X", "reach out to X", ...
    # The two-word form is tried first so full names win when present.
    # NOTE: re.IGNORECASE also relaxes the [A-Z][a-z]+ classes, so lowercase
    # words after a trigger phrase can match too (original behavior, kept).
    name_patterns = [
        r"(?:my name is|i am|reach out to|contact|ask for) ([A-Z][a-z]+ [A-Z][a-z]+)",
        r"(?:my name is|i am|reach out to|contact|ask for) ([A-Z][a-z]+)",
    ]
    for pattern in name_patterns:
        match = re.search(pattern, description, re.IGNORECASE)
        if match:
            clues.append(f"Name: {match.group(1)}")

    # Company names like "at Acme Labs" / "from Foo Inc" (case-sensitive).
    company_match = re.search(
        r"(?:at|from) ([A-Z][a-zA-Z0-9]+ (Corp|Inc|LLC|Solutions|Labs|Agency|Group))",
        description,
    )
    if company_match:
        clues.append(f"Company: {company_match.group(1)}")

    # Keep only the first URL that is not an Upwork link.
    for url in re.findall(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)', description):
        if "upwork.com" not in url:
            clues.append(f"Site: {url}")
            break

    # FIX: the original returned list(set(clues)), which shuffles clue order
    # nondeterministically across runs (hash randomization), so the exported
    # JSON differed between otherwise identical runs. dict.fromkeys
    # de-duplicates while preserving first-seen order.
    deduped = list(dict.fromkeys(clues))
    return deduped if deduped else ["No direct clues found. Check reviews!"]
def fetch_upwork_jobs():
    """
    Fetch every configured Upwork RSS feed and convert each entry into a
    normalized lead dict (same schema as get_bootstrap_jobs()).

    Returns:
        list[dict]: one dict per RSS entry across all feeds. Feeds that
        return a non-200 status or raise during parsing are skipped with
        a printed error; the run continues.
    """
    print("Scouting Upwork Jobs for bypass opportunities...")
    all_jobs = []
    for feed_info in UPWORK_FEEDS:
        try:
            response = requests.get(feed_info['url'], headers=HEADERS, timeout=10)
            # Upwork often blocks generic clients; silently skip non-200 feeds.
            if response.status_code != 200: continue
            feed = feedparser.parse(response.text)
            for entry in feed.entries:
                # Clean description: strip the HTML markup down to plain text.
                desc_html = entry.summary if hasattr(entry, 'summary') else ""
                soup = BeautifulSoup(desc_html, 'html.parser')
                description = soup.get_text()
                # Extract clues (potential client info) from the plain text.
                clues = extract_clues(description)
                # Upwork RSS titles usually look like "Job Title - Upwork";
                # keep only the part before the first " - ".
                company_placeholder = entry.title.split(" - ")[0]
                # Rebuild a canonical job URL from the GUID when possible.
                # NOTE(review): this regex only accepts job IDs starting with
                # "01"; the bootstrap data shows a "~02..." link exists, so
                # other prefixes fall through to entry.link — confirm intent.
                job_id_match = re.search(r'~(01[a-z0-9]+)', str(getattr(entry, 'guid', '')))
                if job_id_match:
                    job_link = f"https://www.upwork.com/jobs/~{job_id_match.group(1)}"
                else:
                    job_link = entry.link
                    # Drop tracking/query parameters from the raw link.
                    if "?" in job_link:
                        job_link = job_link.split("?")[0]
                all_jobs.append({
                    "src": "Upwork",
                    "company_name": company_placeholder,
                    # Suffix the slug with the last 4 chars of time.time() to
                    # keep slugs unique across identically-titled jobs.
                    "slug": slugify(company_placeholder + "-" + str(time.time())[-4:]),
                    "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
                    "link": job_link,
                    "summary": description[:1000],
                    "type": f"Upwork: {feed_info['name']}",
                    "funding_amount": "Budget-Based",
                    "founders": [{"name": "Analyze Clues", "title": "Potential Client"}] if clues else [],
                    "clues": clues,  # Custom field for Upwork
                    "category": feed_info['name']
                })
        except Exception as e:
            # Best-effort: one broken feed must not abort the whole run.
            print(f"Error fetching {feed_info['name']}: {e}")
    return all_jobs
def main():
    """Fetch Upwork leads, de-duplicate them by job link, and write the
    result as JSON next to the script (and into the frontend's public
    folder when it exists)."""
    print("Starting Upwork Bypasser Sync...")
    job_leads = fetch_upwork_jobs()

    # Keep only the first occurrence of each job link.
    seen_links = set()
    unique_leads = []
    for lead in job_leads:
        link = lead['link']
        if link in seen_links:
            continue
        seen_links.add(link)
        unique_leads.append(lead)

    # Fall back to canned examples so downstream consumers always get data.
    if not unique_leads:
        print("No live jobs found. Injecting bootstrap examples...")
        unique_leads = get_bootstrap_jobs()

    # Sync to Frontend: local copy always; frontend copy only when the
    # ../web/public directory actually exists.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "upwork_data.json")
    paths_to_save = ["upwork_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        paths_to_save.append(frontend_public_path)

    for path in paths_to_save:
        with open(path, "w") as f:
            json.dump(unique_leads, f, indent=4)
        print(f"Success! Aggregated {len(unique_leads)} Upwork jobs into {path}")


if __name__ == "__main__":
    main()