babaTEEpe commited on
Commit
b8d4e1d
·
verified ·
1 Parent(s): c2e6127

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +66 -0
  2. upwork_scraper.py +115 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, jsonify
2
+ from flask_cors import CORS
3
+ import threading
4
+ import time
5
+ import subprocess
6
+ import sys
7
+ import os
8
+ import json
9
+
10
# Flask application instance; all routes below attach to it.
app = Flask(__name__)
CORS(app) # Enable CORS so the React frontend can fetch data
12
+
13
def run_scrapers_periodically():
    """Background thread: run every scraper script once per hour.

    Each scraper runs in its own try block so a failure in one
    (non-zero exit via check=True, missing file, ...) no longer
    prevents the remaining scrapers from running that cycle.
    """
    scrapers = ["aggregator.py", "ai_training_scraper.py", "upwork_scraper.py"]
    while True:
        print(f"\n[{time.strftime('%Y-%m-%d %H:%M:%S')}] Starting scheduled background aggregation...")
        failures = 0
        for script in scrapers:
            try:
                # check=True turns a non-zero exit status into CalledProcessError.
                subprocess.run([sys.executable, script], check=True)
            except Exception as e:
                failures += 1
                print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Aggregation error: {e}")
        if not failures:
            print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] All scrapers synced successfully.")

        # Wait for 60 minutes before the next sweep.
        time.sleep(3600)
30
+
31
@app.route("/")
def health_check():
    """Liveness probe: report that the API process is up."""
    payload = {"status": "running", "platform": "Firstify Engine"}
    return jsonify(payload)
34
+
35
@app.route("/api/startups")
def get_startups():
    """Serve aggregated startup leads from data.json.

    Returns the parsed JSON payload, or an empty JSON array when the
    file is missing or unreadable (e.g. before the first scraper run).
    """
    try:
        with open("data.json", "r", encoding="utf-8") as f:
            return jsonify(json.load(f))
    # Narrowed from a bare except: only a missing/corrupt data file
    # should fall back to an empty list; programming errors must surface.
    except (OSError, json.JSONDecodeError):
        return jsonify([])
42
+
43
@app.route("/api/training")
def get_training():
    """Serve AI-training leads from training_data.json.

    Returns the parsed JSON payload, or an empty JSON array when the
    file is missing or unreadable (e.g. before the first scraper run).
    """
    try:
        with open("training_data.json", "r", encoding="utf-8") as f:
            return jsonify(json.load(f))
    # Narrowed from a bare except: only a missing/corrupt data file
    # should fall back to an empty list; programming errors must surface.
    except (OSError, json.JSONDecodeError):
        return jsonify([])
50
+
51
@app.route("/api/upwork")
def get_upwork():
    """Serve Upwork job leads from upwork_data.json.

    Returns the parsed JSON payload, or an empty JSON array when the
    file is missing or unreadable (e.g. before the first scraper run).
    """
    try:
        with open("upwork_data.json", "r", encoding="utf-8") as f:
            return jsonify(json.load(f))
    # Narrowed from a bare except: only a missing/corrupt data file
    # should fall back to an empty list; programming errors must surface.
    except (OSError, json.JSONDecodeError):
        return jsonify([])
58
+
59
if __name__ == "__main__":
    # Launch the hourly scraper loop as a daemon so it dies with the server.
    scraper_thread = threading.Thread(target=run_scrapers_periodically, daemon=True)
    scraper_thread.start()

    # Serve the API. Hugging Face Spaces expose port 7860 by default,
    # but honour a PORT override from the environment.
    listen_port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)
upwork_scraper.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import feedparser
2
+ import json
3
+ import requests
4
+ import time
5
+ import os
6
+ import re
7
+ from datetime import datetime
8
+ from bs4 import BeautifulSoup
9
+
10
# --- Configuration ---------------------------------------------------------
# TIP: Go to Upwork, search for jobs, and click the 'RSS' button to get your
# unique URL, then paste it into CUSTOM_RSS_URL below to bypass Upwork's
# general restrictions.
USER_AGENT = "Firstify Upwork Bypasser (contact@example.com)"
# Empty by default; the "Custom Feed" entry below then fails its fetch and is
# skipped by the per-feed error handling until a real URL is supplied.
CUSTOM_RSS_URL = ""

# Feeds fetched in turn; "name" doubles as the lead's category label.
UPWORK_FEEDS = [
    {"name": "Custom Feed", "url": CUSTOM_RSS_URL},
    {"name": "AI & Machine Learning", "url": "https://www.upwork.com/ab/feed/jobs/rss?q=artificial+intelligence&sort=recency"},
]

# Sent with every request so the traffic is identifiable rather than a
# default python-requests client.
HEADERS = {'User-Agent': USER_AGENT}
22
+
23
def slugify(text):
    """Lower-case *text*, collapse non-alphanumeric runs into '-', trim dashes."""
    collapsed = re.sub(r'[^a-z0-9]+', '-', text.lower())
    return collapsed.strip('-')
27
+
28
def extract_clues(description):
    """Search *description* for client-identity clues.

    Applies three heuristics — a self-introduction ("My name is X Y" /
    "I am X Y"), a company mention ("at Acme Inc"), and any URL — and
    returns the hits as "Label: value" strings. When nothing matches,
    returns a single fallback hint so callers always get a non-empty list.
    """
    clues = []

    # Heuristic 1: self-introductions. The pattern now covers both phrasings
    # the original comment promised ("I am ..." and "My name is ...",
    # sentence-initial or not); the name must be two capitalised words.
    name_match = re.search(r"(?:[Mm]y name is|I am) ([A-Z][a-z]+ [A-Z][a-z]+)", description)
    if name_match:
        clues.append(f"Name: {name_match.group(1)}")

    # Heuristic 2: "at <Company> <Corp|Inc|LLC|Solutions|Labs>".
    company_match = re.search(r"at ([A-Z][a-zA-Z0-9]+ (Corp|Inc|LLC|Solutions|Labs))", description)
    if company_match:
        clues.append(f"Company: {company_match.group(1)}")

    # Heuristic 3: first URL (http(s) or bare www.) found in the text.
    urls = re.findall(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)', description)
    if urls:
        clues.append(f"Site: {urls[0]}")

    return clues if clues else ["No direct clues found. Check reviews!"]
47
+
48
def fetch_upwork_jobs():
    """Fetch job entries from every configured Upwork RSS feed.

    Returns a list of lead dicts in the shared aggregator schema
    (company_name, slug, date, link, summary, ...) plus an Upwork-specific
    "clues" field. Feeds with an empty URL (the unconfigured "Custom Feed"
    placeholder) are skipped, and any per-feed failure is logged without
    aborting the remaining feeds.
    """
    print("Scouting Upwork Jobs for bypass opportunities...")
    all_jobs = []

    for feed_info in UPWORK_FEEDS:
        # Skip placeholder entries instead of issuing a request that is
        # guaranteed to fail (requests.get("") raises immediately).
        if not feed_info['url']:
            continue
        try:
            response = requests.get(feed_info['url'], headers=HEADERS, timeout=10)
            if response.status_code != 200:
                continue

            feed = feedparser.parse(response.text)
            for entry in feed.entries:
                # Strip HTML tags from the RSS summary to get plain text.
                desc_html = entry.summary if hasattr(entry, 'summary') else ""
                soup = BeautifulSoup(desc_html, 'html.parser')
                description = soup.get_text()

                # Extract potential client-identity clues.
                clues = extract_clues(description)

                # Upwork RSS titles usually look like "Job Title - Upwork".
                company_placeholder = entry.title.split(" - ")[0]

                all_jobs.append({
                    "src": "Upwork",
                    "company_name": company_placeholder,
                    # Suffix with clock digits to keep slugs unique across runs.
                    "slug": slugify(company_placeholder + "-" + str(time.time())[-4:]),
                    "date": entry.published if hasattr(entry, 'published') else datetime.now().isoformat(),
                    "link": entry.link,
                    "summary": description[:1000],
                    "type": f"Upwork: {feed_info['name']}",
                    "funding_amount": "Budget-Based",
                    "founders": [{"name": "Analyze Clues", "title": "Potential Client"}] if clues else [],
                    "clues": clues,  # Custom field for Upwork
                    "category": feed_info['name']
                })
        except Exception as e:
            # Best-effort: log and move on to the next feed.
            print(f"Error fetching {feed_info['name']}: {e}")

    return all_jobs
87
+
88
def main():
    """Fetch Upwork leads, de-duplicate by link, and write the JSON outputs."""
    print("Starting Upwork Bypasser Sync...")

    job_leads = fetch_upwork_jobs()

    # Keep only the first occurrence of each job link.
    seen_links = set()
    unique_leads = []
    for lead in job_leads:
        link = lead['link']
        if link in seen_links:
            continue
        seen_links.add(link)
        unique_leads.append(lead)

    # Mirror the data into the frontend's public folder when it exists.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    frontend_public_path = os.path.join(script_dir, "..", "web", "public", "upwork_data.json")

    targets = ["upwork_data.json"]
    if os.path.exists(os.path.dirname(frontend_public_path)):
        targets.append(frontend_public_path)

    for path in targets:
        with open(path, "w") as f:
            json.dump(unique_leads, f, indent=4)
        print(f"Success! Aggregated {len(unique_leads)} Upwork jobs into {path}")

if __name__ == "__main__":
    main()