abc1181 committed on
Commit
f2ab351
·
verified ·
1 Parent(s): ab2fb0a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -0
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from flask import Flask, request, jsonify
3
+ from playwright.sync_api import sync_playwright
4
+ from bs4 import BeautifulSoup
5
+
6
# WSGI application object; the route handlers in this module register on it.
app = Flask(import_name=__name__)
7
+
8
@app.route('/scrape', methods=['POST'])
def scrape():
    """Render a page in headless Chromium and return its visible text.

    The request must carry an ``X-API-KEY`` header equal to the
    ``BROWSER_API_KEY`` environment variable, and a JSON object body with
    a ``url`` field holding an http(s) URL.

    Returns:
        A JSON response. ``{"text": ...}`` (truncated to 8000 characters)
        on success; otherwise ``{"error": ...}`` with status 401 (missing
        or wrong key, or no key configured on the server), 400 (missing
        URL or unsupported scheme), or 500 (browser or parsing failure).
    """
    # Function-scope imports keep this handler self-contained.
    import hmac
    from urllib.parse import urlsplit

    # Security check. Fail closed when no key is configured: comparing a
    # missing header against an unset variable (None != None) would
    # otherwise authorise every caller.
    expected_key = os.getenv("BROWSER_API_KEY")
    provided_key = request.headers.get("X-API-KEY")
    if (
        not expected_key
        or provided_key is None
        # Constant-time comparison; bytes so non-ASCII input cannot raise.
        or not hmac.compare_digest(
            provided_key.encode("utf-8"), expected_key.encode("utf-8")
        )
    ):
        return jsonify({"error": "Unauthorized"}), 401

    # silent=True yields None for a missing or malformed JSON body instead
    # of raising, so the caller gets the 400 below.
    payload = request.get_json(silent=True)
    url = payload.get("url") if isinstance(payload, dict) else None
    if not url:
        return jsonify({"error": "No URL provided"}), 400

    # Only web URLs may be fetched: file://, chrome:// and similar schemes
    # would expose the server's own files through the browser.
    # NOTE(review): http(s) URLs pointing at internal/private hosts are
    # still reachable from here; add a host allow/deny list if this
    # service runs inside a trusted network.
    try:
        scheme = urlsplit(url).scheme if isinstance(url, str) else ""
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        return jsonify({"error": "Only http(s) URLs are supported"}), 400

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                )
                page = context.new_page()
                # Wait for JS to finish loading
                page.goto(url, wait_until="networkidle", timeout=30000)
                content = page.content()
            finally:
                browser.close()

        soup = BeautifulSoup(content, 'html.parser')
        # Strip junk
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        text = soup.get_text(separator=' ', strip=True)[:8000]
    except Exception as e:
        # Request boundary: report any browser/parsing failure as JSON.
        return jsonify({"error": str(e)}), 500

    return jsonify({"text": text})
39
+
40
if __name__ == "__main__":
    # Run directly: serve on every network interface at port 7860.
    app.run(port=7860, host="0.0.0.0")