Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,15 +1,21 @@
|
|
| 1 |
import os
|
| 2 |
-
os.system("playwright install")
|
| 3 |
import re
|
| 4 |
import urllib.parse
|
| 5 |
import asyncio
|
| 6 |
from typing import Dict, Optional
|
| 7 |
from itertools import cycle
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
from bs4 import BeautifulSoup, NavigableString
|
| 11 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
class CredentialRevolver:
|
| 14 |
def __init__(self, proxy_string: str):
|
| 15 |
self.proxies = self._parse_proxies(proxy_string)
|
|
@@ -57,6 +63,7 @@ SEARCH_ENGINES = {
|
|
| 57 |
"Perplexity": "https://www.perplexity.ai/search?q={query}",
|
| 58 |
}
|
| 59 |
|
|
|
|
| 60 |
class HTML_TO_MARKDOWN_CONVERTER:
|
| 61 |
def __init__(self, soup: BeautifulSoup, base_url: str):
|
| 62 |
self.soup = soup
|
|
@@ -108,6 +115,7 @@ class HTML_TO_MARKDOWN_CONVERTER:
|
|
| 108 |
return f"\n\n\n\n"
|
| 109 |
return inner_md
|
| 110 |
|
|
|
|
| 111 |
async def perform_web_browse(action: str, query: str, browser_name: str, search_engine_name: str):
|
| 112 |
browser_key = browser_name.lower()
|
| 113 |
if "playwright" not in PLAYWRIGHT_STATE:
|
|
@@ -170,21 +178,54 @@ async def perform_web_browse(action: str, query: str, browser_name: str, search_
|
|
| 170 |
if 'page' in locals() and not page.is_closed(): await page.close()
|
| 171 |
if 'context' in locals(): await context.close()
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
|
|
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
|
|
|
| 189 |
if __name__ == "__main__":
|
| 190 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import re
|
| 3 |
import urllib.parse
|
| 4 |
import asyncio
|
| 5 |
from typing import Dict, Optional
|
| 6 |
from itertools import cycle
|
| 7 |
|
| 8 |
+
# Download the Playwright browser binaries (the `playwright` package itself must already be installed)
|
| 9 |
+
os.system("playwright install")
|
| 10 |
+
|
| 11 |
+
from flask import Flask, request, jsonify
|
| 12 |
from bs4 import BeautifulSoup, NavigableString
|
| 13 |
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
|
| 14 |
|
| 15 |
+
# --- Flask App Initialization ---
|
| 16 |
+
app = Flask(__name__)
|
| 17 |
+
|
| 18 |
+
# --- Credential and State Management (largely unchanged) ---
|
| 19 |
class CredentialRevolver:
|
| 20 |
def __init__(self, proxy_string: str):
|
| 21 |
self.proxies = self._parse_proxies(proxy_string)
|
|
|
|
| 63 |
"Perplexity": "https://www.perplexity.ai/search?q={query}",
|
| 64 |
}
|
| 65 |
|
| 66 |
+
# --- HTML to Markdown Conversion (unchanged) ---
|
| 67 |
class HTML_TO_MARKDOWN_CONVERTER:
|
| 68 |
def __init__(self, soup: BeautifulSoup, base_url: str):
|
| 69 |
self.soup = soup
|
|
|
|
| 115 |
return f"\n\n\n\n"
|
| 116 |
return inner_md
|
| 117 |
|
| 118 |
+
# --- Core Web Browsing Logic (unchanged) ---
|
| 119 |
async def perform_web_browse(action: str, query: str, browser_name: str, search_engine_name: str):
|
| 120 |
browser_key = browser_name.lower()
|
| 121 |
if "playwright" not in PLAYWRIGHT_STATE:
|
|
|
|
| 178 |
if 'page' in locals() and not page.is_closed(): await page.close()
|
| 179 |
if 'context' in locals(): await context.close()
|
| 180 |
|
| 181 |
+
|
| 182 |
+
# --- API Endpoint Definition ---
|
| 183 |
+
@app.route('/web_browse', methods=['POST'])
def web_browse():
    """
    API endpoint to perform a web search or scrape a URL.

    This endpoint expects a JSON object payload with the following parameters:
    - "action": "Search" or "Scrape URL" (required)
    - "query": The search term or the URL to scrape (required)
    - "browser_name": "firefox", "chromium", or "webkit" (optional, default: "firefox")
    - "search_engine_name": Name of the search engine (optional, default: "DuckDuckGo")

    Responses are always JSON:
    - 400 when the payload is not a JSON object, a required parameter is
      missing/empty, or "action" is not one of the two accepted values.
    - 200 when perform_web_browse reports "status" == "success".
    - 500 for any other result, or when perform_web_browse raises.

    Example usage with curl:
    curl -X POST http://127.0.0.1:7860/web_browse \
    -H "Content-Type: application/json" \
    -d '{
    "action": "Search",
    "query": "latest news on AI",
    "browser_name": "firefox",
    "search_engine_name": "Google"
    }'
    """
    if not request.is_json:
        return jsonify({"status": "error", "error_message": "Invalid input: payload must be JSON"}), 400

    data = request.get_json()
    # A JSON array, scalar or null is valid JSON but has no .get(); reject it
    # here so the client gets a 400 instead of an unhandled AttributeError.
    if not isinstance(data, dict):
        return jsonify({"status": "error", "error_message": "Invalid input: payload must be a JSON object"}), 400

    action = data.get('action')
    query = data.get('query')
    browser_name = data.get('browser_name', 'firefox')  # Default to firefox
    search_engine_name = data.get('search_engine_name', 'DuckDuckGo')  # Default to DuckDuckGo

    if not action or not query:
        return jsonify({"status": "error", "error_message": "Missing required parameters: 'action' and 'query' are mandatory."}), 400

    if action not in ["Search", "Scrape URL"]:
        return jsonify({"status": "error", "error_message": "Invalid 'action'. Must be 'Search' or 'Scrape URL'."}), 400

    # asyncio.run() creates a fresh event loop for this call and closes it
    # when the coroutine finishes.
    # NOTE(review): perform_web_browse consults the module-level
    # PLAYWRIGHT_STATE; if that caches Playwright objects between requests
    # they would belong to an already-closed loop -- confirm.
    try:
        result = asyncio.run(perform_web_browse(action, query, browser_name, search_engine_name))
        status_code = 200 if result.get("status") == "success" else 500
        return jsonify(result), status_code
    except Exception as e:
        # Top-level boundary: convert any failure into a JSON 500 response.
        return jsonify({"status": "error", "query": query, "error_message": f"An unexpected server error occurred: {str(e)}"}), 500
|
| 225 |
+
|
| 226 |
|
| 227 |
+
# --- Main Application Runner ---
|
| 228 |
if __name__ == "__main__":
    # The server binds to all interfaces, and Werkzeug's interactive debugger
    # allows arbitrary code execution, so debug mode is opt-in via the
    # FLASK_DEBUG environment variable rather than always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
    print(f"Flask server starting... {REVOLVER.count()} proxies loaded.")
    print("API Endpoint available at POST http://127.0.0.1:7860/web_browse")
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)
|