Spaces:
Sleeping
Sleeping
meirk-brd
committed on
Commit
·
2c4cae4
1
Parent(s):
ebf4777
handle gradio file wrapping
Browse files
tool.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import json
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
import requests
|
| 7 |
from smolagents.tools import Tool
|
|
@@ -25,7 +26,19 @@ class BrightDataScraperTool(Tool):
|
|
| 25 |
}
|
| 26 |
super().__init__()
|
| 27 |
|
| 28 |
-
def forward(self, url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
|
| 30 |
unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
|
| 31 |
|
|
@@ -39,7 +52,7 @@ class BrightDataScraperTool(Tool):
|
|
| 39 |
}
|
| 40 |
|
| 41 |
payload = {
|
| 42 |
-
"url":
|
| 43 |
"zone": unlocker_zone,
|
| 44 |
"format": "raw",
|
| 45 |
"data_format": "markdown",
|
|
@@ -52,3 +65,20 @@ class BrightDataScraperTool(Tool):
|
|
| 52 |
except requests.exceptions.RequestException as exc:
|
| 53 |
details = exc.response.text if getattr(exc, "response", None) is not None else ""
|
| 54 |
return json.dumps({"error": str(exc), "details": details})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
+
from typing import Optional, Tuple
|
| 6 |
|
| 7 |
import requests
|
| 8 |
from smolagents.tools import Tool
|
|
|
|
| 26 |
}
|
| 27 |
super().__init__()
|
| 28 |
|
| 29 |
+
def forward(self, url) -> str:
|
| 30 |
+
url_str, file_path = self._coerce_url_input(url)
|
| 31 |
+
|
| 32 |
+
if file_path:
|
| 33 |
+
try:
|
| 34 |
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 35 |
+
return f.read()
|
| 36 |
+
except OSError as exc:
|
| 37 |
+
return json.dumps({"error": f"Failed to read uploaded file: {exc}"})
|
| 38 |
+
|
| 39 |
+
if not url_str:
|
| 40 |
+
return json.dumps({"error": "No valid URL provided"})
|
| 41 |
+
|
| 42 |
api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
|
| 43 |
unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")
|
| 44 |
|
|
|
|
| 52 |
}
|
| 53 |
|
| 54 |
payload = {
|
| 55 |
+
"url": url_str,
|
| 56 |
"zone": unlocker_zone,
|
| 57 |
"format": "raw",
|
| 58 |
"data_format": "markdown",
|
|
|
|
| 65 |
except requests.exceptions.RequestException as exc:
|
| 66 |
details = exc.response.text if getattr(exc, "response", None) is not None else ""
|
| 67 |
return json.dumps({"error": str(exc), "details": details})
|
| 68 |
+
|
| 69 |
+
def _coerce_url_input(self, raw) -> Tuple[Optional[str], Optional[str]]:
|
| 70 |
+
if isinstance(raw, str):
|
| 71 |
+
return raw, None
|
| 72 |
+
|
| 73 |
+
if isinstance(raw, dict):
|
| 74 |
+
file_path = raw.get("path") or raw.get("name")
|
| 75 |
+
if file_path and os.path.isfile(file_path):
|
| 76 |
+
return None, file_path
|
| 77 |
+
|
| 78 |
+
orig_name = raw.get("orig_name")
|
| 79 |
+
if isinstance(orig_name, str) and orig_name:
|
| 80 |
+
if orig_name.startswith(("http://", "https://")):
|
| 81 |
+
return orig_name, None
|
| 82 |
+
return f"https://{orig_name}", None
|
| 83 |
+
|
| 84 |
+
return None, None
|