# Tools/Modules/ScrapeGraphAI.py
# Source: uploaded via huggingface_hub by chmielvu (commit 588592f, ~32.8 kB).
from __future__ import annotations
import json
import os
from typing import Annotated, Any, Literal
import gradio as gr
from app import _log_call_end, _log_call_start, _truncate_for_log
from ._core import _resolve_path
from ._docstrings import autodoc
# One-line capability summary surfaced to the tool-calling layer and the UI.
TOOL_SUMMARY = (
    "Scrape and extract structured data from known URLs using ScrapeGraphAI with "
    "Mistral-only models. Supports single-page extraction, bounded crawl extraction, "
    "multi-URL extraction, rendered markdown, and image-aware extraction."
)
# Valid values for the `action` argument of ScrapeGraphAI().
ACTION_CHOICES = [
    "extract",
    "crawl_extract",
    "multi_extract",
    "render_markdown",
    "vision_extract",
]
# Valid fetch strategies: "browser" (Playwright path), "http" (requests + soup),
# "auto" (ScrapeGraph's browser-first default).
RENDER_CHOICES = ["auto", "browser", "http"]
# Environment variables that override the default Mistral model ids.
TEXT_MODEL_ENV = "SCRAPEGRAPH_TEXT_MODEL"
VISION_MODEL_ENV = "SCRAPEGRAPH_VISION_MODEL"
# Fallback model ids used when the env vars above are unset.
DEFAULT_TEXT_MODEL = "mistral-small-latest"
DEFAULT_VISION_MODEL = "pixtral-12b-latest"
# Import-time failure (if any) of the optional ScrapeGraphAI stack; checked
# lazily by _require_scrapegraph() so this module always imports cleanly.
_IMPORT_ERROR: Exception | None = None
try:
    from langchain.chat_models import init_chat_model
    from pydantic import BaseModel, Field, create_model
    from scrapegraphai.graphs import SmartScraperGraph, SmartScraperMultiGraph
    from scrapegraphai.graphs.abstract_graph import AbstractGraph
    from scrapegraphai.graphs.base_graph import BaseGraph
    from scrapegraphai.nodes import (
        DescriptionNode,
        FetchNode,
        FetchNodeLevelK,
        GenerateAnswerNodeKLevel,
        GenerateAnswerOmniNode,
        ImageToTextNode,
        ParseNode,
        ParseNodeDepthK,
        RAGNode,
    )
    from scrapegraphai.utils.convert_to_md import convert_to_md
except Exception as exc:  # pragma: no cover - import error path is runtime-only
    # Record the failure and stub every optional name to None so the rest of
    # the module stays importable; tool calls surface a structured error later.
    _IMPORT_ERROR = exc
    init_chat_model = None
    BaseModel = None
    Field = None
    create_model = None
    SmartScraperGraph = None
    SmartScraperMultiGraph = None
    AbstractGraph = None
    BaseGraph = None
    DescriptionNode = None
    FetchNode = None
    FetchNodeLevelK = None
    GenerateAnswerNodeKLevel = None
    GenerateAnswerOmniNode = None
    ImageToTextNode = None
    ParseNode = None
    ParseNodeDepthK = None
    RAGNode = None
    convert_to_md = None
else:

    class _LimitedFetchNodeLevelK(FetchNodeLevelK):
        """FetchNodeLevelK variant that truncates results to a soft `max_pages` cap."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Page cap comes from node_config; None (or missing) disables it.
            self.max_pages = None if self.node_config is None else self.node_config.get("max_pages")

        def obtain_content(self, documents, loader_kwargs):
            """Fetch via the parent implementation, then drop pages beyond the cap."""
            documents = super().obtain_content(documents, loader_kwargs)
            if self.max_pages and len(documents) > self.max_pages:
                return documents[: self.max_pages]
            return documents

    class _BoundedDepthSearchGraph(AbstractGraph):
        """Depth-bounded crawl+extract graph.

        Pipeline: level-K fetch (page-capped) -> depth-K parse -> description ->
        RAG vector store -> K-level answer generation.
        """

        def __init__(self, prompt: str, source: str, config: dict, schema: type[BaseModel] | None = None):
            super().__init__(prompt, config, source, schema)
            # HTTP(S) sources feed the "url" input key; anything else is
            # treated as a local directory path.
            self.input_key = "url" if source.startswith("http") else "local_dir"

        def _create_graph(self):
            """Wire the bounded-crawl node pipeline into a BaseGraph."""
            fetch_node_k = _LimitedFetchNodeLevelK(
                input="url| local_dir",
                output=["docs"],
                node_config={
                    "loader_kwargs": self.config.get("loader_kwargs", {}),
                    "force": self.config.get("force", False),
                    "cut": self.config.get("cut", True),
                    "browser_base": self.config.get("browser_base"),
                    "storage_state": self.config.get("storage_state"),
                    "depth": self.config.get("depth", 1),
                    "only_inside_links": self.config.get("only_inside_links", False),
                    "max_pages": self.config.get("max_pages"),
                },
            )
            parse_node_k = ParseNodeDepthK(
                input="docs",
                output=["docs"],
                node_config={"verbose": self.config.get("verbose", False)},
            )
            description_node = DescriptionNode(
                input="docs",
                output=["docs"],
                node_config={
                    "llm_model": self.llm_model,
                    "verbose": self.config.get("verbose", False),
                    "cache_path": self.config.get("cache_path", False),
                },
            )
            rag_node = RAGNode(
                input="docs",
                output=["vectorial_db"],
                node_config={
                    "llm_model": self.llm_model,
                    "embedder_model": self.config.get("embedder_model", False),
                    "verbose": self.config.get("verbose", False),
                },
            )
            generate_answer_k = GenerateAnswerNodeKLevel(
                input="vectorial_db",
                output=["answer"],
                node_config={
                    "llm_model": self.llm_model,
                    "embedder_model": self.config.get("embedder_model", False),
                    "verbose": self.config.get("verbose", False),
                    "schema": self.schema,
                },
            )
            return BaseGraph(
                nodes=[fetch_node_k, parse_node_k, description_node, rag_node, generate_answer_k],
                edges=[
                    (fetch_node_k, parse_node_k),
                    (parse_node_k, description_node),
                    (description_node, rag_node),
                    (rag_node, generate_answer_k),
                ],
                entry_point=fetch_node_k,
                graph_name=self.__class__.__name__,
            )

        def run(self):
            """Execute the graph; returns the extracted answer (or a fallback string)."""
            inputs = {"user_prompt": self.prompt, self.input_key: self.source}
            self.final_state, self.execution_info = self.graph.execute(inputs)
            return self.final_state.get("answer", "No answer found.")

    class _MistralOmniScraperGraph(AbstractGraph):
        """Image-aware single-page extraction graph.

        Pipeline: fetch -> parse (links + image URLs) -> image-to-text via a
        Mistral vision model -> omni answer generation combining text and
        image descriptions.
        """

        def __init__(self, prompt: str, source: str, config: dict, schema: type[BaseModel] | None = None):
            # Read max_images before super().__init__ so it is available even
            # if the base class normalizes/consumes config during init.
            self.max_images = config.get("max_images", 5)
            super().__init__(prompt, config, source, schema)
            self.input_key = "url" if source.startswith("http") else "local_dir"

        def _create_graph(self):
            """Wire the image-aware node pipeline into a BaseGraph."""
            # Dedicated vision model; reuses the same Mistral API key as the
            # text LLM configured under config["llm"].
            vision_model = init_chat_model(
                model=self.config.get("vision_model", DEFAULT_VISION_MODEL),
                model_provider="mistralai",
                api_key=self.config["llm"]["api_key"],
                temperature=0,
            )
            fetch_node = FetchNode(
                input="url | local_dir",
                output=["doc"],
                node_config={
                    "loader_kwargs": self.config.get("loader_kwargs", {}),
                    "storage_state": self.config.get("storage_state"),
                    "use_soup": self.config.get("use_soup", False),
                    "timeout": self.config.get("timeout", 30),
                },
            )
            parse_node = ParseNode(
                input="doc & (url | local_dir)",
                output=["parsed_doc", "link_urls", "img_urls"],
                node_config={
                    "chunk_size": self.model_token,
                    "parse_urls": True,
                    "llm_model": self.llm_model,
                },
            )
            image_to_text_node = ImageToTextNode(
                input="img_urls",
                output=["img_desc"],
                node_config={
                    "llm_model": vision_model,
                    "max_images": self.max_images,
                },
            )
            generate_answer_omni_node = GenerateAnswerOmniNode(
                input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
                output=["answer"],
                node_config={
                    "llm_model": self.llm_model,
                    "additional_info": self.config.get("additional_info"),
                    "schema": self.schema,
                },
            )
            return BaseGraph(
                nodes=[fetch_node, parse_node, image_to_text_node, generate_answer_omni_node],
                edges=[
                    (fetch_node, parse_node),
                    (parse_node, image_to_text_node),
                    (image_to_text_node, generate_answer_omni_node),
                ],
                entry_point=fetch_node,
                graph_name=self.__class__.__name__,
            )

        def run(self):
            """Execute the graph; returns the extracted answer (or a fallback string)."""
            inputs = {"user_prompt": self.prompt, self.input_key: self.source}
            self.final_state, self.execution_info = self.graph.execute(inputs)
            return self.final_state.get("answer", "No answer found.")
class ScrapeGraphToolError(RuntimeError):
def __init__(self, code: str, message: str, hint: str | None = None):
super().__init__(message)
self.code = code
self.message = message
self.hint = hint
def _json_response(payload: dict[str, Any]) -> str:
return json.dumps(payload, ensure_ascii=False, indent=2, default=str)
def _error_response(action: str, code: str, message: str, hint: str | None = None) -> str:
return _json_response(
{
"action": action,
"error": {"code": code, "message": message, **({"hint": hint} if hint else {})},
}
)
def _require_scrapegraph() -> None:
    """Raise a structured error if the optional ScrapeGraphAI stack failed to import."""
    if _IMPORT_ERROR is None:
        return
    raise ScrapeGraphToolError(
        "missing_scrapegraph_dependencies",
        f"ScrapeGraphAI dependencies are unavailable: {_IMPORT_ERROR}",
        "Install `scrapegraphai>=1.75.1` and its runtime dependencies.",
    )
def _require_mistral_key() -> str:
api_key = os.getenv("MISTRAL_API_KEY", "").strip()
if not api_key:
raise ScrapeGraphToolError(
"missing_mistral_api_key",
"MISTRAL_API_KEY is not configured.",
"Set MISTRAL_API_KEY in the environment before using ScrapeGraphAI extraction actions.",
)
return api_key
def _coerce_urls(urls: Any) -> list[str]:
if urls is None or urls == "":
return []
if isinstance(urls, list):
return [str(url).strip() for url in urls if str(url).strip()]
if isinstance(urls, str):
text = urls.strip()
if not text:
return []
if text.startswith("["):
parsed = json.loads(text)
if not isinstance(parsed, list):
raise ScrapeGraphToolError("invalid_urls", "urls must be a JSON array of URL strings.")
return [str(url).strip() for url in parsed if str(url).strip()]
return [part.strip() for part in text.replace("\r", "\n").replace(",", "\n").split("\n") if part.strip()]
raise ScrapeGraphToolError("invalid_urls", "urls must be provided as a list or JSON array string.")
def _coerce_schema(schema_json: Any) -> dict[str, Any] | None:
if schema_json in (None, "", {}):
return None
if isinstance(schema_json, dict):
return schema_json
if isinstance(schema_json, str):
try:
parsed = json.loads(schema_json)
except json.JSONDecodeError as exc:
raise ScrapeGraphToolError("invalid_schema_json", f"schema_json is not valid JSON: {exc}") from exc
if not isinstance(parsed, dict):
raise ScrapeGraphToolError("invalid_schema_json", "schema_json must decode to a JSON object.")
return parsed
raise ScrapeGraphToolError("invalid_schema_json", "schema_json must be a JSON object or JSON string.")
def _schema_to_type(name: str, schema: dict[str, Any]) -> Any:
schema_type = schema.get("type")
if schema_type == "string":
return str
if schema_type == "integer":
return int
if schema_type == "number":
return float
if schema_type == "boolean":
return bool
if schema_type == "array":
item_schema = schema.get("items", {})
return list[_schema_to_type(f"{name}Item", item_schema)]
if schema_type == "object" or "properties" in schema:
properties = schema.get("properties", {})
required = set(schema.get("required", []))
fields: dict[str, tuple[Any, Any]] = {}
for prop_name, prop_schema in properties.items():
prop_type = _schema_to_type(f"{name}{prop_name.title()}", prop_schema)
description = prop_schema.get("description")
is_required = prop_name in required
annotation = prop_type if is_required else (prop_type | None)
default = Field(... if is_required else None, description=description)
fields[prop_name] = (annotation, default)
return create_model(name, **fields)
return Any
def _schema_to_model(schema: dict[str, Any] | None) -> type[BaseModel] | None:
if not schema:
return None
if schema.get("type") not in (None, "object") and "properties" not in schema:
raise ScrapeGraphToolError(
"invalid_schema_json",
"Only object-shaped JSON schemas are supported for schema_json.",
)
model_type = _schema_to_type("ScrapeGraphResult", schema)
if not isinstance(model_type, type) or not issubclass(model_type, BaseModel):
raise ScrapeGraphToolError(
"invalid_schema_json",
"schema_json must define an object with properties for structured extraction.",
)
return model_type
def _resolve_storage_state(storage_state_path: str | None) -> str | None:
if not storage_state_path:
return None
candidate = storage_state_path.strip()
if not candidate:
return None
if os.path.isabs(candidate):
resolved = candidate
else:
resolved, _ = _resolve_path(candidate)
if not os.path.exists(resolved):
raise ScrapeGraphToolError(
"invalid_storage_state_path",
f"Storage state file not found: {candidate}",
)
return resolved
def _build_config(
    *,
    api_key: str | None,
    text_model: str | None = None,
    render_mode: str = "auto",
    timeout_s: int = 30,
    storage_state_path: str | None = None,
    depth: int | None = None,
    max_pages: int | None = None,
    same_domain_only: bool | None = None,
    max_images: int | None = None,
    vision_model: str | None = None,
) -> dict[str, Any]:
    """Assemble a ScrapeGraphAI graph configuration dict from tool arguments.

    Only supplied optional values are written into the config; numeric knobs
    are clamped to sane minimums (timeout >= 5s, depth/pages/images >= 1).
    """
    if render_mode not in RENDER_CHOICES:
        raise ScrapeGraphToolError("invalid_render_mode", f"Unsupported render_mode: {render_mode}")
    cfg: dict[str, Any] = {
        "headless": True,
        "verbose": False,
        "timeout": max(5, int(timeout_s)),
        # "http" mode opts into requests+soup fetching instead of a browser.
        "use_soup": render_mode == "http",
    }
    if api_key:
        chosen_model = text_model or os.getenv(TEXT_MODEL_ENV, DEFAULT_TEXT_MODEL)
        cfg["llm"] = {
            "api_key": api_key,
            "model": f"mistralai/{chosen_model}",
            "temperature": 0,
        }
    if storage_state_path:
        cfg["storage_state"] = storage_state_path
    if depth is not None:
        cfg["depth"] = max(1, int(depth))
    if max_pages is not None:
        cfg["max_pages"] = max(1, int(max_pages))
    if same_domain_only is not None:
        cfg["only_inside_links"] = bool(same_domain_only)
    if max_images is not None:
        cfg["max_images"] = max(1, int(max_images))
    if vision_model:
        cfg["vision_model"] = vision_model
    return cfg
def _json_safe(value: Any) -> Any:
    """Recursively convert *value* into JSON-serializable primitives.

    Pydantic models are dumped, containers are walked, Document-like objects
    (anything with .metadata and .page_content) are flattened to dicts, and
    strings that look like JSON are parsed opportunistically.
    """
    if BaseModel is not None and isinstance(value, BaseModel):
        return value.model_dump(mode="json")
    if isinstance(value, dict):
        return {key: _json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if hasattr(value, "metadata") and hasattr(value, "page_content"):
        return {
            "page_content": getattr(value, "page_content", ""),
            "metadata": _json_safe(getattr(value, "metadata", {})),
        }
    if isinstance(value, str):
        candidate = value.strip()
        if candidate.startswith(("{", "[")):
            try:
                return json.loads(candidate)
            except Exception:
                return value
    return value
def _extract_sources(state: dict[str, Any], fallback: list[str] | None = None) -> list[str]:
sources: list[str] = []
for item in state.get("docs", []) or []:
source = item.get("source") if isinstance(item, dict) else None
if source and source not in sources:
sources.append(source)
for doc in state.get("doc", []) or []:
metadata = getattr(doc, "metadata", {}) or {}
source = metadata.get("source")
if source and source not in sources:
sources.append(source)
if not sources and fallback:
sources.extend([source for source in fallback if source])
return sources
def _extract_links_and_images(doc_state: dict[str, Any], url: str) -> tuple[list[str], list[str]]:
    """Harvest link and image URLs from already-fetched documents via a ParseNode pass.

    Runs ParseNode without an LLM (llm_model=None) purely for URL parsing.
    Returns ([], []) when no documents are available.
    """
    url_parser = ParseNode(
        input="doc & url",
        output=["parsed_doc", "link_urls", "img_urls"],
        node_config={
            "parse_urls": True,
            "parse_html": True,
            "chunk_size": 8192,
            "llm_model": None,
        },
    )
    documents = doc_state.get("doc") or doc_state.get("html_content", [])
    if not documents:
        return [], []
    parse_state = {"doc": documents, "url": url}
    # ParseNode mutates the state dict in place; the return value is unused.
    url_parser.execute(parse_state)
    return parse_state.get("link_urls") or [], parse_state.get("img_urls") or []
def _render_markdown_with_fetch(url: str, config: dict[str, Any]) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Fetch *url* with a bare FetchNode and convert its HTML to markdown.

    Returns the fetch state (augmented with a "markdown" key) and an empty
    execution-info list. Raises ScrapeGraphToolError("fetch_failed") when no
    document was fetched or its content is empty.
    """
    fetcher = FetchNode(
        input="url",
        output=["doc"],
        node_config=config,
    )
    fetch_state = fetcher.execute({"url": url})
    documents = fetch_state.get("doc", []) or []
    if not documents:
        raise ScrapeGraphToolError("fetch_failed", "ScrapeGraph fetch returned no documents for render_markdown.")
    html = getattr(documents[0], "page_content", None) or ""
    if not html.strip():
        raise ScrapeGraphToolError("fetch_failed", "Fetched document for render_markdown had empty content.")
    fetch_state["markdown"] = convert_to_md(html)
    return fetch_state, []
# Main tool entry point. NOTE(review): @autodoc appears to generate the public
# docstring from TOOL_SUMMARY and the Annotated parameter descriptions, so no
# inline docstring is added here — confirm against _docstrings.autodoc.
# Dispatches on `action`, always returns a JSON string (success envelope or
# structured error), and never raises to the caller.
@autodoc(summary=TOOL_SUMMARY)
def ScrapeGraphAI(
    action: Annotated[
        Literal["extract", "crawl_extract", "multi_extract", "render_markdown", "vision_extract"],
        "Action to run: extract, crawl_extract, multi_extract, render_markdown, or vision_extract.",
    ] = "extract",
    url: Annotated[str, "Single URL for extract, crawl_extract, render_markdown, or vision_extract."] = "",
    urls: Annotated[list[str] | str | None, "Explicit list of URLs for multi_extract. Accepts a list or JSON array string."] = None,
    prompt: Annotated[str, "Natural-language extraction prompt. Required for extraction actions."] = "",
    schema_json: Annotated[dict[str, Any] | str | None, "Optional object-shaped JSON schema for structured extraction."] = None,
    render_mode: Annotated[Literal["auto", "browser", "http"], "Fetch mode. `browser` uses ScrapeGraph browser loading, `http` uses requests + soup, `auto` currently follows ScrapeGraph's browser-first path."] = "auto",
    include_images: Annotated[bool, "For `extract`, include page images in the extraction context."] = False,
    depth: Annotated[int, "For `crawl_extract`, crawl depth from the starting URL."] = 1,
    max_pages: Annotated[int, "For `crawl_extract`, soft cap on fetched pages."] = 4,
    same_domain_only: Annotated[bool, "For `crawl_extract`, stay within the starting site's links only."] = True,
    max_urls: Annotated[int, "For `multi_extract`, maximum URLs allowed in one call."] = 8,
    max_images: Annotated[int, "For `vision_extract` and image-aware extraction, maximum images to describe."] = 5,
    max_chars: Annotated[int, "For `render_markdown`, trim returned markdown to this many characters."] = 12000,
    include_links: Annotated[bool, "For `render_markdown`, include discovered page links."] = True,
    timeout_s: Annotated[int, "Timeout in seconds passed to ScrapeGraph fetch and generation nodes."] = 30,
    storage_state_path: Annotated[str, "Optional Playwright storage state JSON path for authenticated pages."] = "",
    return_debug: Annotated[bool, "Include execution metadata and graph execution info in the response."] = False,
) -> str:
    # Log the call up front; prompt is truncated to keep log lines bounded.
    _log_call_start(
        "ScrapeGraphAI",
        action=action,
        url=url,
        urls=urls,
        prompt=_truncate_for_log(prompt or "", 180),
        render_mode=render_mode,
        include_images=include_images,
        depth=depth,
        max_pages=max_pages,
        max_urls=max_urls,
        max_images=max_images,
        timeout_s=timeout_s,
        storage_state_path=storage_state_path,
        return_debug=return_debug,
    )
    try:
        # Shared validation/normalization for every action.
        _require_scrapegraph()
        storage_state = _resolve_storage_state(storage_state_path)
        schema = _coerce_schema(schema_json)
        schema_model = _schema_to_model(schema)
        text_model_name = os.getenv(TEXT_MODEL_ENV, DEFAULT_TEXT_MODEL)
        vision_model_name = os.getenv(VISION_MODEL_ENV, DEFAULT_VISION_MODEL)
        # render_markdown is the only action that needs no LLM, so it is
        # handled before the MISTRAL_API_KEY requirement below.
        if action == "render_markdown":
            if not url.strip():
                raise ScrapeGraphToolError("missing_url", "url is required for render_markdown.")
            final_state, exec_info = _render_markdown_with_fetch(
                url.strip(),
                _build_config(
                    api_key=None,
                    render_mode=render_mode,
                    timeout_s=timeout_s,
                    storage_state_path=storage_state,
                ),
            )
            # Note: the slice floor means at least 1000 chars are kept even
            # when max_chars is smaller.
            markdown = (final_state.get("markdown") or "")[: max(1000, int(max_chars))]
            links, images = _extract_links_and_images(final_state, url.strip())
            response = {
                "action": action,
                "result": {"markdown": markdown},
                "sources": [url.strip()],
                "artifacts": {
                    "markdown": markdown,
                    "links": links if include_links else [],
                    "images": images if include_images else [],
                    "per_url_results": [],
                },
                "meta": {
                    "render_mode_used": render_mode,
                    "text_model": None,
                    "vision_model": None,
                },
                "warnings": [],
            }
            if return_debug:
                response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(exec_info)}
            result = _json_response(response)
            _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
            return result
        # All remaining actions invoke Mistral models and need an API key.
        api_key = _require_mistral_key()
        if action == "extract":
            # Single-page extraction; include_images upgrades to the
            # image-aware omni graph.
            if not url.strip() or not prompt.strip():
                raise ScrapeGraphToolError("missing_arguments", "url and prompt are required for extract.")
            config = _build_config(
                api_key=api_key,
                text_model=text_model_name,
                render_mode=render_mode,
                timeout_s=timeout_s,
                storage_state_path=storage_state,
                max_images=max_images,
                vision_model=vision_model_name,
            )
            graph_cls = _MistralOmniScraperGraph if include_images else SmartScraperGraph
            graph = graph_cls(prompt=prompt.strip(), source=url.strip(), config=config, schema=schema_model)
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            response = {
                "action": action,
                "result": result_data,
                "sources": _extract_sources(final_state, [url.strip()]),
                "artifacts": {
                    "markdown": None,
                    "links": final_state.get("link_urls", []) or [],
                    "images": final_state.get("img_urls", []) or [],
                    "per_url_results": [],
                },
                "meta": {
                    "render_mode_used": render_mode,
                    "text_model": text_model_name,
                    "vision_model": vision_model_name if include_images else None,
                },
                "warnings": [],
            }
            if return_debug:
                response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(graph.get_execution_info())}
            result = _json_response(response)
            _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
            return result
        if action == "vision_extract":
            # Image-aware extraction; fails after the run when the page
            # yielded no image URLs.
            if not url.strip() or not prompt.strip():
                raise ScrapeGraphToolError("missing_arguments", "url and prompt are required for vision_extract.")
            graph = _MistralOmniScraperGraph(
                prompt=prompt.strip(),
                source=url.strip(),
                config=_build_config(
                    api_key=api_key,
                    text_model=text_model_name,
                    render_mode=render_mode,
                    timeout_s=timeout_s,
                    storage_state_path=storage_state,
                    max_images=max_images,
                    vision_model=vision_model_name,
                ),
                schema=schema_model,
            )
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            img_urls = final_state.get("img_urls", []) or []
            if not img_urls:
                raise ScrapeGraphToolError("no_images_found", "No images were found on the page for vision_extract.")
            response = {
                "action": action,
                "result": result_data,
                "sources": _extract_sources(final_state, [url.strip()]),
                "artifacts": {
                    "markdown": None,
                    "links": final_state.get("link_urls", []) or [],
                    "images": img_urls,
                    "per_url_results": [],
                },
                "meta": {
                    "render_mode_used": render_mode,
                    "text_model": text_model_name,
                    "vision_model": vision_model_name,
                },
                "warnings": [],
            }
            if return_debug:
                response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(graph.get_execution_info())}
            result = _json_response(response)
            _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
            return result
        if action == "multi_extract":
            # Same prompt applied across an explicit, bounded URL list.
            normalized_urls = _coerce_urls(urls)
            if not normalized_urls or not prompt.strip():
                raise ScrapeGraphToolError("missing_arguments", "urls and prompt are required for multi_extract.")
            if len(normalized_urls) > max(1, int(max_urls)):
                raise ScrapeGraphToolError("too_many_urls", f"multi_extract supports at most {max_urls} URLs per call.")
            graph = SmartScraperMultiGraph(
                prompt=prompt.strip(),
                source=normalized_urls,
                config=_build_config(
                    api_key=api_key,
                    text_model=text_model_name,
                    render_mode=render_mode,
                    timeout_s=timeout_s,
                    storage_state_path=storage_state,
                ),
                schema=schema_model,
            )
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            response = {
                "action": action,
                "result": result_data,
                "sources": normalized_urls,
                "artifacts": {
                    "markdown": None,
                    "links": [],
                    "images": [],
                    "per_url_results": _json_safe(final_state.get("results", [])),
                },
                "meta": {
                    "render_mode_used": render_mode,
                    "text_model": text_model_name,
                    "vision_model": None,
                },
                "warnings": [],
            }
            if return_debug:
                response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(graph.get_execution_info())}
            result = _json_response(response)
            _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
            return result
        if action == "crawl_extract":
            # Bounded-depth crawl from a starting URL, then RAG-style answer.
            if not url.strip() or not prompt.strip():
                raise ScrapeGraphToolError("missing_arguments", "url and prompt are required for crawl_extract.")
            graph = _BoundedDepthSearchGraph(
                prompt=prompt.strip(),
                source=url.strip(),
                config=_build_config(
                    api_key=api_key,
                    text_model=text_model_name,
                    render_mode=render_mode,
                    timeout_s=timeout_s,
                    storage_state_path=storage_state,
                    depth=depth,
                    max_pages=max_pages,
                    same_domain_only=same_domain_only,
                ),
                schema=schema_model,
            )
            result_data = _json_safe(graph.run())
            final_state = graph.get_state()
            response = {
                "action": action,
                "result": result_data,
                "sources": _extract_sources(final_state, [url.strip()]),
                "artifacts": {
                    "markdown": None,
                    "links": [],
                    "images": [],
                    "per_url_results": [],
                },
                "meta": {
                    "render_mode_used": render_mode,
                    "text_model": text_model_name,
                    "vision_model": None,
                },
                "warnings": [],
            }
            if return_debug:
                response["debug"] = {"final_state": _json_safe(final_state), "execution_info": _json_safe(graph.get_execution_info())}
            result = _json_response(response)
            _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
            return result
        raise ScrapeGraphToolError("unsupported_action", f"Unsupported action: {action}")
    except ScrapeGraphToolError as exc:
        # Known tool errors become structured JSON error responses.
        result = _error_response(action, exc.code, exc.message, exc.hint)
        _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
        return result
    except Exception as exc:  # pragma: no cover - runtime integration path
        # Heuristic classification: Playwright/Chromium mentions imply a
        # missing browser runtime; everything else is reported as fetch_failed.
        code = "browser_unavailable" if "playwright" in str(exc).lower() or "chromium" in str(exc).lower() else "fetch_failed"
        result = _error_response(action, code, f"ScrapeGraphAI action failed: {exc}")
        _log_call_end("ScrapeGraphAI", _truncate_for_log(result))
        return result
def build_interface() -> gr.Interface:
    """Build the Gradio Interface wrapping the ScrapeGraphAI tool function."""
    # One input widget per ScrapeGraphAI() parameter, in signature order.
    input_widgets = [
        gr.Dropdown(choices=ACTION_CHOICES, value="extract", label="Action"),
        gr.Textbox(label="URL", placeholder="https://example.com"),
        gr.JSON(label="URLs", value=[]),
        gr.Textbox(label="Prompt", lines=4, placeholder="Extract pricing tiers and main limits."),
        gr.JSON(label="Schema JSON", value={}),
        gr.Dropdown(choices=RENDER_CHOICES, value="auto", label="Render Mode"),
        gr.Checkbox(label="Include Images", value=False),
        gr.Number(label="Depth", value=1, precision=0),
        gr.Number(label="Max Pages", value=4, precision=0),
        gr.Checkbox(label="Same Domain Only", value=True),
        gr.Number(label="Max URLs", value=8, precision=0),
        gr.Number(label="Max Images", value=5, precision=0),
        gr.Number(label="Max Chars", value=12000, precision=0),
        gr.Checkbox(label="Include Links", value=True),
        gr.Number(label="Timeout (seconds)", value=30, precision=0),
        gr.Textbox(label="Storage State Path", placeholder="Optional Playwright storage_state JSON path"),
        gr.Checkbox(label="Return Debug", value=False),
    ]
    output_widget = gr.Textbox(label="Result", lines=20, max_lines=40)
    return gr.Interface(
        fn=ScrapeGraphAI,
        inputs=input_widgets,
        outputs=output_widget,
        title="ScrapeGraphAI",
        description="<div style=\"text-align:center\">Mistral-only structured scraping using ScrapeGraphAI graphs.</div>",
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
    )
__all__ = ["ScrapeGraphAI", "build_interface"]