Karim shoair committed on
Commit ·
05b9f2c
1
Parent(s): 2457b42
refactor: changes to be used by the mcp server
Browse files- scrapling/core/_types.py +3 -11
- scrapling/core/shell.py +58 -19
scrapling/core/_types.py
CHANGED
|
@@ -20,24 +20,16 @@ from typing import (
|
|
| 20 |
Match,
|
| 21 |
Mapping,
|
| 22 |
Awaitable,
|
|
|
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
|
| 26 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 27 |
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
|
|
|
| 28 |
StrOrBytes = Union[str, bytes]
|
| 29 |
|
| 30 |
-
try:
|
| 31 |
-
from typing import Protocol
|
| 32 |
-
except ImportError:
|
| 33 |
-
# Added in Python 3.8
|
| 34 |
-
Protocol = object
|
| 35 |
-
|
| 36 |
-
try:
|
| 37 |
-
from typing import SupportsIndex
|
| 38 |
-
except ImportError:
|
| 39 |
-
# 'SupportsIndex' got added in Python 3.8
|
| 40 |
-
SupportsIndex = None
|
| 41 |
|
| 42 |
if TYPE_CHECKING:
|
| 43 |
# typing.Self requires Python 3.11
|
|
|
|
| 20 |
Match,
|
| 21 |
Mapping,
|
| 22 |
Awaitable,
|
| 23 |
+
Protocol,
|
| 24 |
+
SupportsIndex,
|
| 25 |
)
|
| 26 |
|
| 27 |
SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
|
| 28 |
SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
|
| 29 |
PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
| 30 |
+
extraction_types = Literal["text", "html", "markdown"]
|
| 31 |
StrOrBytes = Union[str, bytes]
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
if TYPE_CHECKING:
|
| 35 |
# typing.Self requires Python 3.11
|
scrapling/core/shell.py
CHANGED
|
@@ -28,7 +28,15 @@ from scrapling import __version__
|
|
| 28 |
from scrapling.core.custom_types import TextHandler
|
| 29 |
from scrapling.core.utils import log
|
| 30 |
from scrapling.parser import Adaptor, Adaptors
|
| 31 |
-
from scrapling.core._types import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
from scrapling.fetchers import (
|
| 33 |
Fetcher,
|
| 34 |
AsyncFetcher,
|
|
@@ -561,13 +569,55 @@ Type 'exit' or press Ctrl+D to exit.
|
|
| 561 |
class Convertor:
|
| 562 |
"""Utils for the extract shell command"""
|
| 563 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 564 |
@classmethod
|
| 565 |
-
def
|
| 566 |
"""Convert HTML content to Markdown"""
|
| 567 |
from markdownify import markdownify
|
| 568 |
|
| 569 |
return markdownify(body)
|
| 570 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
@classmethod
|
| 572 |
def write_content_to_file(
|
| 573 |
cls, page: Adaptor, filename: str, css_selector: Optional[str] = None
|
|
@@ -582,21 +632,10 @@ class Convertor:
|
|
| 582 |
"Unknown file type: filename must end with '.md', '.html', or '.txt'"
|
| 583 |
)
|
| 584 |
else:
|
| 585 |
-
body = page if not css_selector else page.css_first(css_selector)
|
| 586 |
with open(filename, "w", encoding="utf-8") as f:
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
for s in (
|
| 594 |
-
"\n",
|
| 595 |
-
"\r",
|
| 596 |
-
"\t",
|
| 597 |
-
" ",
|
| 598 |
-
):
|
| 599 |
-
# Remove consecutive white-spaces
|
| 600 |
-
txt_content = re_sub(f"[{s}]+", s, txt_content)
|
| 601 |
-
|
| 602 |
-
f.write(txt_content)
|
|
|
|
| 28 |
from scrapling.core.custom_types import TextHandler
|
| 29 |
from scrapling.core.utils import log
|
| 30 |
from scrapling.parser import Adaptor, Adaptors
|
| 31 |
+
from scrapling.core._types import (
|
| 32 |
+
List,
|
| 33 |
+
Optional,
|
| 34 |
+
Dict,
|
| 35 |
+
Tuple,
|
| 36 |
+
Any,
|
| 37 |
+
Union,
|
| 38 |
+
extraction_types,
|
| 39 |
+
)
|
| 40 |
from scrapling.fetchers import (
|
| 41 |
Fetcher,
|
| 42 |
AsyncFetcher,
|
|
|
|
| 569 |
class Convertor:
|
| 570 |
"""Utils for the extract shell command"""
|
| 571 |
|
| 572 |
+
_extension_map: dict[str, extraction_types] = {
|
| 573 |
+
"md": "markdown",
|
| 574 |
+
"html": "html",
|
| 575 |
+
"txt": "text",
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
@classmethod
def _convert_to_markdown(cls, body: TextHandler) -> str:
    """Render the given HTML content as Markdown via `markdownify`."""
    # Function-scope import: `markdownify` is only loaded when a conversion is requested.
    from markdownify import markdownify as html_to_markdown

    markdown_text = html_to_markdown(body)
    return markdown_text
|
| 584 |
|
| 585 |
+
@classmethod
|
| 586 |
+
def _extract_content(
|
| 587 |
+
cls,
|
| 588 |
+
page: Adaptor,
|
| 589 |
+
extraction_type: extraction_types = "markdown",
|
| 590 |
+
css_selector: Optional[str] = None,
|
| 591 |
+
main_content_only: bool = False,
|
| 592 |
+
) -> str:
|
| 593 |
+
"""Extract the content of an Adaptor"""
|
| 594 |
+
if not page or not isinstance(page, Adaptor):
|
| 595 |
+
raise TypeError("Input must be of type `Adaptor`")
|
| 596 |
+
elif not extraction_type or extraction_type not in cls._extension_map.values():
|
| 597 |
+
raise ValueError(f"Unknown extraction type: {extraction_type}")
|
| 598 |
+
else:
|
| 599 |
+
if main_content_only:
|
| 600 |
+
page = page.css_first("body") or page
|
| 601 |
+
|
| 602 |
+
page = page if not css_selector else page.css_first(css_selector)
|
| 603 |
+
match extraction_type:
|
| 604 |
+
case "markdown":
|
| 605 |
+
return cls._convert_to_markdown(page.body)
|
| 606 |
+
case "html":
|
| 607 |
+
return page.body
|
| 608 |
+
case "text":
|
| 609 |
+
txt_content = page.get_all_text(strip=True)
|
| 610 |
+
for s in (
|
| 611 |
+
"\n",
|
| 612 |
+
"\r",
|
| 613 |
+
"\t",
|
| 614 |
+
" ",
|
| 615 |
+
):
|
| 616 |
+
# Remove consecutive white-spaces
|
| 617 |
+
txt_content = re_sub(f"[{s}]+", s, txt_content)
|
| 618 |
+
return txt_content
|
| 619 |
+
return ""
|
| 620 |
+
|
| 621 |
@classmethod
|
| 622 |
def write_content_to_file(
|
| 623 |
cls, page: Adaptor, filename: str, css_selector: Optional[str] = None
|
|
|
|
| 632 |
"Unknown file type: filename must end with '.md', '.html', or '.txt'"
|
| 633 |
)
|
| 634 |
else:
|
|
|
|
| 635 |
with open(filename, "w", encoding="utf-8") as f:
|
| 636 |
+
extension = filename.split(".")[-1]
|
| 637 |
+
f.write(
|
| 638 |
+
cls._extract_content(
|
| 639 |
+
page, cls._extension_map[extension], css_selector=css_selector
|
| 640 |
+
)
|
| 641 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|