Karim shoair committed on
Commit
05b9f2c
·
1 Parent(s): 2457b42

refactor: changes to be used by the mcp server

Browse files
Files changed (2) hide show
  1. scrapling/core/_types.py +3 -11
  2. scrapling/core/shell.py +58 -19
scrapling/core/_types.py CHANGED
@@ -20,24 +20,16 @@ from typing import (
20
  Match,
21
  Mapping,
22
  Awaitable,
 
 
23
  )
24
 
25
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
26
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
27
  PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
 
28
  StrOrBytes = Union[str, bytes]
29
 
30
- try:
31
- from typing import Protocol
32
- except ImportError:
33
- # Added in Python 3.8
34
- Protocol = object
35
-
36
- try:
37
- from typing import SupportsIndex
38
- except ImportError:
39
- # 'SupportsIndex' got added in Python 3.8
40
- SupportsIndex = None
41
 
42
  if TYPE_CHECKING:
43
  # typing.Self requires Python 3.11
 
20
  Match,
21
  Mapping,
22
  Awaitable,
23
+ Protocol,
24
+ SupportsIndex,
25
  )
26
 
27
  SUPPORTED_HTTP_METHODS = Literal["GET", "POST", "PUT", "DELETE"]
28
  SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
29
  PageLoadStates = Literal["commit", "domcontentloaded", "load", "networkidle"]
30
+ extraction_types = Literal["text", "html", "markdown"]
31
  StrOrBytes = Union[str, bytes]
32
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  if TYPE_CHECKING:
35
  # typing.Self requires Python 3.11
scrapling/core/shell.py CHANGED
@@ -28,7 +28,15 @@ from scrapling import __version__
28
  from scrapling.core.custom_types import TextHandler
29
  from scrapling.core.utils import log
30
  from scrapling.parser import Adaptor, Adaptors
31
- from scrapling.core._types import List, Optional, Dict, Tuple, Any, Union
 
 
 
 
 
 
 
 
32
  from scrapling.fetchers import (
33
  Fetcher,
34
  AsyncFetcher,
@@ -561,13 +569,55 @@ Type 'exit' or press Ctrl+D to exit.
561
  class Convertor:
562
  """Utils for the extract shell command"""
563
 
 
 
 
 
 
 
564
  @classmethod
565
- def __convert_to_markdown(cls, body: TextHandler) -> str:
566
  """Convert HTML content to Markdown"""
567
  from markdownify import markdownify
568
 
569
  return markdownify(body)
570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  @classmethod
572
  def write_content_to_file(
573
  cls, page: Adaptor, filename: str, css_selector: Optional[str] = None
@@ -582,21 +632,10 @@ class Convertor:
582
  "Unknown file type: filename must end with '.md', '.html', or '.txt'"
583
  )
584
  else:
585
- body = page if not css_selector else page.css_first(css_selector)
586
  with open(filename, "w", encoding="utf-8") as f:
587
- if filename.endswith(".md"):
588
- f.write(cls.__convert_to_markdown(body.body))
589
- elif filename.endswith(".html"):
590
- f.write(body.body)
591
- elif filename.endswith(".txt"):
592
- txt_content = body.get_all_text(strip=True)
593
- for s in (
594
- "\n",
595
- "\r",
596
- "\t",
597
- " ",
598
- ):
599
- # Remove consecutive white-spaces
600
- txt_content = re_sub(f"[{s}]+", s, txt_content)
601
-
602
- f.write(txt_content)
 
28
  from scrapling.core.custom_types import TextHandler
29
  from scrapling.core.utils import log
30
  from scrapling.parser import Adaptor, Adaptors
31
+ from scrapling.core._types import (
32
+ List,
33
+ Optional,
34
+ Dict,
35
+ Tuple,
36
+ Any,
37
+ Union,
38
+ extraction_types,
39
+ )
40
  from scrapling.fetchers import (
41
  Fetcher,
42
  AsyncFetcher,
 
569
  class Convertor:
570
  """Utils for the extract shell command"""
571
 
572
+ _extension_map: dict[str, extraction_types] = {
573
+ "md": "markdown",
574
+ "html": "html",
575
+ "txt": "text",
576
+ }
577
+
578
  @classmethod
579
+ def _convert_to_markdown(cls, body: TextHandler) -> str:
580
  """Convert HTML content to Markdown"""
581
  from markdownify import markdownify
582
 
583
  return markdownify(body)
584
 
585
+ @classmethod
586
+ def _extract_content(
587
+ cls,
588
+ page: Adaptor,
589
+ extraction_type: extraction_types = "markdown",
590
+ css_selector: Optional[str] = None,
591
+ main_content_only: bool = False,
592
+ ) -> str:
593
+ """Extract the content of an Adaptor"""
594
+ if not page or not isinstance(page, Adaptor):
595
+ raise TypeError("Input must be of type `Adaptor`")
596
+ elif not extraction_type or extraction_type not in cls._extension_map.values():
597
+ raise ValueError(f"Unknown extraction type: {extraction_type}")
598
+ else:
599
+ if main_content_only:
600
+ page = page.css_first("body") or page
601
+
602
+ page = page if not css_selector else page.css_first(css_selector)
603
+ match extraction_type:
604
+ case "markdown":
605
+ return cls._convert_to_markdown(page.body)
606
+ case "html":
607
+ return page.body
608
+ case "text":
609
+ txt_content = page.get_all_text(strip=True)
610
+ for s in (
611
+ "\n",
612
+ "\r",
613
+ "\t",
614
+ " ",
615
+ ):
616
+ # Remove consecutive white-spaces
617
+ txt_content = re_sub(f"[{s}]+", s, txt_content)
618
+ return txt_content
619
+ return ""
620
+
621
  @classmethod
622
  def write_content_to_file(
623
  cls, page: Adaptor, filename: str, css_selector: Optional[str] = None
 
632
  "Unknown file type: filename must end with '.md', '.html', or '.txt'"
633
  )
634
  else:
 
635
  with open(filename, "w", encoding="utf-8") as f:
636
+ extension = filename.split(".")[-1]
637
+ f.write(
638
+ cls._extract_content(
639
+ page, cls._extension_map[extension], css_selector=css_selector
640
+ )
641
+ )