pragmatic-agent / modules /data /page_crawler.py
m97j's picture
First codes update
69c12a2
# app/modules/data/page_crawler.py
import markdownify
import requests
from bs4 import BeautifulSoup
from logs.logger import get_logger
logger = get_logger(__name__)
def fetch_page(url: str, timeout: int = 10) -> str | None:
    """
    Retrieve the HTML of a page with an HTTP GET request.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded response body, or ``None`` if the request failed,
        the server answered with an error status, or any other exception
        was raised while fetching (the failure is logged, not re-raised).
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        )
    }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()

        # When a text response declares no charset, requests falls back to
        # ISO-8859-1, which garbles UTF-8 (and other non-Latin-1) pages.
        # Use the encoding detected from the body in that case.
        content_type = resp.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            resp.encoding = resp.apparent_encoding

        html = resp.text
        logger.info(f"[crawler] Successfully fetched page: {url}, length={len(html)}")
        return html
    except Exception as e:
        # Best-effort fetch: report the failure and let the caller handle None.
        logger.error(f"[crawler] fetch_page failed for URL: {url} with error: {e}")
        logger.debug("Traceback: ", exc_info=True)
        return None
def extract_main_markdown(html: str) -> str:
"""
Extract body text from HTML and convert to Markdown
"""
try:
soup = BeautifulSoup(html, "html.parser")
# Remove unnecessary tags
for tag in soup(["script", "style", "header", "footer", "nav", "aside"]):
tag.decompose()
body = soup.body or soup
# Convert to markdown
md_text = markdownify.markdownify(str(body), strip=['a'])
return md_text
except Exception as e:
logger.error(f"[crawler] extract_main_text failed: {e}")
logger.debug("Traceback: ", exc_info=True)
return ""