| import argparse
|
| import os
|
| import requests
|
| from bs4 import BeautifulSoup
|
| from urllib.parse import urljoin
|
| from html2text import html2text
|
| from pathlib import Path
|
|
|
def is_writable_path(target_path):
    """
    Validate that a file can be created or overwritten at ``target_path``.

    Designed to be used as an ``argparse`` ``type=`` callable: the argument
    is returned unchanged on success, and ``argparse.ArgumentTypeError`` is
    raised otherwise so argparse can report a usage error.

    Args:
        target_path: Path of the file that is going to be written.

    Returns:
        ``target_path``, unchanged.

    Raises:
        argparse.ArgumentTypeError: If the parent directory does not exist
            or is not writable, if ``target_path`` is itself an existing
            directory, or if it is an existing file that is not writable.
    """
    # A bare filename has an empty dirname; Path("") normalises to ".",
    # so the current working directory is the one that gets checked.
    path = Path(os.path.dirname(target_path))

    if not path.is_dir():
        raise argparse.ArgumentTypeError(f"Directory '{path}' does not exist.")
    if not os.access(path, os.W_OK):
        raise argparse.ArgumentTypeError(f"Directory '{path}' is not writable.")

    # A writable parent is not sufficient: opening a directory for writing
    # fails, and so does overwriting an existing read-only file.
    if os.path.isdir(target_path):
        raise argparse.ArgumentTypeError(
            f"'{target_path}' is a directory, not a file."
        )
    if os.path.exists(target_path) and not os.access(target_path, os.W_OK):
        raise argparse.ArgumentTypeError(f"File '{target_path}' is not writable.")

    return target_path
|
|
|
def _download_image(session, image_url, image_dir, timeout):
    """Save the image at ``image_url`` into ``image_dir``; return its path, or None if the URL has no file name."""
    from urllib.parse import urlsplit

    # Derive the name from the URL *path* only, so a query string or
    # fragment ("pic.png?v=3#x") does not end up in the file name.
    file_name = os.path.basename(urlsplit(image_url).path)
    if not file_name:
        return None

    destination = os.path.join(image_dir, file_name)

    # No stream=True: the whole body is written at once anyway, and a
    # non-streamed response releases its connection even when
    # raise_for_status() fails.
    image_response = session.get(image_url, timeout=timeout)
    image_response.raise_for_status()

    with open(destination, 'wb') as file:
        file.write(image_response.content)
    return destination


def main(url, markdown_path, *, image_dir="./logs", timeout=30):
    """
    Fetch a web page, save its images, and write the page as Markdown.

    The page at ``url`` is downloaded, every ``<img>`` that has a non-empty
    ``src`` is downloaded into ``image_dir`` (best effort: a failure is
    printed and the remaining images are still processed), and the page
    HTML is converted with ``html2text`` and written to ``markdown_path``.

    Args:
        url: Address of the page to convert.
        markdown_path: File the Markdown output is written to (UTF-8).
        image_dir: Directory that receives the downloaded images; created
            if missing. Defaults to ``"./logs"``.
        timeout: Timeout in seconds passed to every HTTP request, so a
            stalled server cannot hang the script indefinitely.

    Raises:
        requests.RequestException: If the page itself cannot be fetched
            (connection error, timeout, or HTTP error status).
        OSError: If ``image_dir`` cannot be created.
    """
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        os.makedirs(image_dir, exist_ok=True)

        for image in soup.find_all('img'):
            # <img> tags without a src (or with an empty one) carry nothing
            # to download; indexing image['src'] directly would raise KeyError.
            src = image.get('src')
            if not src:
                continue

            image_url = urljoin(url, src)
            try:
                saved_to = _download_image(session, image_url, image_dir, timeout)
            except (requests.RequestException, OSError) as e:
                # OSError covers local failures (e.g. unwritable target) so
                # one bad image does not abort the whole conversion.
                print(f"Failed to download {image_url}: {e}")
            else:
                if saved_to is None:
                    print(f"Skipping {image_url}: no file name in URL")

    # response.text is already in memory, so the session is no longer needed.
    markdown_content = html2text(response.text)

    try:
        with open(markdown_path, "w", encoding="utf8") as file:
            file.write(markdown_content)
    except (OSError, UnicodeError) as e:
        print(f"Failed to write markdown to {markdown_path}: {e}")
    else:
        print(f"Markdown content successfully written to {markdown_path}")
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the page URL and the output file,
    # then hand both to main(). The output path is validated up front by
    # is_writable_path so an unusable destination is reported as a usage
    # error before any network request is made.
    cli = argparse.ArgumentParser(description="Convert HTML to Markdown")
    cli.add_argument(
        "url",
        help="The URL of the webpage to convert",
    )
    cli.add_argument(
        "markdown_path",
        type=is_writable_path,
        help="The path to save the converted markdown file",
    )
    options = cli.parse_args()

    main(options.url, options.markdown_path)
|
|
|