Spaces:

ABCCCYYY
/

kohya_ss

Build error

App Files Files Community

kohya_ss / tools /convert_html_to_md.py

ABCCCYYY

Upload folder using huggingface_hub

cff1674 verified over 1 year ago

raw

history blame contribute delete

2.51 kB

	import argparse
	import os
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin
	from html2text import html2text
	from pathlib import Path

	def is_writable_path(target_path):
	"""
	Check if a path is writable.
	"""
	path = Path(os.path.dirname(target_path))
	if path.is_dir():
	if os.access(path, os.W_OK):
	return target_path
	else:
	raise argparse.ArgumentTypeError(f"Directory '{path}' is not writable.")
	else:
	raise argparse.ArgumentTypeError(f"Directory '{path}' does not exist.")

	def main(url, markdown_path):
	# Create a session object
	with requests.Session() as session:
	# Send HTTP request to the specified URL
	response = session.get(url)
	response.raise_for_status() # Check for HTTP issues

	# Create a BeautifulSoup object and specify the parser
	soup = BeautifulSoup(response.text, 'html.parser')

	# Ensure the directory for saving images exists
	os.makedirs("./logs", exist_ok=True)

	# Find all image tags and save images
	for image in soup.find_all('img'):
	image_url = urljoin(url, image['src'])
	try:
	image_response = session.get(image_url, stream=True)
	image_response.raise_for_status()
	image_name = os.path.join("./logs", os.path.basename(image_url))
	with open(image_name, 'wb') as file:
	file.write(image_response.content)
	except requests.RequestException as e:
	print(f"Failed to download {image_url}: {e}")

	# Convert the HTML content to markdown
	markdown_content = html2text(response.text)

	# Save the markdown content to a file
	try:
	with open(markdown_path, "w", encoding="utf8") as file:
	file.write(markdown_content)
	print(f"Markdown content successfully written to {markdown_path}")
	except Exception as e:
	print(f"Failed to write markdown to {markdown_path}: {e}")

	if __name__ == "__main__":
	parser = argparse.ArgumentParser(description="Convert HTML to Markdown")
	parser.add_argument("url", help="The URL of the webpage to convert")
	parser.add_argument("markdown_path", help="The path to save the converted markdown file", type=is_writable_path)
	args = parser.parse_args()

	main(args.url, args.markdown_path)