# Spaces: Sleeping (page-status residue captured with the file; not part of the program)
| import argparse | |
| import os | |
| from pathlib import Path | |
| import trafilatura | |
def convert_html_to_markdown(input_dir: str, output_dir: str) -> None:
    """
    Convert all HTML files under the input directory to Markdown files.

    The input tree is walked recursively and its directory structure is
    mirrored under the output directory (empty directories included). Each
    ``*.html`` / ``*.htm`` file (extension matched case-insensitively) is run
    through ``trafilatura.extract`` and written as ``<name>.md``, prefixed with
    an HTML comment recording the file's path relative to the input directory.

    Failures are handled per file: a file that cannot be read or converted is
    reported on stdout and the walk continues with the next file.

    Args:
        input_dir (str): Path to the input directory containing HTML files
        output_dir (str): Path to the output directory where Markdown files
            will be saved. It may be located inside ``input_dir``; in that
            case it is excluded from the walk.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Create output directory if it doesn't exist
    output_path.mkdir(parents=True, exist_ok=True)

    # Resolved once so the nested-output check below is not fooled by
    # relative paths or symlinked parents.
    output_resolved = output_path.resolve()

    for root, dirs, files in os.walk(input_path):
        root_path = Path(root)

        # Prune the output directory from the walk (os.walk honours in-place
        # edits of ``dirs`` when walking top-down). Without this, an output
        # directory nested inside the input tree would be mirrored into
        # itself on every run.
        dirs[:] = [d for d in dirs if (root_path / d).resolve() != output_resolved]

        # Mirror the current directory under the output root
        rel_path = root_path.relative_to(input_path)
        current_output_dir = output_path / rel_path
        current_output_dir.mkdir(parents=True, exist_ok=True)

        for file in files:
            # Case-insensitive so that e.g. "INDEX.HTML" is not skipped
            if not file.lower().endswith((".html", ".htm")):
                continue

            input_file = root_path / file
            output_file = current_output_dir / f"{file.rsplit('.', 1)[0]}.md"

            # Deliberately broad: this is the per-file boundary, and one bad
            # file (unreadable, undecodable, extractor failure) must not
            # abort the whole batch.
            try:
                with open(input_file, "r", encoding="utf-8") as f:
                    html_content = f.read()

                markdown_content = trafilatura.extract(
                    html_content, output_format="markdown"
                )

                # A falsy result means nothing could be extracted
                if not markdown_content:
                    print(f"Warning: Could not extract content from {input_file}")
                    continue

                # Record where the page came from, always with forward slashes
                relative_path = str(rel_path / file).replace("\\", "/")
                path_header = f"<!-- Original URL path: {relative_path} -->\n\n"

                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(path_header + markdown_content)
                print(f"Converted: {input_file} -> {output_file}")
            except Exception as e:
                print(f"Error processing {input_file}: {str(e)}")
def main():
    """Command-line entry point.

    Reads two positional arguments, ``input_dir`` and ``output_dir``, from the
    command line and hands them to ``convert_html_to_markdown``.
    """
    cli = argparse.ArgumentParser(
        description="Convert HTML files to Markdown using trafilatura"
    )
    # Both arguments are plain positionals; declare them from one table.
    positionals = (
        ("input_dir", "Input directory containing HTML files"),
        ("output_dir", "Output directory for Markdown files"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)

    options = cli.parse_args()
    convert_html_to_markdown(options.input_dir, options.output_dir)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()