"""Convert PDF files to Markdown files using PyMuPDF4LLM."""

import glob
import os
from pathlib import Path

import pymupdf.layout
import pymupdf4llm

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def _markdown_path(output_dir, pdf_path) -> Path:
    """Return the ``.md`` path inside *output_dir* that corresponds to *pdf_path*."""
    # Append ".md" to the full stem instead of calling ``with_suffix``:
    # ``Path("report.v2").with_suffix(".md")`` yields "report.md", which drops
    # part of the name and lets different PDFs collide on the same output file.
    return Path(output_dir) / f"{Path(pdf_path).stem}.md"


def pdf_to_markdown(pdf_path, output_dir) -> Path:
    """Convert a single PDF to Markdown and write it into *output_dir*.

    The output file is named after the PDF's stem with a ``.md`` extension.
    *output_dir* is created if it does not exist yet.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the Markdown file.

    Returns:
        The path of the Markdown file that was written.
    """
    # The context manager closes the document even if the conversion raises.
    with pymupdf.open(pdf_path) as doc:
        md = pymupdf4llm.to_markdown(
            doc,
            header=False,
            footer=False,
            page_separators=True,
            ignore_images=True,
            write_images=False,
            image_path=None,
        )

    # Unpaired surrogates cannot be encoded as UTF-8; drop them so that the
    # write never fails on malformed text coming out of the extraction.
    md_bytes = md.encode("utf-8", errors="ignore")

    output_path = _markdown_path(output_dir, pdf_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_bytes(md_bytes)
    return output_path


def pdfs_to_markdowns(
    path_pattern,
    overwrite: bool = False,
    output_dir="./docs/markdowns",
):
    """Convert every PDF matching a glob pattern to Markdown.

    Args:
        path_pattern: Glob pattern selecting the PDF files to convert.
        overwrite: When false, PDFs whose Markdown file already exists in
            *output_dir* are skipped.
        output_dir: Directory that receives the Markdown files; created if
            missing.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for pdf_path in map(Path, glob.glob(path_pattern)):
        if not overwrite and _markdown_path(output_dir, pdf_path).exists():
            continue
        pdf_to_markdown(pdf_path, output_dir)


if __name__ == "__main__":
    pdf_folder = Path("./docs/pdf")
    # Sorted so that the processing order is the same on every run.
    for file_path in sorted(pdf_folder.iterdir()):
        if file_path.suffix.lower() == ".pdf":
            print(f"Processing file: {file_path}")
            pdf_to_markdown(file_path, "./docs/markdowns")