File size: 1,263 Bytes
067cdc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
import os
import pymupdf.layout
import pymupdf4llm
from pathlib import Path
import glob
# Force TOKENIZERS_PARALLELISM to "false" for this process (and any children).
# NOTE(review): presumably read by a tokenizers library pulled in by the
# layout/LLM dependencies to disable its thread pool -- confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})
def _markdown_path(pdf_path, output_dir):
    """Return the ``.md`` path inside *output_dir* that belongs to *pdf_path*."""
    # Append ".md" to the complete stem rather than calling ``with_suffix``:
    # for "report.v2.pdf" the stem is "report.v2", and ``with_suffix(".md")``
    # would replace ".v2", producing "report.md" and colliding with the
    # output of "report.pdf".
    return Path(output_dir) / f"{Path(pdf_path).stem}.md"


def pdf_to_markdown(pdf_path, output_dir):
    """Convert a single PDF to Markdown and write it into *output_dir*.

    Args:
        pdf_path: Path (``str`` or ``Path``) of the PDF file to convert.
        output_dir: Directory that receives ``<pdf stem>.md``. It is created
            (including parents) when it does not exist yet.

    Returns:
        The ``Path`` of the Markdown file that was written.
    """
    doc = pymupdf.open(pdf_path)
    try:
        md = pymupdf4llm.to_markdown(
            doc,
            header=False,
            footer=False,
            page_separators=True,
            ignore_images=True,
            write_images=False,
            image_path=None,
        )
    finally:
        # Release the file handle even when the conversion fails; the guard
        # avoids a second close in case the converter already closed it.
        if not doc.is_closed:
            doc.close()

    # Round-trip through bytes to drop lone surrogates, which cannot be
    # written as valid UTF-8.
    md_cleaned = md.encode('utf-8', errors='surrogatepass').decode('utf-8', errors='ignore')

    output_path = _markdown_path(pdf_path, output_dir)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Bytes are written directly so no platform newline translation happens.
    output_path.write_bytes(md_cleaned.encode('utf-8'))
    return output_path


def pdfs_to_markdowns(path_pattern, overwrite: bool = False, output_dir='./docs/markdowns'):
    """Convert every PDF matching *path_pattern* to Markdown.

    Args:
        path_pattern: Glob pattern (as accepted by ``glob.glob``) selecting
            the PDF files to convert.
        overwrite: When false, PDFs whose Markdown file already exists in
            *output_dir* are skipped.
        output_dir: Directory receiving the Markdown files; created if
            missing. Defaults to ``./docs/markdowns``.

    Returns:
        A list with the paths of the Markdown files written by this call.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    written = []
    for pdf_path in map(Path, glob.glob(path_pattern)):
        # Same naming helper as ``pdf_to_markdown`` so the existence check
        # always looks at the file that would actually be written.
        md_path = _markdown_path(pdf_path, output_dir)
        if overwrite or not md_path.exists():
            written.append(pdf_to_markdown(pdf_path, output_dir))
    return written
if __name__ == "__main__":
    # Script entry point: convert every PDF found directly inside ./docs/pdf
    # (no recursion), regardless of the case of its ".pdf" extension.
    source_dir = Path('./docs/pdf')
    for entry_name in os.listdir(source_dir):
        candidate = source_dir / entry_name
        if candidate.suffix.lower() != '.pdf':
            continue  # skip anything that is not a PDF
        print(f"Processing file: {candidate}")
        pdf_to_markdown(candidate, './docs/markdowns')