# File size: 5,914 Bytes

import html
import os
import textwrap
from pathlib import Path
from typing import Callable, Dict, Any

import fitz
import markdown
from bs4 import BeautifulSoup
from docx import Document


class DocumentConverter:

    def convert(self, input_path:str, output_path:str,

                options:Dict[str,Any]|None=None,

                progress_callback:Callable|None=None)->bool:

        try:
            input_ext = Path(input_path).suffix.lower()
            output_ext = Path(output_path).suffix.lower()

            self._update(progress_callback, 10)

            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            success = False

            # TXT
            if input_ext == ".txt" and output_ext == ".pdf":
                success = self.txt_to_pdf(input_path, output_path)

            elif input_ext == ".txt" and output_ext == ".html":
                success = self.txt_to_html(input_path, output_path)

            elif input_ext == ".txt" and output_ext == ".md":
                success = self.txt_to_md(input_path, output_path)

            # MD
            elif input_ext == ".md" and output_ext == ".html":
                success = self.md_to_html(input_path, output_path)

            elif input_ext == ".md" and output_ext == ".txt":
                success = self.md_to_text(input_path, output_path)

            # HTML
            elif input_ext == ".html" and output_ext == ".txt":
                success = self.html_to_text(input_path, output_path)

            elif input_ext == ".html" and output_ext == ".md":
                success = self.html_to_md(input_path, output_path)

            # DOCX
            elif input_ext == ".docx" and output_ext == ".txt":
                success = self.docx_to_text(input_path, output_path)

            # PDF
            elif input_ext == ".pdf" and output_ext == ".txt":
                success = self.pdf_to_text(input_path, output_path)

            elif input_ext == ".pdf" and output_ext == ".html":
                success = self.pdf_to_html(input_path, output_path)

            else:
                raise ValueError(f"Unsupported conversion: {input_ext} -> {output_ext}")

            self._update(progress_callback, 100)

            return success

        except Exception as e:
            print(f"Document conversion error: {e}")
            return False

    def txt_to_pdf(self, input_path, output_path):
        pdf = fitz.open()
        page = pdf.new_page()

        text = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        page.insert_text((72, 72), text[:5000])

        pdf.save(output_path)
        return True

    def txt_to_html(self, input_path, output_path):
        text = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        html = f"<html><body><pre>{text}</pre></body></html>"

        Path(output_path).write_text(
            html,
            encoding="utf-8"
        )

        return True

    def txt_to_md(self, input_path, output_path):
        text = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        Path(output_path).write_text(
            text,
            encoding="utf-8"
        )

        return True

    def md_to_html(self, input_path, output_path):
        md = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        html = markdown.markdown(md)

        Path(output_path).write_text(
            html,
            encoding="utf-8"
        )

        return True

    def md_to_text(self, input_path, output_path):
        md = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        html = markdown.markdown(md)
        soup = BeautifulSoup(html, "html.parser")

        Path(output_path).write_text(
            soup.get_text(),
            encoding="utf-8"
        )

        return True

    def html_to_text(self, input_path, output_path):
        html = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        soup = BeautifulSoup(html, "html.parser")

        Path(output_path).write_text(
            soup.get_text(),
            encoding="utf-8"
        )

        return True

    def html_to_md(self, input_path, output_path):
        html = Path(input_path).read_text(
            encoding="utf-8",
            errors="ignore"
        )

        soup = BeautifulSoup(html, "html.parser")

        text = soup.get_text()

        Path(output_path).write_text(
            text,
            encoding="utf-8"
        )

        return True

    def docx_to_text(self, input_path, output_path):
        doc = Document(input_path)

        text = "\n".join(
            [p.text for p in doc.paragraphs]
        )

        Path(output_path).write_text(
            text,
            encoding="utf-8"
        )

        return True

    def pdf_to_text(self, input_path, output_path):
        doc = fitz.open(input_path)

        text = ""

        for page in doc:
            text += page.get_text()

        Path(output_path).write_text(
            text,
            encoding="utf-8"
        )

        return True

    def pdf_to_html(self, input_path, output_path):
        doc = fitz.open(input_path)

        html = ""

        for page in doc:
            html += page.get_text("html")

        Path(output_path).write_text(
            html,
            encoding="utf-8"
        )

        return True

    def _update(self, callback, value):
        try:
            if callback:
                callback(value)
        except:
            pass