Spaces:

Ansemin101
/

Markit

Paused

App Files Files Community

Markit / src /parsers /pypdfium_parser.py

AnseMin

file structure change

a370b95 about 1 year ago

raw

history blame contribute delete

2.75 kB

	from pathlib import Path
	from typing import Dict, List, Optional, Any, Union
	import json
	import pypdfium2 as pdfium

	from src.parsers.parser_interface import DocumentParser
	from src.parsers.parser_registry import ParserRegistry
	from docling.document_converter import DocumentConverter, PdfFormatOption
	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.pipeline_options import PdfPipelineOptions
	from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend


	class PyPdfiumParser(DocumentParser):
	"""Parser implementation using PyPdfium."""

	@classmethod
	def get_name(cls) -> str:
	return "PyPdfium"

	@classmethod
	def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
	return [
	{
	"id": "no_ocr",
	"name": "No OCR",
	"default_params": {}
	},
	{
	"id": "easyocr",
	"name": "EasyOCR",
	"default_params": {"languages": ["en"]}
	}
	]

	def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
	"""Parse a document using PyPdfium."""
	pipeline_options = PdfPipelineOptions()
	pipeline_options.do_table_structure = True
	pipeline_options.table_structure_options.do_cell_matching = True

	# Configure OCR based on the method
	if ocr_method == "easyocr":
	pipeline_options.do_ocr = True
	# Apply any custom parameters from kwargs
	if "languages" in kwargs:
	pipeline_options.ocr_options.lang = kwargs["languages"]
	else:
	pipeline_options.do_ocr = False

	# Create the converter
	converter = DocumentConverter(
	format_options={
	InputFormat.PDF: PdfFormatOption(
	pipeline_options=pipeline_options,
	backend=PyPdfiumDocumentBackend
	)
	}
	)

	# Convert the document
	result = converter.convert(Path(file_path))
	doc = result.document

	# Return the content in the requested format
	output_format = kwargs.get("output_format", "markdown")
	if output_format.lower() == "json":
	return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
	elif output_format.lower() == "text":
	return doc.export_to_text()
	elif output_format.lower() == "document_tags":
	return doc.export_to_document_tokens()
	else:
	return doc.export_to_markdown()


	# Register the parser with the registry
	ParserRegistry.register(PyPdfiumParser)