import logging
import os
import re
from functools import partial, reduce
from itertools import chain
from typing import Generator, List, Optional, Set
from uuid import uuid4

from docling.document_converter import DocumentConverter
from PyPDF2 import PdfReader, PdfWriter

logger = logging.getLogger(__name__)

tag_list = ["Sources:", "Source:", "Tags-", "Tags:", "CONTENTS", "ANNEX", "EXERCISES", "Project/Activity"]

try:
    converter = DocumentConverter()
except Exception as e:
    logger.error(f"Error initializing Docling DocumentConverter: {e}")
    raise  # the rest of the module cannot work without a converter

def split_pdf(input_pdf, output_pdf, start_page, end_page):
    """Write pages start_page..end_page (0-based, inclusive) of input_pdf to output_pdf."""
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    for i in range(start_page, end_page + 1):
        writer.add_page(reader.pages[i])
    with open(output_pdf, "wb") as output_file:
        writer.write(output_file)
    logger.info(f"PDF split successfully: {output_pdf}")

def get_texts(res):
    """Collect the text items of each page into a single string, keyed by page number."""
    page_texts = {pg: "" for pg in res['pages'].keys()}
    for item in res.get('texts', []):
        for prov in item['prov']:
            text = item['text']
            page_key = f"{prov['page_no']}"
            # Append with a separating space only if the page already has text.
            if page_texts.get(page_key):
                page_texts[page_key] += ' ' + text
            else:
                page_texts[page_key] = text
    return page_texts
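
# Example with a minimal docling-style dict (shape inferred from the code above):
# get_texts({"pages": {"1": {}},
#            "texts": [{"text": "Hello", "prov": [{"page_no": 1}]},
#                      {"text": "world", "prov": [{"page_no": 1}]}]})
# -> {"1": "Hello world"}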

def clean_the_text(text):
    """
    Cleans the extracted text by removing unnecessary characters and formatting issues.
    Args:
        text (str): The extracted text.
    Returns:
        str: The cleaned text.
    """
    try:
        text = re.sub(r'\n\s*\n', '\n', text)  # collapse runs of blank lines
        text = text.replace("\t", " ")
        text = text.replace("\f", " ")
        text = re.sub(r'\b(\w+\s*)\1{1,}', r'\1', text)  # drop immediately repeated words
        text = re.sub(r'[^a-zA-Z0-9\s@\-/,.\\]', ' ', text)  # replace uncommon symbols with spaces
        return text.strip()
    except Exception as e:
        logger.error(f"Error cleaning text: {e}")
        return text
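
# e.g. clean_the_text("Intro\n\n\nBody\ttext!!") -> "Intro\nBody text"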

def get_tables(res_json):
    """Collect the table grids of each page, keyed by page number."""
    page_tables = {pg: [] for pg in res_json['pages'].keys()}
    try:
        tables = res_json.get('tables', [])
        if not isinstance(tables, list):
            raise ValueError("Expected 'tables' to be a list.")
        for table in tables:
            try:
                # Ensure 'prov' exists and has the necessary structure
                prov = table.get('prov', [])
                if not prov or not isinstance(prov, list):
                    raise ValueError("Missing or invalid 'prov' structure in table.")
                page_no = prov[0].get('page_no')
                if page_no is None:
                    raise ValueError("Missing or invalid 'page_no' in 'prov'.")
                # Ensure 'data' and 'grid' exist
                data = table.get('data', {})
                grid = data.get('grid', [])
                if not isinstance(grid, list):
                    raise ValueError("Missing or invalid 'grid' structure in 'data'.")
                # Append the table grid to its page
                page_tables[f'{page_no}'].append(grid)
            except Exception as table_error:
                logger.error(f"Error processing table: {table_error}")
    except Exception as e:
        logger.error(f"Error processing tables: {e}")
    return page_tables
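
# Example return shape (one grid of cell dicts on page 1; shape inferred from the code above):
# {"1": [[[{"text": "Name"}, {"text": "Qty"}], [{"text": "Apples"}, {"text": "3"}]]], "2": []}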

def table_to_text_or_json(table, rtrn_type="text"):
    """
    Converts a table grid to a single comma-separated string.
    Args:
        table (list): The table grid (a list of rows of cell dicts) to convert.
        rtrn_type (str): The return type. Only "text" is currently implemented;
            any other value also falls back to text.
    Returns:
        str: The table rendered as text.
    """
    table_text = "Here is a Table : \n"
    for row in table:
        for col in row:
            val = col.get('text')
            table_text += f'{val} ,'
        table_text += '\n'
    return table_text
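
# e.g. table_to_text_or_json([[{"text": "Name"}, {"text": "Qty"}], [{"text": "Apples"}, {"text": "3"}]])
# -> "Here is a Table : \nName ,Qty ,\nApples ,3 ,\n"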

def clean_file_name(text: str):
    """
    Cleans the file name by removing any special characters.
    Args:
        text (str): The original file name.
    Returns:
        str: The cleaned file name.
    """
    try:
        text = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', text)
        return text
    except Exception as e:
        logger.error(f"Error cleaning file name: {e}")
        return text
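
# e.g. clean_file_name("Q3_report(final).pdf") -> "Q3 report final .pdf"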

def find_and_remove_header_footer(
    text: str, n_chars: int, n_first_pages_to_ignore: int, n_last_pages_to_ignore: int
) -> str:
    """
    Heuristic to find footers and headers across different pages by searching for the longest common string.
    For headers we only search in the first n_chars characters (for footer: last n_chars).
    Note: This heuristic uses exact matches and therefore works well for footers like "Copyright 2019 by XXX",
    but won't detect "Page 3 of 4" or similar.
    :param text: pages as a single string, separated by form feeds ("\\f")
    :param n_chars: number of first/last characters where the header/footer shall be searched in
    :param n_first_pages_to_ignore: number of first pages to ignore (e.g. TOCs often don't contain footer/header)
    :param n_last_pages_to_ignore: number of last pages to ignore
    :return: the text with the detected header and footer removed from every page
    """
    pages = text.split("\f")
    # Use an explicit end index: "pages[a:-0]" would be an empty slice when
    # n_last_pages_to_ignore is 0, so compute the end position instead.
    last = len(pages) - n_last_pages_to_ignore
    # header
    start_of_pages = [p[:n_chars] for p in pages[n_first_pages_to_ignore:last]]
    found_header = find_longest_common_ngram(start_of_pages)
    if found_header:
        pages = [page.replace(found_header, "") for page in pages]
    # footer
    end_of_pages = [p[-n_chars:] for p in pages[n_first_pages_to_ignore:last]]
    found_footer = find_longest_common_ngram(end_of_pages)
    if found_footer:
        pages = [page.replace(found_footer, "") for page in pages]
    logger.debug(f"Removed header '{found_header}' and footer '{found_footer}' in document")
    text = "\f".join(pages)
    return text
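
# e.g. find_and_remove_header_footer("Annual Report 2019\nAlpha\fAnnual Report 2019\nBeta",
#                                    n_chars=18, n_first_pages_to_ignore=0, n_last_pages_to_ignore=0)
# -> "\nAlpha\f\nBeta"  (the shared header "Annual Report 2019" is stripped from both pages)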

def ngram(seq: str, n: int) -> Generator[str, None, None]:
    """
    Return ngrams (of tokens - currently split by whitespace).
    :param seq: str, string from which the ngrams shall be created
    :param n: int, n of ngram
    :return: generator of ngram strings
    """
    # In order to maintain the original whitespace, but still consider \n and \t for n-gram tokenization,
    # we add a space here and remove it after creation of the ngrams again (see below)
    seq = seq.replace("\n", " \n")
    seq = seq.replace("\t", " \t")
    words = seq.split(" ")
    ngrams = (
        " ".join(words[i : i + n]).replace(" \n", "\n").replace(" \t", "\t") for i in range(0, len(words) - n + 1)
    )
    return ngrams
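
# e.g. list(ngram("a b c d", 2)) -> ['a b', 'b c', 'c d']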

def allngram(seq: str, min_ngram: int, max_ngram: int) -> Set[str]:
    """Return the set of all ngrams of seq with lengths in [min_ngram, max_ngram)."""
    lengths = range(min_ngram, max_ngram) if max_ngram else range(min_ngram, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    res = set(chain.from_iterable(ngrams))
    return res
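
# e.g. allngram("a b c", min_ngram=1, max_ngram=3) -> {'a', 'b', 'c', 'a b', 'b c'}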

def find_longest_common_ngram(
    sequences: List[str], max_ngram: int = 30, min_ngram: int = 3
) -> Optional[str]:
    """
    Find the longest common ngram across different text sequences (e.g. start of pages).
    Considers all ngrams between the specified range. Helpful for finding footers, headers etc.
    :param sequences: list[str], list of strings that shall be searched for common n_grams
    :param max_ngram: int, maximum length of ngram to consider
    :param min_ngram: minimum length of ngram to consider
    :return: the longest ngram shared by all sequences, or None if there is none
    """
    sequences = [s for s in sequences if s]  # filter empty sequences
    if not sequences:
        return None
    seqs_ngrams = map(partial(allngram, min_ngram=min_ngram, max_ngram=max_ngram), sequences)
    intersection = reduce(set.intersection, seqs_ngrams)
    try:
        longest = max(intersection, key=len)
    except ValueError:
        # no common sequence found
        longest = ""
    return longest if longest.strip() else None
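
# e.g. find_longest_common_ngram(["Copyright 2019 Acme Corp page 1", "Copyright 2019 Acme Corp page 2"])
# -> "Copyright 2019 Acme Corp page"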

class PdfToSectionConverter():
    def __init__(self):
        """
        Initializes the PdfToSectionConverter class.
        """
        pass

    def convert(self, downloaded_pdf_path: str, file_title: str, doc_id: str = None, start_page_no: int = 0,
                end_page_no: int = 0):
        """
        Converts a PDF document to sections with metadata.
        Args:
            downloaded_pdf_path (str): Path to the downloaded PDF file.
            file_title (str): The title of the file.
            doc_id (str, optional): The document ID. Defaults to None.
            start_page_no (int, optional): The starting page number (0-based). Defaults to 0.
            end_page_no (int, optional): The ending page number (0-based, inclusive). Defaults to 0.
        Returns:
            list: A list of dictionaries containing sections and metadata.
        """
        try:
            logger.info(f"Splitting pdf from page {start_page_no + 1} to {end_page_no + 1}")
            output_path = "/tmp/splitted.pdf"
            split_pdf(downloaded_pdf_path, output_path, start_page_no, end_page_no)
            logger.info("OCR started ...")
            result = converter.convert(output_path)
            json_objects = result.document.export_to_dict()
            pages = list(json_objects['pages'].keys())
            texts = get_texts(json_objects)
            tables = get_tables(json_objects)
        except Exception as e:
            logger.error(f"Error getting JSON result from parser: {e}")
            return []
        output_doc_lst = []
        page_no = start_page_no
        try:
            for page in pages:
                if page_no > end_page_no:
                    break
                page_no += 1
                logger.info(f"Page number to be processed: {page_no}")
                meta = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": []}
                meta_table = {"doc_id": doc_id, "page_no": page_no, "img_count": 0, "img_lst": "[]"}
                # Extract and clean the text of the page
                text_to_append = clean_the_text(texts[page])
                # Render any tables detected on the page as text
                tables_to_append = tables[page]
                if tables_to_append:
                    tables_to_append = [table_to_text_or_json(table=i, rtrn_type="text") for i in tables_to_append]
                # Add the processed section to the output list
                output_doc_lst.append(
                    {"doc_id": doc_id, "text": text_to_append, "vector_id": str(uuid4()),
                     "meta": meta, "content_type": 'text'})
                for table in tables_to_append:
                    output_doc_lst.append(
                        {"doc_id": doc_id, "text": table, "vector_id": str(uuid4()),
                         "meta": meta_table, "content_type": 'table'})
            # Post-process all sections at once to remove headers and footers
            text_to_append_list = "\f".join([i['text'] for i in output_doc_lst])
            text_to_append_list = find_and_remove_header_footer(text=text_to_append_list, n_chars=10,
                                                                n_first_pages_to_ignore=0,
                                                                n_last_pages_to_ignore=0).split("\f")
            for i in range(len(output_doc_lst)):
                output_doc_lst[i]['text'] = clean_file_name(file_title) + "\n" + text_to_append_list[i]
        except Exception as e:
            logger.error(f"Error converting PDF to sections: {e}")
        return output_doc_lst
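
# Minimal usage sketch; "sample.pdf", the title, and the doc id below are
# placeholders, not assets shipped with this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    sections = PdfToSectionConverter().convert(
        downloaded_pdf_path="sample.pdf", file_title="Sample Report",
        doc_id="doc-001", start_page_no=0, end_page_no=4)
    for section in sections:
        print(section["meta"]["page_no"], section["content_type"], section["text"][:80])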