# Investor-API / utils / PdfUtils.py
# Last commit: c926830 — "id error resolved" (ashishbangwal)
import numpy
from PIL import Image
from typing import List, Tuple
import pymupdf
from .ChartClassifier import Classifier
from .HelperFunctions import CountTokens
from .ModelCallingFunctions import image_data_extractor
def extract_image_content(pixmap_list: List[pymupdf.Pixmap], text: str) -> List[str]:
    """Extract textual information from graph-like images.

    Converts each pixmap to a PIL image, runs the ONNX chart classifier over
    the batch, and sends only the images classified as graphs to the model
    for data extraction.

    Args:
        pixmap_list: Rendered page images. Assumed to carry RGB samples —
            TODO confirm upstream always produces RGB pixmaps; non-decodable
            ones are skipped with a logged message.
        text: Page text passed to the extractor as context for each image.

    Returns:
        One extracted-content string per image classified as a graph.
    """
    # Start a fresh classifier inference session for this call.
    classifier = Classifier("utils/graph_classifierV2_B.onnx")
    img_list = []
    for pixmap in pixmap_list:
        try:
            img_list.append(
                Image.frombytes(
                    mode="RGB", size=(pixmap.width, pixmap.height), data=pixmap.samples
                )
            )
        except ValueError as e:
            # frombytes raises ValueError when the sample buffer does not
            # match an RGB image of the advertised size (e.g. alpha/CMYK).
            print(f"Skipping pixmap that could not be decoded as RGB: {e}")
    graph_flags = classifier.classify(img_list)
    print(graph_flags)
    response_list = []
    for idx, is_graph in enumerate(graph_flags):
        if is_graph:
            response = image_data_extractor(img=img_list[idx], text=text)
            response_list.append(str(response))
    return response_list
def _split_into_parts(content: str, n_parts: int, page_idx: int) -> List[Tuple[str, str]]:
    """Split *content* into *n_parts* roughly equal chunks labelled '<page>_<part>'.

    The last chunk absorbs any division remainder so no characters are lost.
    """
    part_size = len(content) // n_parts
    parts = []
    for nth_part in range(n_parts):
        start = nth_part * part_size
        # Fix: the last slice must run to the end of the string; using
        # start + part_size everywhere drops up to n_parts-1 trailing chars.
        end = len(content) if nth_part == n_parts - 1 else start + part_size
        parts.append((content[start:end], f"{page_idx}_{nth_part}"))
    return parts


def ProcessPdf(pdf_content: bytes) -> List[Tuple[str, str]]:
    """
    Takes a PDF (bytes) and returns a list of (text, page_id) tuples, where
    text includes both the page's textual content and any extracted image
    content, and page_id is the 0-based page number as a string (suffixed
    with "_<part>" when a long page is split into ~500-token chunks).

    Note: the original annotation declared Tuple[str, int], but page ids
    have always been emitted as strings; the annotation now matches reality.
    """
    print("Extract content called ")
    pdf_doc = pymupdf.open(stream=pdf_content, filetype="pdf")
    pages_content: List[str] = []
    referenced_xrefs: List[int] = []  # image xrefs already rendered, to skip duplicates
    try:
        for page_number in range(pdf_doc.page_count):
            # Extract text content; flatten newlines into tabs so each page
            # reads as one logical line.
            page = pdf_doc.load_page(page_number)
            text_content = str(page.get_text()).replace("\n", "\t")
            page_content = text_content
            # Collect pixmaps for images not yet seen on earlier pages.
            pixmap_list = []
            for img_info in page.get_image_info(xrefs=True):
                xref = img_info["xref"]
                if xref in referenced_xrefs:
                    continue
                try:
                    pixmap_list.append(pymupdf.Pixmap(pdf_doc, xref))
                    referenced_xrefs.append(xref)
                except ValueError as e:
                    print(f"Skipping image with xref {xref} due to error: {e}")
            if pixmap_list:
                # text_content is already newline-free; no second replace needed.
                img_content = extract_image_content(
                    pixmap_list=pixmap_list, text=text_content
                )
                page_content = page_content + "\n\n" + "\n\n".join(img_content)
            pages_content.append(page_content)
    finally:
        # Close the document even if extraction fails part-way through.
        pdf_doc.close()

    num_tokens = CountTokens(pages_content)
    final_data: List[Tuple[str, str]] = []
    # Split any page whose content exceeds ~500 tokens into multiple parts,
    # since downstream embedding models cap out around 512 tokens.
    for idx, n_token in enumerate(num_tokens):
        if n_token > 500:
            n_parts = int(numpy.ceil(n_token / 500))
            final_data += _split_into_parts(pages_content[idx], n_parts, idx)
        else:
            final_data.append((pages_content[idx], str(idx)))
    return final_data