Spaces:

tech5
/

docu-backend

Running

docu-backend / rag /combine.py

Deploy FastAPI RAG backend

e27c97c 4 months ago

2.07 kB

	import pdfplumber
	import fitz
	import camelot
	import pytesseract
	from PIL import Image
	import io


	# Raw Documents
	def raw_document_text(pdf_path: str) -> list:
	    """Extract text, tables, and OCR'd image text from a PDF, page by page.

	    For every page (numbered from 1) the result contains, in this order:
	    the page's extracted text (if any), one entry per table returned by
	    Camelot's "stream" parser, and one entry per embedded image whose OCR
	    output is not blank.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        A list of dicts, each with a "content" string and a "metadata"
	        dict holding "page", "type" ("text", "table" or "image") and, for
	        tables and images, a per-page "ref" label such as "Table 1".
	    """
	    documents = []

	    # OCR is the expensive step, and one image object (same xref) may be
	    # referenced from several pages, so each xref is recognised only once.
	    ocr_by_xref = {}

	    # The PDF is opened twice: pdfplumber supplies the page text, PyMuPDF
	    # the embedded images. Both handles are closed on exit, even on error.
	    with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as doc_fitz:
	        for page_index, page in enumerate(pdf.pages, start=1):

	            # TEXT
	            text = page.extract_text()
	            if text:
	                documents.append({
	                    "content": text,
	                    "metadata": {
	                        "page": page_index,
	                        "type": "text",
	                    },
	                })

	            # TABLES
	            # Camelot reads the file itself; restrict it to the current
	            # page so table entries stay grouped with that page's text.
	            tables = camelot.read_pdf(
	                pdf_path,
	                pages=str(page_index),
	                flavor="stream",
	            )

	            for t_idx, table in enumerate(tables):
	                documents.append({
	                    "content": table.df.to_string(index=False),
	                    "metadata": {
	                        "page": page_index,
	                        "type": "table",
	                        "ref": f"Table {t_idx + 1}",
	                    },
	                })

	            # IMAGES + OCR
	            # page_index is 1-based, PyMuPDF page numbers are 0-based.
	            page_fitz = doc_fitz[page_index - 1]

	            for img_idx, img in enumerate(page_fitz.get_images(full=True)):
	                xref = img[0]
	                if xref not in ocr_by_xref:
	                    image_bytes = doc_fitz.extract_image(xref)["image"]
	                    with Image.open(io.BytesIO(image_bytes)) as image:
	                        ocr_by_xref[xref] = pytesseract.image_to_string(image)
	                ocr_text = ocr_by_xref[xref]

	                # Skip images that produced only whitespace.
	                if ocr_text.strip():
	                    documents.append({
	                        "content": ocr_text,
	                        "metadata": {
	                            "page": page_index,
	                            "type": "image",
	                            "ref": f"Image {img_idx + 1}",
	                        },
	                    })

	    return documents