import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()

async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """Send one PDF chunk to the scraper cloud function and return (json, pdf_name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status != 200:
            # Any non-200 status is treated as an empty result for this chunk.
            return {}, pdf_name
        response = await resp.json()

    return response, pdf_name

async def execute_pdfscraper_async(file_path: str, processTables: str):
    """Scrape every chunk in `file_path` concurrently and return responses in chunk order."""
    # Chunk files are named "<stem>_<n>.pdf"; sort numerically so the corpus
    # is reassembled in the original page order (os.listdir gives no ordering).
    chunk_list = sorted(
        os.listdir(file_path),
        key=lambda name: int(Path(name).stem.rsplit("_", 1)[-1]),
    )
    chunk_byte_list = [
        (Path(file_path, file).read_bytes(), file) for file in chunk_list
    ]

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, file_bytes, file_name, processTables)
            for file_bytes, file_name in chunk_byte_list
        ]
        responses = await asyncio.gather(*tasks)

    # Keep only the JSON payload from each (response, pdf_name) pair.
    response_list = [response[0] for response in responses]

    return response_list

def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one corpus string and a renumbered table dict."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content = response.get("corpus", "")
        table_content = response.get("tables_raw", {})

        content_list.append(content)
        try:
            # Renumber tables sequentially across all chunks.
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            # tables_raw was not a dict for this chunk; nothing to collect.
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict

def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split raw PDF bytes into chunks of `pages_per_chunk` pages; return the output directory."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    # Write chunks into a unique directory so concurrent requests do not collide.
    unique_dir = str(uuid.uuid4())
    output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
    os.makedirs(output_dir, exist_ok=True)

    # Ceiling division: the last chunk may contain fewer than pages_per_chunk pages.
    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)

@app.post("/process-pdf/")
async def process_pdf(
    pdf_file: UploadFile,
    pages_per_chunk: int = Form(2),
    processTables: str = Form("True"),
):
    """Split the uploaded PDF, scrape the chunks concurrently, and return the merged result."""
    file_contents = await pdf_file.read()

    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)

    try:
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
        content, tables_dict = collect_pdfscraper_response(scrape_response_list)
    finally:
        # Always remove the temporary chunk directory, even if scraping fails.
        if os.path.exists(chunks_dir):
            try:
                shutil.rmtree(chunks_dir)
            except Exception as e:
                logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": tables_dict})

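# With the server running, the endpoint can be exercised with a multipart request,
# for example (field names match the parameters above; the port is illustrative):
#   curl -F "pdf_file=@document.pdf" -F "pages_per_chunk=2" -F "processTables=True" \
#        http://localhost:8000/process-pdf/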
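# Minimal local entry point (a sketch, not part of the original deployment):
# assumes uvicorn is installed alongside FastAPI; host and port are illustrative.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)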