import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from aiohttp import FormData
from fastapi import FastAPI, Form, UploadFile
from fastapi.responses import JSONResponse
from PyPDF2 import PdfReader, PdfWriter

app = FastAPI()

async def call_pdfscraper(session, file_contents, pdf_name, processTables):
    """Send one PDF chunk to the scraper cloud function and return (json, pdf_name)."""
    headers = {"Origin": "http://localhost:8080"}
    url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v3"

    data = FormData()
    data.add_field(
        "pdf",
        file_contents,
        filename=os.path.basename(pdf_name),
        content_type="application/pdf",
    )
    data.add_field("processTables", processTables)

    async with session.post(url, data=data, headers=headers) as resp:
        if resp.status != 200:
            # Any non-200 status is treated as an empty result for this chunk.
            return {}, pdf_name
        response = await resp.json()

    return response, pdf_name

async def execute_pdfscraper_async(file_path: str, processTables: str):
    """Scrape every chunk in `file_path` concurrently and return responses in chunk order."""
    # Chunk files are named "<stem>_<n>.pdf"; sort numerically so the corpus
    # is reassembled in the original page order (os.listdir gives no ordering).
    chunk_list = sorted(
        os.listdir(file_path),
        key=lambda name: int(Path(name).stem.rsplit("_", 1)[-1]),
    )
    chunk_byte_list = [
        (Path(file_path, file).read_bytes(), file) for file in chunk_list
    ]

    async with aiohttp.ClientSession() as session:
        tasks = [
            call_pdfscraper(session, file_bytes, file_name, processTables)
            for file_bytes, file_name in chunk_byte_list
        ]
        responses = await asyncio.gather(*tasks)

    # Keep only the JSON payload from each (response, pdf_name) pair.
    response_list = [response[0] for response in responses]

    return response_list

def collect_pdfscraper_response(scrape_response_list):
    """Merge per-chunk responses into one corpus string and a renumbered table dict."""
    content_list = []
    tables_dict = {}
    table_count = 1

    for response in scrape_response_list:
        content = response.get("corpus", "")
        table_content = response.get("tables_raw", {})

        content_list.append(content)
        try:
            # Renumber tables sequentially across all chunks.
            for table_key in table_content.keys():
                tables_dict[str(table_count)] = table_content[table_key]
                table_count += 1
        except AttributeError:
            # tables_raw was not a dict for this chunk; nothing to collect.
            pass

    content_str = "\n".join(content_list)

    return content_str, tables_dict

def split_pdf(file_contents, file_name, pages_per_chunk):
    """Split raw PDF bytes into chunks of `pages_per_chunk` pages; return the output directory."""
    file_bytes = io.BytesIO(file_contents)
    reader = PdfReader(file_bytes)
    total_pages = len(reader.pages)

    # Write chunks into a unique directory so concurrent requests do not collide.
    unique_dir = str(uuid.uuid4())
    output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
    os.makedirs(output_dir, exist_ok=True)

    # Ceiling division: the last chunk may contain fewer than pages_per_chunk pages.
    num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk

    for i in range(num_chunks):
        writer = PdfWriter()
        start_page = i * pages_per_chunk
        end_page = min(start_page + pages_per_chunk, total_pages)

        for page_number in range(start_page, end_page):
            writer.add_page(reader.pages[page_number])

        chunk_file_name = f"{Path(file_name).stem}_{i + 1}.pdf"
        output_path = output_dir / chunk_file_name
        with open(output_path, "wb") as output_pdf:
            writer.write(output_pdf)

    return str(output_dir)

@app.post("/process-pdf/")
async def process_pdf(
    pdf_file: UploadFile,
    pages_per_chunk: int = Form(2),
    processTables: str = Form("True"),
):
    """Split the uploaded PDF, scrape the chunks concurrently, and return the merged result."""
    file_contents = await pdf_file.read()

    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)

    try:
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
        content, tables_dict = collect_pdfscraper_response(scrape_response_list)
    finally:
        # Always remove the temporary chunk directory, even if scraping fails.
        if os.path.exists(chunks_dir):
            try:
                shutil.rmtree(chunks_dir)
            except Exception as e:
                logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": tables_dict})

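# With the server running, the endpoint can be exercised with a multipart request,
# for example (field names match the parameters above; the port is illustrative):
#   curl -F "pdf_file=@document.pdf" -F "pages_per_chunk=2" -F "processTables=True" \
#        http://localhost:8000/process-pdf/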
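# Minimal local entry point (a sketch, not part of the original deployment):
# assumes uvicorn is installed alongside FastAPI; host and port are illustrative.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)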