Update main.py
Browse files
main.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import io
|
| 2 |
import asyncio
|
| 3 |
import os
|
|
|
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
import aiohttp
|
| 6 |
from PyPDF2 import PdfReader, PdfWriter
|
|
@@ -31,7 +33,6 @@ async def call_pdfscraper(session, file_contents, pdf_name, processTables):
|
|
| 31 |
|
| 32 |
return response, pdf_name
|
| 33 |
|
| 34 |
-
|
| 35 |
async def execute_pdfscraper_async(file_path: str, processTables: str):
|
| 36 |
chunk_list = os.listdir(file_path)
|
| 37 |
chunk_byte_list = [
|
|
@@ -49,7 +50,6 @@ async def execute_pdfscraper_async(file_path: str, processTables: str):
|
|
| 49 |
|
| 50 |
return response_list
|
| 51 |
|
| 52 |
-
|
| 53 |
def collect_pdfscraper_response(scrape_response_list):
|
| 54 |
content_list = []
|
| 55 |
tables_dict = {}
|
|
@@ -71,12 +71,14 @@ def collect_pdfscraper_response(scrape_response_list):
|
|
| 71 |
|
| 72 |
return content_str, tables_dict
|
| 73 |
|
| 74 |
-
|
| 75 |
def split_pdf(file_contents, file_name, pages_per_chunk):
|
| 76 |
file_bytes = io.BytesIO(file_contents)
|
| 77 |
reader = PdfReader(file_bytes)
|
| 78 |
total_pages = len(reader.pages)
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
| 80 |
os.makedirs(output_dir, exist_ok=True)
|
| 81 |
|
| 82 |
num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
|
|
@@ -96,21 +98,32 @@ def split_pdf(file_contents, file_name, pages_per_chunk):
|
|
| 96 |
|
| 97 |
return str(output_dir)
|
| 98 |
|
| 99 |
-
|
| 100 |
@app.post("/process-pdf/")
|
| 101 |
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
|
|
|
|
| 102 |
file_contents = await pdf_file.read()
|
| 103 |
|
|
|
|
| 104 |
chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
|
|
|
|
|
|
|
| 105 |
scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
|
|
|
|
|
|
|
| 106 |
content, table_string = collect_pdfscraper_response(scrape_response_list)
|
| 107 |
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
return JSONResponse(content={"content": content, "tables": table_string})
|
| 111 |
|
| 112 |
-
|
| 113 |
-
# Starting point for running the FastAPI app
|
| 114 |
# if __name__ == "__main__":
|
| 115 |
# import uvicorn
|
| 116 |
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
| 1 |
import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
| 33 |
|
| 34 |
return response, pdf_name
|
| 35 |
|
|
|
|
| 36 |
async def execute_pdfscraper_async(file_path: str, processTables: str):
|
| 37 |
chunk_list = os.listdir(file_path)
|
| 38 |
chunk_byte_list = [
|
|
|
|
| 50 |
|
| 51 |
return response_list
|
| 52 |
|
|
|
|
| 53 |
def collect_pdfscraper_response(scrape_response_list):
|
| 54 |
content_list = []
|
| 55 |
tables_dict = {}
|
|
|
|
| 71 |
|
| 72 |
return content_str, tables_dict
|
| 73 |
|
|
|
|
| 74 |
def split_pdf(file_contents, file_name, pages_per_chunk):
|
| 75 |
file_bytes = io.BytesIO(file_contents)
|
| 76 |
reader = PdfReader(file_bytes)
|
| 77 |
total_pages = len(reader.pages)
|
| 78 |
+
|
| 79 |
+
# Generate a unique directory for each request to avoid conflicts
|
| 80 |
+
unique_dir = str(uuid.uuid4())
|
| 81 |
+
output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
|
| 82 |
os.makedirs(output_dir, exist_ok=True)
|
| 83 |
|
| 84 |
num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
|
|
|
|
| 98 |
|
| 99 |
return str(output_dir)
|
| 100 |
|
|
|
|
| 101 |
@app.post("/process-pdf/")
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
    """Split an uploaded PDF into page chunks, scrape each chunk, and return the combined result.

    Args:
        pdf_file: The uploaded PDF document.
        pages_per_chunk: Number of pages per split chunk (default 2).
        processTables: Flag string forwarded to the scraper service — TODO confirm expected values ("True"/"False").

    Returns:
        JSONResponse with the concatenated text content and the extracted tables.
    """
    file_contents = await pdf_file.read()

    # split_pdf writes the chunks to a temporary directory and returns its path.
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
    try:
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
        content, table_string = collect_pdfscraper_response(scrape_response_list)
    finally:
        # Always remove the temporary chunk directory — even when scraping raises —
        # so repeated requests do not accumulate files on disk. The original code
        # called shutil.rmtree without importing shutil; the NameError was silently
        # swallowed by a broad `except Exception`, so cleanup never actually ran.
        if os.path.exists(chunks_dir):
            try:
                shutil.rmtree(chunks_dir)
            except OSError as e:
                # Best-effort cleanup: log and continue rather than failing the request.
                logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": table_string})
|
| 124 |
|
| 125 |
+
# If you want to run this locally, uncomment the lines below.
|
|
|
|
| 126 |
# if __name__ == "__main__":
|
| 127 |
# import uvicorn
|
| 128 |
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 129 |
+
# Example (multi-worker): uvicorn main:app --workers 2
|