Update main.py
Browse files
main.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import io
|
| 2 |
import asyncio
|
| 3 |
import os
|
|
|
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
import aiohttp
|
| 6 |
from PyPDF2 import PdfReader, PdfWriter
|
|
@@ -31,7 +33,6 @@ async def call_pdfscraper(session, file_contents, pdf_name, processTables):
|
|
| 31 |
|
| 32 |
return response, pdf_name
|
| 33 |
|
| 34 |
-
|
| 35 |
async def execute_pdfscraper_async(file_path: str, processTables: str):
|
| 36 |
chunk_list = os.listdir(file_path)
|
| 37 |
chunk_byte_list = [
|
|
@@ -49,7 +50,6 @@ async def execute_pdfscraper_async(file_path: str, processTables: str):
|
|
| 49 |
|
| 50 |
return response_list
|
| 51 |
|
| 52 |
-
|
| 53 |
def collect_pdfscraper_response(scrape_response_list):
|
| 54 |
content_list = []
|
| 55 |
tables_dict = {}
|
|
@@ -71,12 +71,14 @@ def collect_pdfscraper_response(scrape_response_list):
|
|
| 71 |
|
| 72 |
return content_str, tables_dict
|
| 73 |
|
| 74 |
-
|
| 75 |
def split_pdf(file_contents, file_name, pages_per_chunk):
|
| 76 |
file_bytes = io.BytesIO(file_contents)
|
| 77 |
reader = PdfReader(file_bytes)
|
| 78 |
total_pages = len(reader.pages)
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
| 80 |
os.makedirs(output_dir, exist_ok=True)
|
| 81 |
|
| 82 |
num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
|
|
@@ -96,21 +98,32 @@ def split_pdf(file_contents, file_name, pages_per_chunk):
|
|
| 96 |
|
| 97 |
return str(output_dir)
|
| 98 |
|
| 99 |
-
|
| 100 |
@app.post("/process-pdf/")
|
| 101 |
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
|
|
|
|
| 102 |
file_contents = await pdf_file.read()
|
| 103 |
|
|
|
|
| 104 |
chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
|
|
|
|
|
|
|
| 105 |
scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
|
|
|
|
|
|
|
| 106 |
content, table_string = collect_pdfscraper_response(scrape_response_list)
|
| 107 |
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
return JSONResponse(content={"content": content, "tables": table_string})
|
| 111 |
|
| 112 |
-
|
| 113 |
-
# Starting point for running the FastAPI app
|
| 114 |
# if __name__ == "__main__":
|
| 115 |
# import uvicorn
|
| 116 |
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
|
|
| 1 |
import asyncio
import io
import logging
import os
import shutil
import uuid
from pathlib import Path

import aiohttp
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
| 33 |
|
| 34 |
return response, pdf_name
|
| 35 |
|
|
|
|
| 36 |
async def execute_pdfscraper_async(file_path: str, processTables: str):
|
| 37 |
chunk_list = os.listdir(file_path)
|
| 38 |
chunk_byte_list = [
|
|
|
|
| 50 |
|
| 51 |
return response_list
|
| 52 |
|
|
|
|
| 53 |
def collect_pdfscraper_response(scrape_response_list):
|
| 54 |
content_list = []
|
| 55 |
tables_dict = {}
|
|
|
|
| 71 |
|
| 72 |
return content_str, tables_dict
|
| 73 |
|
|
|
|
| 74 |
def split_pdf(file_contents, file_name, pages_per_chunk):
|
| 75 |
file_bytes = io.BytesIO(file_contents)
|
| 76 |
reader = PdfReader(file_bytes)
|
| 77 |
total_pages = len(reader.pages)
|
| 78 |
+
|
| 79 |
+
# Generate a unique directory for each request to avoid conflicts
|
| 80 |
+
unique_dir = str(uuid.uuid4())
|
| 81 |
+
output_dir = Path(file_name).parent / f"chunks_{unique_dir}"
|
| 82 |
os.makedirs(output_dir, exist_ok=True)
|
| 83 |
|
| 84 |
num_chunks = (total_pages + pages_per_chunk - 1) // pages_per_chunk
|
|
|
|
| 98 |
|
| 99 |
return str(output_dir)
|
| 100 |
|
|
|
|
| 101 |
@app.post("/process-pdf/")
async def process_pdf(pdf_file: UploadFile, pages_per_chunk: int = Form(2), processTables: str = Form("True")):
    """Split an uploaded PDF into page chunks, scrape each chunk, and return the combined result.

    Args:
        pdf_file: The uploaded PDF document.
        pages_per_chunk: Number of pages per split chunk (default 2).
        processTables: Flag string forwarded to the scraper service — TODO confirm expected values ("True"/"False").

    Returns:
        JSONResponse with the concatenated text content and the extracted tables.
    """
    file_contents = await pdf_file.read()

    # split_pdf writes the chunks to a temporary directory and returns its path.
    chunks_dir = split_pdf(file_contents, pdf_file.filename, pages_per_chunk)
    try:
        scrape_response_list = await execute_pdfscraper_async(chunks_dir, processTables)
        content, table_string = collect_pdfscraper_response(scrape_response_list)
    finally:
        # Always remove the temporary chunk directory — even when scraping raises —
        # so repeated requests do not accumulate files on disk. The original code
        # called shutil.rmtree without importing shutil; the NameError was silently
        # swallowed by a broad `except Exception`, so cleanup never actually ran.
        if os.path.exists(chunks_dir):
            try:
                shutil.rmtree(chunks_dir)
            except OSError as e:
                # Best-effort cleanup: log and continue rather than failing the request.
                logging.error(f"Error deleting directory {chunks_dir}: {e}")

    return JSONResponse(content={"content": content, "tables": table_string})
|
| 124 |
|
| 125 |
+
# If you want to run this locally, uncomment the lines below.
|
|
|
|
| 126 |
# if __name__ == "__main__":
|
| 127 |
# import uvicorn
|
| 128 |
# uvicorn.run(app, host="0.0.0.0", port=8000)
|
| 129 |
+
# Example (multi-worker): uvicorn main:app --workers 2
|