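"""Pipeline script: download a PDF from an object URL, split it into per-page
PNG images, extract table data from each page with an LLM helper, upload the
resulting per-page CSVs to S3, and store the extracted tables back in MongoDB."""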
import base64
from pdf2image import convert_from_path
from extract_table_from_image import process_image_using_llm
from pymongo import MongoClient
from datetime import datetime
import uuid
import os
import re
import csv
import requests
from io import StringIO, BytesIO
from dotenv import load_dotenv
import boto3

load_dotenv()

# AWS and MongoDB configuration, read from the environment
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")

mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

# Temporary working directories for the downloaded PDF and its page images
pdf_temp_dir = 'temp/pdf_files'
image_temp_dir = 'temp/page_images'
os.makedirs(pdf_temp_dir, exist_ok=True)
os.makedirs(image_temp_dir, exist_ok=True)
pdf_path = os.path.join(pdf_temp_dir, 'downloaded_file.pdf')

def cleanup_directory(directory_path):
    """Delete every file (but not subdirectories) inside directory_path."""
    try:
        for filename in os.listdir(directory_path):
            file_path = os.path.join(directory_path, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print(f"Cleaned up files in {directory_path}")
    except Exception as e:
        print(f"Error cleaning up directory {directory_path}: {e}")

def download_and_split_pdf_to_image(url):
    """Download the PDF at url and save each page as a PNG in image_temp_dir."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # fail early on HTTP errors instead of saving an error page as the PDF
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
    except Exception as e:
        print(f"Error occurred while downloading PDF from object URL: {e}")
        return None
    try:
        images = convert_from_path(pdf_path)
        for i, image in enumerate(images):
            image_path = os.path.join(image_temp_dir, f'page_{i + 1}.png')
            image.save(image_path, 'PNG')
            print(f'Saved image: {image_path}')
        return True
    except Exception as e:
        print(f"Error occurred while converting PDF pages to images: {e}")
        return None

def upload_csv_file(file, csv_filename, content_type):
    """Upload a CSV file object to S3 and return its metadata, or None on failure."""
    try:
        # Generate a unique key for the file using a UUID to avoid name collisions
        uuid_str = str(uuid.uuid4())
        s3_key = f'MoSPI_csv_files/{uuid_str}-{csv_filename}'
        # Upload the CSV to S3
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type}  # set the MIME type of the uploaded file
        )
        upload_time = datetime.now()
        # Metadata to be stored in MongoDB
        metadata = {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': upload_time.strftime('%Y-%m-%d'),
            'time_uploaded': upload_time.strftime('%H:%M:%S')
        }
        return metadata
    except Exception as e:
        print(f"An error occurred during upload: {e}")
        return None

def process_pdf(url, filename):
    """Run the full pipeline for one PDF: split it into page images, extract
    tables from each page, upload per-page CSVs to S3, and store the results
    in MongoDB. Returns True if at least one table was found, else False."""
    split = download_and_split_pdf_to_image(url)
    if not split:
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False
    # Sort page images numerically (page_1.png, page_2.png, ...) rather than lexically
    image_files = sorted(
        os.listdir(image_temp_dir),
        key=lambda x: int(re.search(r'page_(\d+)', x).group(1))
    )
    table_datas = []
    for count, image_name in enumerate(image_files, start=1):
        print(f"Processing page {count} of the PDF")
        image_path = os.path.join(image_temp_dir, image_name)
        with open(image_path, "rb") as image_file:
            image_data = base64.b64encode(image_file.read()).decode("utf-8")
        result = process_image_using_llm(image_data, count, 3)
        has_table_data = result.get("has_table_data")
        # Guard against an empty table_data list before building the CSV
        if has_table_data and result.get("table_data"):
            table_data = result.get("table_data")
            page_number = result.get("page_number")
            description = result.get("description")
            column_summary = result.get("column_summary")
            best_col1 = result.get("best_col1")
            best_col2 = result.get("best_col2")
            # Serialize the extracted rows to CSV in memory
            csv_buffer = StringIO()
            csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
            csv_writer.writeheader()
            csv_writer.writerows(table_data)
            csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
            csv_filename = f"{filename}_pageNumber_{page_number}.csv"
            s3_metadata = upload_csv_file(csv_bytes, csv_filename, "text/csv")
            if s3_metadata:
                object_url = s3_metadata.get("object_url")
                s3_url = s3_metadata.get("s3_url")
                data = {
                    "table_data": table_data,
                    "description": description,
                    "column_summary": column_summary,
                    "page_number": page_number,
                    "csv_object_url": object_url,
                    "csv_s3_url": s3_url,
                    "best_col1": best_col1,
                    "best_col2": best_col2
                }
                table_datas.append(data)
        else:
            print(f"No table data found on page {count}")
    if table_datas:
        collection.update_one({"object_url": url}, {"$set": {"table_data": table_datas}})
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return True
    else:
        print("No table data found in the entire PDF")
        cleanup_directory(pdf_temp_dir)
        cleanup_directory(image_temp_dir)
        return False
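
# Minimal usage sketch, not part of the original script: the URL and filename
# below are hypothetical placeholders. process_pdf expects a downloadable PDF
# URL that already matches a MongoDB document's object_url field.
if __name__ == "__main__":
    sample_url = "https://example-bucket.s3.amazonaws.com/sample_report.pdf"  # placeholder
    if process_pdf(sample_url, "sample_report"):
        print("Table data extracted and stored.")
    else:
        print("No table data extracted.")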