Spaces status: Sleeping
| import streamlit as st | |
| from upload_file_to_s3 import upload_file | |
| import base64 | |
| import httpx | |
| from extract_table_from_image import process_image_using_llm | |
| from process_pdf import process_pdf | |
| from pymongo import MongoClient | |
| from datetime import datetime | |
| from table_analysis_for_image import view_table_analysis_page | |
| from table_analysis_for_pdf import view_pdf_table_analysis_page | |
| from table_analysis_for_excel import display_csv_analysis | |
| from view_excel import view_excel | |
| from copy import deepcopy | |
| import uuid | |
| import os | |
| import csv | |
| from view_pdf import view_pdfs | |
| from view_image import view_images | |
| from io import StringIO, BytesIO | |
| from dotenv import load_dotenv | |
| import boto3 | |
| import pandas as pd | |
# Streamlit page config must be the first Streamlit call in the script.
st.set_page_config(layout='wide',page_title="MoSPI", page_icon="📄")
load_dotenv()
# AWS credentials / target bucket, read from the environment (.env via python-dotenv).
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_BUCKET_NAME = os.getenv("AWS_BUCKET_NAME")
# MongoDB connection settings; the collection stores per-file metadata
# (upload status, S3 URLs, extracted table data).
MONGO_URI = os.getenv("MONGO_URI")
DB_NAME = os.getenv("DB_NAME")
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]
# Shared S3 client used by every upload in this module.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
path_to_logo='logo.png'
# Minimal page router: st.session_state.page selects which view renders below.
if "page" not in st.session_state:
    st.session_state.page = "home"
def upload_csv_file(file, csv_filename, content_type):
    """Upload a CSV file-like object to S3 and return its metadata.

    The object is stored under a UUID-prefixed key so that files with
    the same name never collide in the bucket.

    Returns a dict with the name, MIME type, S3 URLs/key and upload
    date/time on success, or None if the upload raised.
    """
    try:
        s3_key = f'MoSPI_csv_files/{uuid.uuid4()}-{csv_filename}'
        s3.upload_fileobj(
            file,
            AWS_BUCKET_NAME,
            s3_key,
            ExtraArgs={'ContentType': content_type},  # keep the MIME type on the S3 object
        )
        now = datetime.now()
        return {
            'name': csv_filename,
            'type': content_type,
            's3_url': f's3://{AWS_BUCKET_NAME}/{s3_key}',
            's3_key': s3_key,
            'object_url': f'https://{AWS_BUCKET_NAME}.s3.amazonaws.com/{s3_key}',
            'date_uploaded': now.strftime('%Y-%m-%d'),
            'time_uploaded': now.strftime('%H:%M:%S'),
        }
    except Exception as e:
        # Best-effort: log and signal failure to the caller via None.
        print(f"An error occurred during upload: {e}")
        return None
def process_image(url, filename):
    """Extract tabular data from the image at `url` using an LLM.

    On success, the extracted table plus its summary metadata are stored
    on the matching MongoDB document, the table is rendered as CSV in
    memory, and the CSV is uploaded to S3 with its URLs written back to
    the same document.

    Parameters:
        url: public object URL of the uploaded image.
        filename: original upload filename; its stem names the derived CSV.

    Returns:
        True if table data was extracted and stored, False otherwise
        (no table found, empty download, or any exception).
    """
    try:
        response = httpx.get(url)
        # Fail fast on a non-2xx response instead of base64-encoding an
        # HTML error page and sending it to the LLM.
        response.raise_for_status()
        image_data = base64.b64encode(response.content).decode("utf-8")
        if not image_data:
            print(f"No image data found in uploaded image")
            return False
        result = process_image_using_llm(image_data, 1, 3)
        if not result.get("has_table_data"):
            print(f"No table data was found in the image {url}")
            return False
        table_data = result.get("table_data")
        data = {
            "table_data": table_data,
            "page_number": result.get("page_number"),
            "description": result.get("description"),
            "column_summary": result.get("column_summary"),
            "best_col1": result.get("best_col1"),
            "best_col2": result.get("best_col2"),
        }
        collection.update_one({"object_url": url}, {"$set": {"table_data": data}})
        print("Successfully extracted data from image and inserted into MongoDB")
        # Render the extracted rows as CSV in memory; the first row's keys
        # define the header (assumes uniform keys across rows — as returned
        # by process_image_using_llm).
        csv_buffer = StringIO()
        csv_writer = csv.DictWriter(csv_buffer, fieldnames=table_data[0].keys())
        csv_writer.writeheader()
        csv_writer.writerows(table_data)
        csv_bytes = BytesIO(csv_buffer.getvalue().encode("utf-8"))
        # Bug fix: `filename` was previously ignored and a hard-coded
        # placeholder name was used for every CSV. Name the CSV after the
        # original upload instead.
        csv_filename = f"{os.path.splitext(filename)[0]}.csv"
        s3_metadata = upload_csv_file(csv_bytes, csv_filename, content_type="text/csv")
        if s3_metadata:
            collection.update_one(
                {"object_url": url},
                {"$set": {
                    "csv_object_url": s3_metadata.get("object_url"),
                    "csv_s3_url": s3_metadata.get("s3_url")
                }}
            )
            print("CSV file uploaded to S3 and URL saved in MongoDB")
        return True
    except Exception as e:
        print(f"Error occurred in processing image: {e}")
        return False
def convert_excel_to_csv(file, filename):
    """Convert an uploaded Excel workbook to CSV held in memory.

    Parameters:
        file: file-like object containing the workbook bytes.
        filename: original filename; its extension (case-insensitive)
            selects the pandas engine and its stem names the CSV.

    Returns:
        (csv_buffer, csv_filename): a BytesIO rewound to position 0 with
        the CSV content, and the derived "<stem>.csv" filename.

    Raises:
        ValueError: if the extension is neither .xls nor .xlsx.
    """
    stem, ext = os.path.splitext(filename)
    ext = ext.lower()
    if ext == '.xlsx':
        engine = 'openpyxl'
    elif ext == '.xls':
        engine = 'xlrd'
    else:
        raise ValueError("Unsupported file format for Excel. Please upload an .xls or .xlsx file.")
    # Only the first sheet is read (pandas' read_excel default).
    df = pd.read_excel(file, engine=engine)
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # rewind so the caller reads from the start
    # Bug fix: the old str.replace(".xlsx", ".csv") rename was case
    # sensitive, so "Report.XLSX" passed the engine check above but kept
    # its Excel extension. splitext + the lowered `ext` handles any case.
    csv_filename = f"{stem}.csv"
    return csv_buffer, csv_filename
# ---------------- Home page: upload, process, and navigation ----------------
if st.session_state.page == "home":
    col1, col2 = st.columns([1, 13])
    with col1:
        st.image(path_to_logo, width=100)
    with col2:
        st.title("Smart Data Extraction and Analysis tool")
    uploaded_file = st.file_uploader(
        "Upload a file",
        type=["png", "jpg", "jpeg", "pdf", "xlsx", "xls", "csv"],
        accept_multiple_files=False,
        help="Please upload only one file of type image, PDF, Excel, or CSV."
    )
    if uploaded_file and st.button("Upload"):
        with st.spinner("Processing your file"):
            # upload_file() consumes the stream, but the Excel branch needs
            # to re-read the bytes, so keep an independent copy first.
            file_copy = BytesIO(uploaded_file.getvalue())
            file_type = uploaded_file.type
            metadata = upload_file(uploaded_file, file_type)
            if metadata:
                object_url = metadata.get("object_url")
                filename = metadata.get("name")
                if "image" in file_type:
                    # Images: LLM-based table extraction + derived-CSV upload.
                    processed = process_image(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Image processed and CSV file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        # Typo fixed: "occured" -> "occurred".
                        st.error("Error occurred in processing Image, please try again later")
                elif "pdf" in file_type:
                    processed = process_pdf(object_url, filename)
                    if processed:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "processed"}})
                        st.success("Successfully processed pdf")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        st.error("Error occurred in processing pdf")
                elif file_type in ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                                   "application/vnd.ms-excel"]:
                    # Excel: convert to CSV in memory, then upload the CSV.
                    csv_buffer, csv_filename = convert_excel_to_csv(file_copy, filename)
                    s3_metadata = upload_csv_file(csv_buffer, csv_filename, content_type="text/csv")
                    if s3_metadata:
                        collection.update_one({"object_url": object_url}, {
                            "$set": {"csv_object_url": s3_metadata["object_url"],
                                     "csv_s3_url": s3_metadata["s3_url"],
                                     "filetype": "excel", "status": "processed"}
                        })
                        st.success("Excel file uploaded to S3 successfully.")
                    else:
                        collection.update_one({"object_url": object_url}, {"$set": {"status": "failed"}})
                        # Previously this failure path showed nothing in the UI.
                        st.error("Error occurred in uploading the converted CSV, please try again later")
                elif "csv" in file_type:
                    # CSV needs no conversion: the uploaded object itself is
                    # the analysis-ready CSV.
                    collection.update_one({"object_url": object_url}, {
                        "$set": {"csv_object_url": object_url, "filetype": "csv", "status": "processed"}})
                    st.success("CSV file uploaded to S3 successfully.")
            else:
                # upload_file() returned None; surface the failure instead of
                # silently doing nothing.
                st.error("File upload failed, please try again later")
    st.markdown("<hr>", unsafe_allow_html=True)
    # Navigation buttons to the three listing pages.
    col1, col2, col3 = st.columns([1, 1, 1], gap="small")
    with col1:
        if st.button("View PDFs", key="View pdf button"):
            st.session_state.page = "view_pdf"
            st.rerun()
    with col2:
        if st.button("View Images", key="View image button"):
            st.session_state.page = "view_image"
            st.rerun()
    with col3:
        if st.button("View Excel", key="View excel button"):
            st.session_state.page = "view_excel"
            st.rerun()
# ---------------- Router: render whichever page is selected ----------------
current_page = st.session_state.page
if current_page == "view_pdf":
    view_pdfs()
elif current_page == "view_image":
    view_images()
elif current_page == "view_excel":
    view_excel()
# Analysis pages additionally require the target URL to be in session state;
# otherwise nothing is rendered for that page value.
if current_page == "view_image_analysis" and "image_url" in st.session_state:
    view_table_analysis_page(st.session_state.image_url)
if current_page == "pdf_analysis" and "pdf_url" in st.session_state:
    view_pdf_table_analysis_page(st.session_state.pdf_url)
if current_page == "view_excel_analysis" and "excel_url" in st.session_state:
    display_csv_analysis(st.session_state.excel_url)