Spaces:
Sleeping
Sleeping
| from pdf_to_image import pdf_to_image | |
| from image_to_text import image_to_text | |
| from mirascope_extractor import extractor | |
| import google.generativeai as genai | |
| import pandas as pd | |
| import glob | |
| import os | |
| from dotenv import load_dotenv | |
| import streamlit as st | |
# Invoice extraction pipeline (runs at import/execution time):
#   1. convert every PDF in `folder_name` to an image,
#   2. run `image_to_text` on every .jpg found in the same folder,
#   3. run `extractor` on each text and collect the results into one table,
#   4. write the table to 'extracted_data.csv' in the current directory.

# Load variables from a .env file into the process environment before any
# key is read with os.getenv.
load_dotenv()

# NOTE(review): the Google Generative AI client is configured with the value
# of OPENAI_API_KEY. The variable name is kept as-is so existing deployments
# keep working -- confirm this is really the intended key.
openai_api_key = os.getenv('OPENAI_API_KEY')
genai.configure(api_key=openai_api_key)

folder_name = "/project/workspace/pdfs"

# Both lower- and upper-case extensions are matched. On platforms where glob
# matching is case-insensitive (e.g. Windows) the two patterns return the same
# files, so duplicates are dropped while keeping the discovery order.
invoice_pdfs = list(dict.fromkeys(
    glob.glob(os.path.join(folder_name, '*.pdf'))
    + glob.glob(os.path.join(folder_name, '*.PDF'))
))
print(f'Invoices_pdfs: {invoice_pdfs}')

# Step 1: PDF -> image. The output name is the PDF path with 'image' appended.
# NOTE(review): step 2 globs '*.jpg' in the same folder, so save_image
# presumably adds a '.jpg' suffix itself -- verify against pdf_to_image.
for pdf_path in invoice_pdfs:
    convert_image = pdf_to_image(pdf_path)
    convert_image.save_image(f'{pdf_path}image')
    print('one_pdf_converted')

# Step 2: image -> text for every .jpg currently in the folder.
all_images = glob.glob(os.path.join(folder_name, '*.jpg'))
all_texts = []
for image_path in all_images:
    text = image_to_text(image_path)
    all_texts.append(text)
    print('one text appended')

# Step 3: text -> one single-row DataFrame per document. The frames are
# collected and concatenated once at the end instead of growing a DataFrame
# inside the loop, which re-copies all previous rows on every iteration and
# concatenates with an empty frame (deprecated in recent pandas).
row_frames = []
for text in all_texts:
    extracted_text = extractor(text)
    task_details_dict = extracted_text.dict()
    df = pd.DataFrame([task_details_dict])
    row_frames.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no text was extracted; an (empty) CSV is still written.
empty_df = pd.concat(row_frames) if row_frames else pd.DataFrame()

# Step 4: persist the combined table (the DataFrame index is written too).
empty_df.to_csv('extracted_data.csv')