Spaces:
Sleeping
Sleeping
"""Streamlit app: upload PDFs, OCR them, extract structured fields, export CSV."""

import glob
import os

import google.generativeai as genai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv

from image_to_text import image_to_text
from mirascope_extractor import extractor
from pdf_to_image import pdf_to_image

# Key under which the last extraction result is kept across Streamlit reruns.
_RESULT_KEY = 'extracted_df'

load_dotenv()

# NOTE(review): the key is read from OPENAI_API_KEY but passed to the Google
# Generative AI client. The variable name is kept so existing deployments keep
# working -- confirm which provider key is actually expected here.
openai_api_key = os.getenv('OPENAI_API_KEY')
genai.configure(api_key=openai_api_key)

# NOTE(review): the browser-tab title says "Invoice" while the heading says
# "CV"; both strings are left as they were -- confirm the intended wording.
st.set_page_config(page_title="Invoice Extractor")
st.title("Gen AI CV Extraction")


def _pdf_texts(files):
    """Return one concatenated text string per image group from ``pdf_to_image``.

    ``pdf_to_image(files)`` is iterated as a sequence of image groups
    (presumably one group of page images per uploaded PDF -- verify against
    ``pdf_to_image``). Every image in a group is passed to ``image_to_text``
    and the resulting strings are joined without a separator.
    """
    return [
        "".join(image_to_text(page) for page in pages)
        for pages in pdf_to_image(files)
    ]


def _extract_table(texts):
    """Run ``extractor`` on each text and return one DataFrame row per text.

    Each extractor result is converted with ``.dict()``. The frame is built
    once from all records, so rows get a clean 0..n-1 index instead of the
    repeated 0 index that row-by-row ``pd.concat`` produces.
    """
    records = [extractor(text).dict() for text in texts]
    return pd.DataFrame(records)


uploaded_files = st.file_uploader(
    "Choose PDF files", accept_multiple_files=True, type="pdf"
)

if uploaded_files:
    if st.button('Extract'):
        # Persist the result: st.button is True only for the rerun triggered
        # by its own click, and clicking the download button below triggers
        # another rerun in which it would be False again.
        st.session_state[_RESULT_KEY] = _extract_table(_pdf_texts(uploaded_files))

    if _RESULT_KEY in st.session_state:
        extracted_df = st.session_state[_RESULT_KEY]
        st.write(extracted_df)
        st.download_button(
            label='Click to Download CSV',
            data=extracted_df.to_csv(index=False),
            file_name='Extracted_data.csv',
            mime='text/csv',
        )