Aqdas committed on
Commit
75beed0
·
verified ·
1 Parent(s): 5a11367

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +63 -0
  2. image_to_text.py +9 -0
  3. mirascope_extractor.py +32 -0
  4. packages.txt +1 -0
  5. pdf_to_image.py +22 -0
  6. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ from pdf_to_image import pdf_to_image
4
+ from image_to_text import image_to_text
5
+ from mirascope_extractor import extractor
6
+
7
+ import google.generativeai as genai
8
+ import pandas as pd
9
+
10
+ import glob
11
+ import os
12
+ from dotenv import load_dotenv
13
+ import streamlit as st
14
+ # import subprocess
15
+
16
+ # Example installation command (adjust based on your environment)
17
+ # subprocess.run(['apt-get', 'install', 'tesseract-ocr'])
18
+
19
+
20
# Streamlit entry point: upload invoice PDFs, OCR every page, extract structured
# fields from each page's text, and offer the combined table as a CSV download.

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# NOTE(review): the OpenAI key is handed to the Gemini SDK here, while the
# extractor imported above is built on mirascope's OpenAIExtractor. This looks
# like a leftover from a Gemini experiment -- confirm, then either drop the call
# or configure it with a Google API key.
openai_api_key = os.getenv('OPENAI_API_KEY')
genai.configure(api_key=openai_api_key)

st.set_page_config(page_title="Invoice Extractor")
st.title("Gen AI Invoice Extraction")
uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True, type="pdf")

# The button is only rendered once at least one file has been uploaded
# (short-circuit `and` keeps st.button from being called otherwise).
if uploaded_files and st.button('Extract'):
    # One image per PDF page, across all uploaded files.
    page_images = pdf_to_image(uploaded_files)

    # OCR every page first, then run the extraction over the collected texts.
    all_texts = [image_to_text(page_image) for page_image in page_images]

    # Collect one plain-dict record per page and build the frame once; this
    # avoids repeated pd.concat copies and gives the rows a clean 0..n-1 index
    # instead of every row being labelled 0.
    rows = [extractor(text).model_dump() for text in all_texts]
    invoices_df = pd.DataFrame(rows)

    st.write(invoices_df)
    csv = invoices_df.to_csv(index=False)
    st.download_button(
        label='Click to Download CSV',
        data=csv,
        file_name='Extracted_data.csv',
        mime='text/csv',
    )
63
+
image_to_text.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import pytesseract
3
+ import io
4
# Absolute path of the tesseract binary that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'


def image_to_text(images):
    """Run OCR on one image and return the recognised text.

    Args:
        images: A single image object exposing ``save(fp, format=...)``
            (the caller passes PIL images). Despite the plural name, this is
            one image, not a collection.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Re-encode the image as an in-memory PNG and reopen it, so tesseract is
    # always handed a PNG-backed image regardless of how the input was built.
    png_buffer = io.BytesIO()
    images.save(png_buffer, format='PNG')
    png_image = Image.open(png_buffer)
    return pytesseract.image_to_string(png_image)
mirascope_extractor.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mirascope.openai import OpenAIExtractor
2
+ from mirascope.gemini import GeminiExtractor
3
+ from mirascope.groq import GroqExtractor
4
+
5
+ from pydantic import FilePath, BaseModel
6
+ from typing import List, Type
7
+
8
+
9
# Structured record that the extractor fills in for one invoice text.
# Documentation is kept in `#` comments on purpose: a class docstring would be
# emitted as the `description` of the model's JSON schema, changing what is
# sent along with the extraction request.
class TaskDetails(BaseModel):
    # Parties named on the invoice.
    seller_company_name: str
    receiver_company: str

    # Line-item descriptions, one string per entry.
    description: List[str]

    # Invoice identification; both are kept as free-form text (no date parsing).
    invoice_date: str
    invoice_number: str

    # Monetary figures as floats; the VAT rate is kept as text.
    net_amount: float
    vat_amount: float
    vat_rate: str
    total_amount: float
19
+
20
# Mirascope extractor that renders the prompt below with the invoice text and
# returns the result as a TaskDetails instance (see `extract_schema`).
# Documented with `#` comments rather than a docstring so that nothing new is
# attached to the class at runtime.
class TaskExtractor(OpenAIExtractor[TaskDetails]):
    # Schema the extraction result is validated against.
    extract_schema: Type[TaskDetails] = TaskDetails

    # `{invoice}` is filled from the `invoice` field declared below.
    prompt_template = """
    Extract the invoice details from the following invoice:
    {invoice}
    """

    # Raw invoice text supplied by the caller.
    invoice: str
27
+
28
def extractor(text):
    """Extract structured invoice fields from raw invoice text.

    Args:
        text: Invoice text; passed to ``TaskExtractor`` as its ``invoice``
            template variable.

    Returns:
        The ``TaskDetails`` instance produced by ``TaskExtractor.extract()``.

    Raises:
        AssertionError: If the extraction result is not a ``TaskDetails``.
    """
    invoice_details = TaskExtractor(invoice=text).extract()
    # Narrow the return type for readers and type checkers.
    assert isinstance(invoice_details, TaskDetails)
    return invoice_details
32
+
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr
pdf_to_image.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ from PIL import Image
3
+
4
def pdf_to_image(pdf_files, dpi=300):
    """Render every page of each PDF to a PIL image.

    Args:
        pdf_files: Iterable of binary file-like objects whose ``read()``
            returns the full PDF content as bytes.
        dpi: Target rendering resolution. PDF pages are defined at 72 units
            per inch, so each page is scaled by ``dpi / 72``.

    Returns:
        A flat list of RGB PIL images, one per page, ordered by input file
        and then by page number. Empty when ``pdf_files`` is empty.
    """
    # 72 is the default DPI of the PDF; the zoom factor is the same for all pages.
    zoom = dpi / 72

    pdf_images = []
    for pdf_file in pdf_files:
        pdf_bytes = pdf_file.read()  # Read the uploaded file as bytes
        # The context manager closes the document once its pages are rendered,
        # instead of leaving every opened PDF alive until garbage collection.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            scale = fitz.Matrix(zoom, zoom)
            for page in pdf_document:
                # alpha=False guarantees 3 bytes per pixel, matching "RGB".
                pix = page.get_pixmap(matrix=scale, alpha=False)
                # frombytes copies the samples, so the image stays valid after
                # the pixmap and the document are released.
                pdf_images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
    return pdf_images
20
+
21
+
22
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
pytesseract
pillow
mirascope
groq
google-generativeai
streamlit
pyMuPDF
# Imported directly by app.py, so declared explicitly rather than relying on
# another package pulling them in transitively.
pandas
python-dotenv