danial2020 committed on
Commit
3f4a167
·
verified ·
1 Parent(s): fe5c2b9

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +58 -0
  2. image_to_text.py +9 -0
  3. mirascope_extractor.py +33 -0
  4. packages.txt +1 -0
  5. pdf_to_image.py +22 -0
  6. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Streamlit front end: upload CV PDFs, OCR them, extract fields, export CSV.

Pipeline per uploaded PDF:
    pdf_to_image  -> one PIL image per page
    image_to_text -> OCR text per page, concatenated per PDF
    extractor     -> structured record per PDF
The records are shown as a table and offered as a CSV download.
"""
import glob
import os

import google.generativeai as genai
import pandas as pd
import streamlit as st
from dotenv import load_dotenv

from image_to_text import image_to_text
from mirascope_extractor import extractor
from pdf_to_image import pdf_to_image

# Load settings such as OPENAI_API_KEY from a local .env file, if one exists.
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
# NOTE(review): this passes the *OpenAI* key to the Google Gemini SDK, while
# the extractor imported above is built on mirascope's OpenAI extractor.
# Kept so module-level behavior is unchanged -- confirm whether it is needed.
genai.configure(api_key=openai_api_key)

# Browser-tab title now matches the on-page title (it previously read
# "Invoice Extractor", a leftover from a different app).
st.set_page_config(page_title="Gen AI CV Extraction")
st.title("Gen AI CV Extraction")
uploaded_files = st.file_uploader("Choose PDF files", accept_multiple_files=True, type="pdf")

# The button is only rendered once at least one file has been uploaded.
if uploaded_files and st.button('Extract'):
    # One list of page images per uploaded PDF, in upload order.
    pages_per_pdf = pdf_to_image(uploaded_files)

    # OCR every page and join the page texts into one string per PDF.
    # The raw text is deliberately not printed: CVs contain personal data.
    all_texts = [
        "".join(image_to_text(page) for page in pages)
        for pages in pages_per_pdf
    ]

    # One structured record per PDF. Building the frame once avoids the
    # quadratic concat-in-a-loop and gives rows a clean 0..n-1 index.
    records = [extractor(text).model_dump() for text in all_texts]
    results_df = pd.DataFrame(records)

    st.write(results_df)
    csv = results_df.to_csv(index=False)
    st.download_button(
        label='Click to Download CSV',
        data=csv,
        file_name='Extracted_data.csv',
        mime='text/csv',
    )
58
+
image_to_text.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import pytesseract
3
+ import io
4
+ pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
5
def image_to_text(images):
    """Run Tesseract OCR on a single page image and return its text.

    Args:
        images: One ``PIL.Image.Image`` (despite the plural name, which is
            kept so existing callers keep working).

    Returns:
        The recognized text as a ``str``, as produced by
        ``pytesseract.image_to_string``.
    """
    # pytesseract accepts a PIL image directly, so the previous in-memory
    # PNG encode + re-open round trip was redundant work (and failed for
    # image modes that PNG cannot store).
    return pytesseract.image_to_string(images)
mirascope_extractor.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mirascope.openai import OpenAIExtractor
2
+ from mirascope.gemini import GeminiExtractor
3
+ from mirascope.groq import GroqExtractor
4
+
5
+ from retry import retry
6
+
7
+ from pydantic import FilePath, BaseModel
8
+ from typing import List, Type
9
+
10
+
11
class TaskDetails(BaseModel):
    # Structured record extracted from one resume. This model is the
    # `extract_schema` of TaskExtractor, so its field names are the keys of
    # the extracted result. Every field is required (none has a default).
    #
    # NOTE: documented with comments rather than a docstring so that the
    # schema generated from this model stays exactly as it was.

    # Contact details.
    name: str
    email: str
    phone_number: str

    # Profile content.
    skills: List[str]
    education: str
    past_company_experience: str
    about_section: str
19
+
20
class TaskExtractor(OpenAIExtractor[TaskDetails]):
    # Extractor that fills a TaskDetails record from raw resume text.

    # Schema the extraction result must conform to.
    extract_schema: Type[TaskDetails] = TaskDetails

    # Prompt sent to the model; `{resume}` is filled from the `resume`
    # field declared below.
    prompt_template = """
    Extract the Resume details from the following Resume:
    {resume}
    """

    # Raw resume text supplied by the caller.
    resume: str
27
+
28
@retry(tries=3, delay=2, backoff=2)
def extractor(text):
    """Extract structured resume details from raw resume text.

    The call is attempted up to three times; the wait between attempts
    starts at 2 seconds and doubles after each failure. Any exception
    raised inside this function triggers a retry, and the last one
    propagates once the attempts are exhausted.

    Args:
        text: The resume contents as plain text.

    Returns:
        A ``TaskDetails`` instance populated from ``text``.

    Raises:
        TypeError: If the extractor returns something other than a
            ``TaskDetails`` instance (after retries are exhausted).
    """
    task_details = TaskExtractor(resume=text).extract()
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, which would silently disable this validation.
    if not isinstance(task_details, TaskDetails):
        raise TypeError(
            "Expected TaskDetails from extraction, got "
            f"{type(task_details).__name__}"
        )
    return task_details
33
+
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr
pdf_to_image.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ from PIL import Image
3
+
4
def pdf_to_image(pdf_files, dpi=300):
    """Render every page of each PDF to an RGB PIL image.

    Args:
        pdf_files: Iterable of binary file-like objects; each must provide
            a ``read()`` method returning the PDF's bytes.
        dpi: Target rendering resolution. PDF pages are defined at 72 units
            per inch, so each page is scaled by ``dpi / 72``.

    Returns:
        A list with one entry per input PDF, in input order. Each entry is
        the list of that PDF's pages as ``PIL.Image.Image`` objects, in
        page order.

    Raises:
        ValueError: If ``dpi`` is not a positive number.
    """
    if dpi <= 0:
        raise ValueError(f"dpi must be positive, got {dpi!r}")

    # Scale factor is the same for every page of every document.
    zoom = dpi / 72  # 72 is the default DPI of the PDF

    pdf_images = []
    for pdf_file in pdf_files:
        pdf_bytes = pdf_file.read()  # Read the uploaded file as bytes
        # Context manager guarantees the document is closed even if
        # rendering a page raises (the original never closed it).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            mat = fitz.Matrix(zoom, zoom)
            images = []
            for page in pdf_document:
                # alpha=False keeps the pixmap at 3 bytes per pixel, which
                # is what the "RGB" decode below requires.
                pix = page.get_pixmap(matrix=mat, alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
        pdf_images.append(images)
    return pdf_images
20
+
21
+
22
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pytesseract
2
+ pillow
3
+ mirascope
4
+ groq
5
+ google-generativeai
6
+ streamlit
7
+ pyMuPDF
8
+ retry