Krish30 committed on
Commit
270f367
·
verified ·
1 Parent(s): f58ed7c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +121 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import io
import os
import tempfile

import easyocr
import google.generativeai as genai
import numpy as np  # Make sure to import numpy
import openpyxl
import pandas as pd
import PyPDF2 as pdf
import streamlit as st
from openpyxl.utils import get_column_letter
from PIL import Image
12
+
13
# Configure API key
# The key is read from the environment so that no secret lives in source
# control. NOTE(review): the key that was previously committed here is public
# and should be revoked and rotated.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    # Fail with a readable message instead of an opaque API error later on.
    st.error("GOOGLE_API_KEY environment variable is not set.")
    st.stop()
genai.configure(api_key=_api_key)
15
+
16
# Initialize the OCR reader
# Streamlit re-executes this script on every user interaction; caching the
# reader as a resource avoids rebuilding the OCR model on each rerun.
@st.cache_resource
def _load_ocr_reader():
    """Create the English EasyOCR reader (cached by Streamlit across reruns)."""
    return easyocr.Reader(['en'])


reader = _load_ocr_reader()
18
+
19
+ # Function to get response from Generative AI model
20
def get_gemini_response(input):
    """Send ``input`` to the 'gemini-pro' model and return its response object.

    Args:
        input: The prompt passed straight to ``generate_content``.

    Returns:
        Whatever ``GenerativeModel.generate_content`` returns for the prompt.
    """
    return genai.GenerativeModel('gemini-pro').generate_content(input)
24
+
25
+ # Convert PDF to text
26
def input_pdf_text(uploaded_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        uploaded_file: The PDF source handed to ``pdf.PdfReader``.

    Returns:
        A single string made of each page's extracted text with no separator.
    """
    reader_pdf = pdf.PdfReader(uploaded_file)
    # ``or ""`` ensures that, should a page yield no text object at all, it
    # contributes nothing rather than the literal string "None".
    return "".join(page.extract_text() or "" for page in reader_pdf.pages)
33
+
34
+ # Extract text from images using EasyOCR
35
def input_image_text(uploaded_file):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        uploaded_file: An image source accepted by ``PIL.Image.open``.

    Returns:
        The recognised text fragments joined with single spaces.
    """
    # Open the image using PIL; the context manager releases PIL's resources.
    with Image.open(uploaded_file) as image:
        # Normalise to RGB first: palette, 1-bit or CMYK images would
        # otherwise become index/boolean/4-channel arrays that do not
        # represent the visible pixels.
        image_np = np.array(image.convert("RGB"))
    # detail=0 makes readtext return a plain list of strings.
    fragments = reader.readtext(image_np, detail=0)
    return ' '.join(fragments)
43
+
44
+ # Extract information based on each criterion
45
def extract_information_per_criterion(text, criteria_list):
    """Ask the model for the key points of ``text`` for each criterion.

    One model request is issued per criterion.

    Args:
        text: The document text to analyse.
        criteria_list: Iterable of criterion names.

    Returns:
        A dict mapping each criterion to the model's answer with surrounding
        whitespace and all asterisks removed. A criterion whose response
        carries no candidate or no content part maps to an empty string.
    """
    extracted_data = {}
    for criterion in criteria_list:
        prompt = f"Please analyze the following text and extract the key points related to '{criterion}'. Provide the output as a simple string without any extra formatting or labels. Here’s the text:\n{text}"
        response = get_gemini_response(prompt)
        try:
            answer = response.candidates[0].content.parts[0].text
        except IndexError:
            # No candidate / no parts came back (e.g. the response was
            # withheld); record an empty answer instead of crashing the
            # whole run.
            answer = ""
        extracted_data[criterion] = answer.strip().replace('*', '')  # Remove asterisks
    return extracted_data
53
+
54
+
55
+
56
+ # Store extracted information into a DataFrame
57
def information_to_df(extracted_data, sr_no):
    """Build a one-row DataFrame from the extracted values.

    Args:
        extracted_data: Mapping of criterion name to extracted text.
        sr_no: Serial number placed in the leading "Sr. No" column.

    Returns:
        A DataFrame with "Sr. No" first, followed by one column per criterion.
    """
    single_row = {name: [value] for name, value in extracted_data.items()}
    frame = pd.DataFrame(single_row)
    frame.insert(0, "Sr. No", sr_no)
    return frame
62
+
63
+ # Adjust Excel columns to fit content
64
def adjust_excel_columns(writer, df):
    """Size each column of 'Sheet1' to fit its header and longest cell.

    Args:
        writer: The Excel writer whose ``sheets['Sheet1']`` worksheet is
            adjusted in place.
        df: The DataFrame that was written to that sheet; its column order
            determines which worksheet column each width applies to.
    """
    max_excel_width = 255  # Excel does not accept wider columns.
    worksheet = writer.sheets['Sheet1']
    for idx, col in enumerate(df.columns, 1):  # 1-indexed.
        # default=0 keeps an empty frame from producing a NaN width.
        longest_cell = max((len(str(value)) for value in df[col]), default=0)
        # str() lets non-string column labels be measured as well.
        width = max(longest_cell, len(str(col))) + 2
        worksheet.column_dimensions[get_column_letter(idx)].width = min(width, max_excel_width)
69
+
70
+ # Streamlit App
71
st.title("File Information Extractor")
st.text("Upload PDFs, JPGs, or PNGs and specify criteria for information extraction")

uploaded_files = st.file_uploader(
    "Upload your files (PDF, JPG, PNG)",
    type=["pdf", "jpg", "png"],
    accept_multiple_files=True,
)

if uploaded_files:
    user_input = st.text_area("Enter the criteria for extracting information, separated by commas.")

    if user_input:
        # Split and clean criteria. Blank entries (e.g. from a trailing comma)
        # are dropped so they do not trigger a model request for ''.
        criteria_list = [part.strip() for part in user_input.split(',') if part.strip()]

        if not criteria_list:
            st.warning("Please enter at least one criterion.")
        else:
            all_dfs = []

            for i, uploaded_file in enumerate(uploaded_files, start=1):
                # Determine file type; both kinds share the same downstream steps.
                if uploaded_file.type == "application/pdf":
                    file_kind = "PDF"
                    text = input_pdf_text(uploaded_file)
                elif uploaded_file.type in ["image/jpeg", "image/png"]:
                    file_kind = "Image"
                    text = input_image_text(uploaded_file)  # Extract text from image using OCR
                else:
                    # Unrecognised MIME types are skipped.
                    continue

                extracted_data = extract_information_per_criterion(text, criteria_list)

                st.subheader(f"Extracted Information from {file_kind} File {i}")
                st.write(extracted_data)

                all_dfs.append(information_to_df(extracted_data, i))

            # pd.concat raises on an empty list, so only export when at least
            # one file was processed.
            if all_dfs:
                # Combine all DataFrames into one
                combined_df = pd.concat(all_dfs, ignore_index=True)

                # Build the workbook in memory: no temp file is left behind on
                # disk and nothing needs to be reopened for the download.
                excel_buffer = io.BytesIO()
                with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                    combined_df.to_excel(writer, index=False)
                    adjust_excel_columns(writer, combined_df)

                st.download_button(
                    label="Download Extracted Information as Excel",
                    data=excel_buffer.getvalue(),
                    file_name="extracted_information.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                )
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ google.generativeai
4
+ python-dotenv
5
+ openpyxl
6
+ numpy
7
+ pandas
8
+ easyocr