Spaces:
Sleeping
Sleeping
File size: 4,897 Bytes
270f367 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import io
import os
import tempfile

import easyocr
import google.generativeai as genai
import numpy as np  # Make sure to import numpy
import openpyxl
import pandas as pd
import PyPDF2 as pdf
import streamlit as st
from openpyxl.utils import get_column_letter
from PIL import Image
# Configure API key.
# SECURITY: the key is read from the environment instead of being embedded in
# the source. Any key that was previously committed to this file should be
# treated as compromised and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Nothing below can work without a key, so stop this script run with a
    # readable message rather than failing later inside an API call.
    st.error("The GOOGLE_API_KEY environment variable is not set.")
    st.stop()
genai.configure(api_key=GOOGLE_API_KEY)
# Initialize the OCR reader
reader = easyocr.Reader(['en'])
# Function to get response from Generative AI model
def get_gemini_response(input, model_name='gemini-pro'):
    """Send a prompt to a Gemini model and return the raw response object.

    Args:
        input: Prompt handed unchanged to ``generate_content``. The name
            shadows the ``input`` builtin; it is kept so that existing
            keyword callers keep working.
        model_name: Model identifier passed to ``genai.GenerativeModel``.
            Defaults to ``'gemini-pro'``, the value that used to be
            hard-coded, so existing calls behave as before.

    Returns:
        Whatever ``generate_content`` returns for the prompt.
    """
    model = genai.GenerativeModel(model_name)
    return model.generate_content(input)
# Convert PDF to text
def input_pdf_text(uploaded_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        uploaded_file: Object passed to ``pdf.PdfReader`` (in this app, an
            uploaded file from the Streamlit uploader).

    Returns:
        A single string with the extracted text of all pages, with no
        separator inserted between pages.
    """
    reader_pdf = pdf.PdfReader(uploaded_file)
    # ``or ""`` keeps a page with no extractable text from contributing the
    # literal string "None", which ``str(None)`` would have produced.
    return "".join(page.extract_text() or "" for page in reader_pdf.pages)
# Extract text from images using EasyOCR
def input_image_text(uploaded_file):
    """Run OCR on an image and return the recognised text as one string.

    Args:
        uploaded_file: Path or file-like object accepted by ``Image.open``.

    Returns:
        The text fragments reported by the module-level EasyOCR ``reader``,
        joined with single spaces.
    """
    with Image.open(uploaded_file) as image:
        # Normalise to 3-channel RGB first: palette ("P") or bilevel ("1")
        # images otherwise convert to arrays of palette indices / booleans,
        # which are not pixel intensities the OCR step can work with.
        image_np = np.array(image.convert("RGB"))
    # detail=0 makes readtext return only the recognised strings.
    fragments = reader.readtext(image_np, detail=0)
    return ' '.join(fragments)
# Extract information based on each criterion
def extract_information_per_criterion(text, criteria_list):
    """Ask the model, once per criterion, for the matching key points in *text*.

    Args:
        text: Document text to analyse; embedded verbatim in each prompt.
        criteria_list: Iterable of criterion names. One model request is
            made per entry.

    Returns:
        A dict mapping each criterion to the model's answer with surrounding
        whitespace stripped and asterisks removed. A criterion whose response
        contains no candidate or no content part maps to an empty string.
    """
    extracted_data = {}
    for criterion in criteria_list:
        prompt = (
            f"Please analyze the following text and extract the key points related to '{criterion}'. "
            "Provide the output as a simple string without any extra formatting or labels. "
            f"Here’s the text:\n{text}"
        )
        response = get_gemini_response(prompt)
        try:
            answer = response.candidates[0].content.parts[0].text
        except IndexError:
            # No candidate / no part came back (for example a blocked
            # response); record an empty answer instead of aborting the
            # whole extraction run.
            answer = ""
        # Remove asterisks (markdown emphasis markers) from the answer.
        extracted_data[criterion] = answer.strip().replace('*', '')
    return extracted_data
# Store extracted information into a DataFrame
def information_to_df(extracted_data, sr_no):
    """Build a one-row DataFrame from the extracted values of a single file.

    Args:
        extracted_data: Mapping of criterion name to extracted value.
        sr_no: Serial number stored in the leading "Sr. No" column.

    Returns:
        A DataFrame with "Sr. No" as the first column followed by one column
        per criterion, in the mapping's iteration order.
    """
    # Iterate items() directly; looking each key up again with a default was
    # redundant because the keys come from the same mapping.
    row = {criterion: [value] for criterion, value in extracted_data.items()}
    df = pd.DataFrame(row)
    df.insert(0, "Sr. No", sr_no)
    return df
# Adjust Excel columns to fit content
def adjust_excel_columns(writer, df, sheet_name='Sheet1', max_width=255):
    """Size each worksheet column to fit its longest value or its header.

    Args:
        writer: Excel writer whose ``sheets`` mapping contains *sheet_name*.
        df: DataFrame that was written to that sheet; column *n* of the frame
            is matched to worksheet column *n* (1-indexed).
        sheet_name: Worksheet to adjust. Defaults to ``'Sheet1'``, the name
            that used to be hard-coded.
        max_width: Upper bound applied to every computed width. The default
            of 255 is the maximum column width Excel documents.
    """
    worksheet = writer.sheets[sheet_name]
    for idx, col in enumerate(df.columns, 1):  # 1-indexed.
        # Select by position so duplicate column labels still yield a Series.
        lengths = df.iloc[:, idx - 1].astype(str).map(len)
        # With zero rows .max() is NaN, which must not end up as a width.
        longest_cell = int(lengths.max()) if len(lengths) else 0
        # str() lets non-string labels (e.g. integers) be measured too.
        max_length = max(longest_cell, len(str(col)))
        width = min(max_length + 2, max_width)
        worksheet.column_dimensions[get_column_letter(idx)].width = width
# Streamlit App
# Flow: upload files -> enter comma-separated criteria -> extract text from
# each file (PDF parsing or OCR) -> query the model per criterion -> show the
# results and offer them as a single Excel download.
st.title("File Information Extractor")
st.text("Upload PDFs, JPGs, or PNGs and specify criteria for information extraction")
uploaded_files = st.file_uploader("Upload your files (PDF, JPG, PNG)", type=["pdf", "jpg", "png"], accept_multiple_files=True)
if uploaded_files:
    user_input = st.text_area("Enter the criteria for extracting information, separated by commas.")
    if user_input:
        # Split and clean criteria; blank entries (e.g. from a trailing
        # comma) are dropped so no model request is made for them.
        criteria_list = [
            criterion.strip()
            for criterion in user_input.split(',')
            if criterion.strip()
        ]
        all_dfs = []
        if criteria_list:
            for i, uploaded_file in enumerate(uploaded_files, start=1):
                # Determine file type and pick the matching text extractor.
                if uploaded_file.type == "application/pdf":
                    file_label = "PDF"
                    text = input_pdf_text(uploaded_file)
                elif uploaded_file.type in ["image/jpeg", "image/png"]:
                    file_label = "Image"
                    text = input_image_text(uploaded_file)  # Extract text from image using OCR
                else:
                    # Any other MIME type is skipped, as before.
                    continue
                extracted_data = extract_information_per_criterion(text, criteria_list)
                st.subheader(f"Extracted Information from {file_label} File {i}")
                st.write(extracted_data)
                all_dfs.append(information_to_df(extracted_data, i))
        # pd.concat raises on an empty list, so only build the workbook when
        # at least one file was processed.
        if all_dfs:
            # Combine all DataFrames into one
            combined_df = pd.concat(all_dfs, ignore_index=True)
            # Build the workbook in memory: no temporary file is left behind
            # on disk after each rerun of the script.
            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                combined_df.to_excel(writer, index=False)
                adjust_excel_columns(writer, combined_df)
            st.download_button(
                label="Download Extracted Information as Excel",
                data=excel_buffer.getvalue(),
                file_name="extracted_information.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )
|