Spaces:

Krish30
/

File-Information-Extractor

Sleeping

App Files Files Community

File-Information-Extractor / app.py

Krish30

Upload 2 files

270f367 verified about 1 year ago

raw

history blame contribute delete

4.9 kB


	import io
	import os
	import tempfile

	import easyocr
	import google.generativeai as genai
	import numpy as np # Make sure to import numpy
	import openpyxl
	import pandas as pd
	import PyPDF2 as pdf
	import streamlit as st
	from openpyxl.utils import get_column_letter
	from PIL import Image

	# Configure the Gemini client.
	# SECURITY: the API key used to be hard-coded on this line. A key that has
	# been committed to a public repository must be treated as leaked and revoked.
	# The key is now read from the GOOGLE_API_KEY environment variable instead.
	_api_key = os.environ.get("GOOGLE_API_KEY")
	if not _api_key:
	    st.error("GOOGLE_API_KEY is not set; configure it as an environment variable or secret.")
	    st.stop()
	genai.configure(api_key=_api_key)

	# Initialize the OCR reader (English only); used by input_image_text
	reader = easyocr.Reader(['en'])

	# Function to get response from Generative AI model
	def get_gemini_response(input, model_name='gemini-pro'):
	    """Send a prompt to a generative model and return its raw response.

	    Args:
	        input: Prompt passed unchanged to ``generate_content``. (The name
	            shadows the builtin but is kept so existing callers still work.)
	        model_name: Identifier of the model to instantiate. Defaults to
	            ``'gemini-pro'``, the value that was previously hard-coded.

	    Returns:
	        The object returned by ``GenerativeModel.generate_content``.
	    """
	    model = genai.GenerativeModel(model_name)
	    return model.generate_content(input)

	# Convert PDF to text
	def input_pdf_text(uploaded_file):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        uploaded_file: Object handed directly to ``pdf.PdfReader``.

	    Returns:
	        A single string with the extracted text of all pages and no
	        separator between pages. A page whose extraction yields a falsy
	        value contributes an empty string.
	    """
	    reader_pdf = pdf.PdfReader(uploaded_file)
	    # ``or ""`` prevents a None result from being rendered as the literal
	    # text "None"; join replaces repeated ``+=`` string concatenation.
	    return "".join(page.extract_text() or "" for page in reader_pdf.pages)

	# Extract text from images using EasyOCR
	def input_image_text(uploaded_file):
	    """Run OCR on an uploaded image and return the recognised text.

	    Args:
	        uploaded_file: Object handed directly to ``PIL.Image.open``.

	    Returns:
	        The text fragments reported by the module-level ``reader``,
	        joined with single spaces.
	    """
	    # The context manager releases the image handle once the pixels are copied.
	    with Image.open(uploaded_file) as image:
	        # Normalise to RGB first: palette ('P'), 1-bit ('1') and CMYK images
	        # otherwise become arrays of palette indices, booleans or four ink
	        # channels rather than plain RGB pixel values.
	        image_np = np.array(image.convert("RGB"))
	    # detail=0 makes readtext return only the recognised strings
	    fragments = reader.readtext(image_np, detail=0)
	    return ' '.join(fragments)

	# Extract information based on each criterion
	def extract_information_per_criterion(text, criteria_list):
	    """Ask the model for the key points of ``text`` for each criterion.

	    One request is sent per criterion through ``get_gemini_response``.

	    Args:
	        text: Document text embedded in every prompt.
	        criteria_list: Iterable of criterion names.

	    Returns:
	        Dict mapping each criterion to the model's answer, stripped of
	        surrounding whitespace and with every ``*`` removed. A criterion
	        whose response carries no candidate or no content part maps to "".
	    """
	    extracted_data = {}
	    for criterion in criteria_list:
	        prompt = (
	            f"Please analyze the following text and extract the key points related to '{criterion}'. "
	            f"Provide the output as a simple string without any extra formatting or labels. "
	            f"Here’s the text:\n{text}"
	        )
	        response = get_gemini_response(prompt)
	        answer = _first_candidate_text(response)
	        extracted_data[criterion] = answer.strip().replace('*', '')  # Remove asterisks
	    return extracted_data


	def _first_candidate_text(response):
	    """Return the text of the first part of the first candidate, or ""."""
	    try:
	        return response.candidates[0].content.parts[0].text
	    except IndexError:
	        # An empty candidates/parts list used to abort the whole run here.
	        # NOTE(review): presumably this happens when the API blocks or
	        # truncates a response; confirm against the client library docs.
	        return ""



	# Store extracted information into a DataFrame
	def information_to_df(extracted_data, sr_no):
	    """Build a one-row DataFrame from a criterion -> text mapping.

	    Args:
	        extracted_data: Dict whose keys become column names and whose
	            values become the single row's cells.
	        sr_no: Value stored in the leading "Sr. No" column.

	    Returns:
	        DataFrame with "Sr. No" as the first column followed by one column
	        per key of ``extracted_data``, in the dict's iteration order.
	    """
	    single_row = {name: [value] for name, value in extracted_data.items()}
	    frame = pd.DataFrame(single_row)
	    # Serial number goes in front of the criterion columns
	    frame.insert(0, "Sr. No", sr_no)
	    return frame

	# Adjust Excel columns to fit content
	def adjust_excel_columns(writer, df, sheet_name='Sheet1', max_width=255):
	    """Size each worksheet column to fit its longest cell or its header.

	    Args:
	        writer: ``pd.ExcelWriter`` to which ``df`` has already been written.
	        df: DataFrame whose columns correspond, in order, to worksheet
	            columns starting at column A.
	        sheet_name: Worksheet to adjust. Defaults to ``'Sheet1'``, the
	            name that was previously hard-coded.
	        max_width: Upper bound applied to every column width. The default
	            of 255 is Excel's documented column-width limit.
	    """
	    worksheet = writer.sheets[sheet_name]
	    has_rows = len(df.index) > 0
	    for idx, col in enumerate(df.columns, 1):  # worksheet columns are 1-indexed
	        # With no rows, Series.max() is NaN and would propagate into the
	        # width, so fall back to the header length alone.
	        longest_cell = int(df[col].astype(str).map(len).max()) if has_rows else 0
	        header_len = len(str(col))
	        # +2 leaves a little padding; the cap keeps long extracted text
	        # from requesting a width beyond max_width.
	        width = min(max(longest_cell, header_len) + 2, max_width)
	        worksheet.column_dimensions[get_column_letter(idx)].width = width

	# Streamlit App: upload files, enter criteria, show the extracted
	# information per file and offer the combined result as an Excel download.
	st.title("File Information Extractor")
	st.text("Upload PDFs, JPGs, or PNGs and specify criteria for information extraction")

	uploaded_files = st.file_uploader(
	    "Upload your files (PDF, JPG, PNG)",
	    type=["pdf", "jpg", "png"],
	    accept_multiple_files=True,
	)

	if uploaded_files:
	    user_input = st.text_area("Enter the criteria for extracting information, separated by commas.")

	    # Split and clean criteria; blank entries (e.g. from "a,,b" or a trailing
	    # comma) are dropped so no request is sent for an empty criterion.
	    criteria_list = [part.strip() for part in user_input.split(',') if part.strip()]

	    if criteria_list:
	        all_dfs = []

	        for i, uploaded_file in enumerate(uploaded_files, start=1):
	            # Choose the text extractor and display label from the MIME type
	            if uploaded_file.type == "application/pdf":
	                file_label = "PDF"
	                text = input_pdf_text(uploaded_file)
	            elif uploaded_file.type in ["image/jpeg", "image/png"]:
	                file_label = "Image"
	                text = input_image_text(uploaded_file)  # Extract text from image using OCR
	            else:
	                # Any other MIME type is skipped
	                continue

	            extracted_data = extract_information_per_criterion(text, criteria_list)

	            st.subheader(f"Extracted Information from {file_label} File {i}")
	            st.write(extracted_data)

	            all_dfs.append(information_to_df(extracted_data, i))

	        if all_dfs:
	            # Combine all DataFrames into one
	            combined_df = pd.concat(all_dfs, ignore_index=True)

	            # Build the workbook in memory. The previous implementation wrote
	            # a NamedTemporaryFile(delete=False) that was never removed, so
	            # each run left another .xlsx file on disk.
	            excel_buffer = io.BytesIO()
	            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
	                combined_df.to_excel(writer, index=False)
	                adjust_excel_columns(writer, combined_df)

	            st.download_button(
	                label="Download Extracted Information as Excel",
	                data=excel_buffer.getvalue(),
	                file_name="extracted_information.xlsx",
	                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	            )
	        else:
	            # pd.concat raises on an empty list, so report the situation instead
	            st.warning("None of the uploaded files could be processed.")