# Spaces: Krish30 / File-Information-Extractor
# Sleeping
# File size: 4,897 Bytes
# 270f367


import io
import os
import tempfile

import easyocr
import google.generativeai as genai
import numpy as np  # Make sure to import numpy
import openpyxl
import pandas as pd
import PyPDF2 as pdf
import streamlit as st
from openpyxl.utils import get_column_letter
from PIL import Image

# Configure API key
# The key is read from the GOOGLE_API_KEY environment variable so that no
# credential is stored in source control.
# NOTE(review): a literal key used to be committed on this line; it should be
# treated as compromised and revoked/rotated.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Initialize the OCR reader
reader = easyocr.Reader(['en'])

# Function to get response from Generative AI model
def get_gemini_response(input, *, model_name='gemini-pro'):
    """Send a prompt to a generative model and return the raw response.

    Args:
        input: Prompt content, passed unchanged to ``generate_content``.
            (The name shadows the builtin but is kept so existing keyword
            callers keep working.)
        model_name: Name handed to ``genai.GenerativeModel``. Defaults to
            ``'gemini-pro'``, the model this function always used before.

    Returns:
        The response object returned by ``generate_content``.
    """
    model = genai.GenerativeModel(model_name)
    return model.generate_content(input)

# Convert PDF to text
def input_pdf_text(uploaded_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        uploaded_file: PDF source passed to ``PdfReader`` (in this app, a
            file object from the Streamlit uploader).

    Returns:
        str: The page texts joined with no separator. Pages that yield no
        text contribute an empty string.
    """
    reader_pdf = pdf.PdfReader(uploaded_file)
    # ``or ""`` prevents a page without extractable text from contributing
    # the literal string "None", which ``str(None)`` would produce.
    return "".join(page.extract_text() or "" for page in reader_pdf.pages)

# Extract text from images using EasyOCR
def input_image_text(uploaded_file):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        uploaded_file: Image source accepted by ``PIL.Image.open`` (in this
            app, a JPG or PNG from the Streamlit uploader).

    Returns:
        str: The recognised text fragments joined by single spaces; an empty
        string when nothing is recognised.
    """
    # Open the image using PIL
    image = Image.open(uploaded_file)
    # Normalise to 3-channel RGB before building the array: a palette-mode
    # ("P") or 1-bit image would otherwise become an array of palette indices
    # or booleans rather than pixel intensities.
    image_np = np.array(image.convert("RGB"))
    # Perform OCR on the image; detail=0 returns only the text strings
    text = reader.readtext(image_np, detail=0)
    return ' '.join(text)  # Join the extracted text into a single string

# Extract information based on each criterion
def _first_part_text(response):
    """Return the text of the first part of the first candidate, or "" if absent."""
    candidates = response.candidates
    if not candidates:
        return ""
    parts = candidates[0].content.parts
    if not parts:
        return ""
    return parts[0].text


def extract_information_per_criterion(text, criteria_list):
    """Ask the model for the key points of ``text`` for each criterion.

    One request is made per criterion via ``get_gemini_response``.

    Args:
        text: Document text embedded in every prompt.
        criteria_list: Iterable of criterion names to extract.

    Returns:
        dict: Maps each criterion to the model's answer with asterisks
        removed and surrounding whitespace trimmed. A response that carries
        no candidate or no content part maps to an empty string instead of
        raising ``IndexError``.
    """
    extracted_data = {}
    for criterion in criteria_list:
        prompt = f"Please analyze the following text and extract the key points related to '{criterion}'. Provide the output as a simple string without any extra formatting or labels. Here’s the text:\n{text}"
        response = get_gemini_response(prompt)
        # Remove asterisks before trimming so a stripped bullet or emphasis
        # marker does not leave stray leading/trailing whitespace behind.
        extracted_data[criterion] = _first_part_text(response).replace('*', '').strip()
    return extracted_data



# Store extracted information into a DataFrame
def information_to_df(extracted_data, sr_no):
    """Build a one-row DataFrame from one file's extracted values.

    Args:
        extracted_data: Mapping of criterion name to extracted text.
        sr_no: Serial number stored in the leading "Sr. No" column.

    Returns:
        pandas.DataFrame: One column per criterion, in mapping order,
        preceded by the "Sr. No" column.
    """
    single_row = {name: [value] for name, value in extracted_data.items()}
    frame = pd.DataFrame(single_row)
    frame.insert(0, "Sr. No", sr_no)
    return frame

# Adjust Excel columns to fit content
def adjust_excel_columns(writer, df, *, sheet_name='Sheet1', max_width=255):
    """Size each worksheet column to fit its header and longest cell.

    Args:
        writer: Object whose ``sheets`` mapping contains the target worksheet
            (in this app, an open ``pd.ExcelWriter`` using openpyxl).
        df: DataFrame that was written to the sheet; its columns are matched
            to worksheet columns by position, starting at column A.
        sheet_name: Key of the worksheet in ``writer.sheets``. Defaults to
            ``'Sheet1'``, the name this function always used before.
        max_width: Upper bound applied to every column width. Defaults to
            255, Excel's maximum column width.
    """
    worksheet = writer.sheets[sheet_name]
    for idx, col in enumerate(df.columns, 1):  # 1-indexed.
        longest = len(str(col))
        # Positional access keeps this working when two columns share a name.
        cell_lengths = df.iloc[:, idx - 1].astype(str).map(len)
        # An empty column has no maximum (it would be NaN), so fall back to
        # the header length alone.
        if not cell_lengths.empty:
            longest = max(longest, int(cell_lengths.max()))
        worksheet.column_dimensions[get_column_letter(idx)].width = min(longest + 2, max_width)

# Streamlit App
st.title("File Information Extractor")
st.text("Upload PDFs, JPGs, or PNGs and specify criteria for information extraction")

uploaded_files = st.file_uploader("Upload your files (PDF, JPG, PNG)", type=["pdf", "jpg", "png"], accept_multiple_files=True)

if uploaded_files:
    user_input = st.text_area("Enter the criteria for extracting information, separated by commas.")

    # Split and clean criteria; blank entries (from a trailing or doubled
    # comma) are dropped so no request is made for an empty criterion.
    criteria_list = [
        criterion.strip()
        for criterion in (user_input or "").split(',')
        if criterion.strip()
    ]

    if criteria_list:
        all_dfs = []

        for i, uploaded_file in enumerate(uploaded_files, start=1):
            # Determine file type; only the text-extraction step differs
            # between PDFs and images.
            if uploaded_file.type == "application/pdf":
                file_kind = "PDF"
                text = input_pdf_text(uploaded_file)
            elif uploaded_file.type in ["image/jpeg", "image/png"]:
                file_kind = "Image"
                text = input_image_text(uploaded_file)  # Extract text from image using OCR
            else:
                # Any other MIME type is skipped, as it was before.
                continue

            extracted_data = extract_information_per_criterion(text, criteria_list)

            st.subheader(f"Extracted Information from {file_kind} File {i}")
            st.write(extracted_data)

            all_dfs.append(information_to_df(extracted_data, i))

        # pd.concat raises on an empty list, so only build the workbook when
        # at least one file was processed.
        if all_dfs:
            # Combine all DataFrames into one
            combined_df = pd.concat(all_dfs, ignore_index=True)

            # Build the workbook in memory: nothing is left on disk after the
            # run, and no temp file has to be reopened by name.
            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                combined_df.to_excel(writer, index=False)
                adjust_excel_columns(writer, combined_df)

            st.download_button(
                label="Download Extracted Information as Excel",
                data=excel_buffer.getvalue(),
                file_name="extracted_information.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )