File size: 4,897 Bytes
270f367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

import io
import os
import tempfile

import easyocr
import google.generativeai as genai
import numpy as np  # Make sure to import numpy
import openpyxl
import pandas as pd
import PyPDF2 as pdf
import streamlit as st
from openpyxl.utils import get_column_letter
from PIL import Image

# Configure API key
# SECURITY: the key is read from the GOOGLE_API_KEY environment variable when
# it is set (and non-empty). The literal fallback is kept only so existing
# deployments keep working; it has been committed to source and should be
# rotated and then removed.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or "AIzaSyDm0pOQKmzLMPU9omEOIr8nsFdGld9cuG8")

# Initialize the OCR reader
# NOTE(review): this runs at module level, so it is rebuilt each time the
# script is executed — consider caching it if start-up cost matters.
reader = easyocr.Reader(['en'])

# Function to get response from Generative AI model
def get_gemini_response(input):
    """Send ``input`` to the 'gemini-pro' model and return its response.

    A new ``genai.GenerativeModel`` is built on every call, and whatever
    ``generate_content`` returns is handed back to the caller unmodified.
    """
    return genai.GenerativeModel('gemini-pro').generate_content(input)

# Convert PDF to text
def input_pdf_text(uploaded_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        uploaded_file: The PDF source, passed straight to ``pdf.PdfReader``.

    Returns:
        str: The page texts joined with no separator. A page whose
        extraction yields nothing contributes an empty string.
    """
    reader_pdf = pdf.PdfReader(uploaded_file)
    # `or ""` stops a page without extractable text from being rendered as
    # the literal string "None", which the previous str(...) wrapper produced.
    return "".join(page.extract_text() or "" for page in reader_pdf.pages)

# Extract text from images using EasyOCR
def input_image_text(uploaded_file):
    """Run OCR over an uploaded image and return the recognised text.

    Args:
        uploaded_file: The image source, passed straight to ``Image.open``.

    Returns:
        str: The recognised text fragments joined with single spaces.
    """
    # Open the image using PIL
    image = Image.open(uploaded_file)
    # Normalise to RGB before building the array: palette ("P"), bilevel ("1")
    # and CMYK images would otherwise reach the OCR engine as palette indices,
    # booleans or a misread 4-channel array rather than real pixel colours.
    image_np = np.array(image.convert("RGB"))
    # detail=0 makes readtext return only the text strings
    fragments = reader.readtext(image_np, detail=0)
    return ' '.join(fragments)

# Extract information based on each criterion
def extract_information_per_criterion(text, criteria_list):
    """Ask the model for the key points of ``text`` for each criterion.

    One request is made per criterion via ``get_gemini_response``.

    Args:
        text: The document text embedded in every prompt.
        criteria_list: Iterable of criterion names; each becomes a key of the
            returned dict (a repeated name overwrites the earlier entry).

    Returns:
        dict: Maps each criterion to the model's answer with asterisks removed
        and surrounding whitespace trimmed, or to "" when the response has no
        usable text.
    """
    extracted_data = {}
    for criterion in criteria_list:
        prompt = f"Please analyze the following text and extract the key points related to '{criterion}'. Provide the output as a simple string without any extra formatting or labels. Here’s the text:\n{text}"
        response = get_gemini_response(prompt)
        try:
            raw_text = response.candidates[0].content.parts[0].text
        except (IndexError, AttributeError, ValueError):
            # A response without candidates or parts (presumably what a
            # filtered/blocked request looks like — TODO confirm) must not
            # abort the remaining criteria and files; record it as empty.
            raw_text = ""
        # Remove asterisks first, then strip, so whitespace left behind by a
        # leading or trailing "*" is trimmed as well.
        extracted_data[criterion] = raw_text.replace('*', '').strip()
    return extracted_data



# Store extracted information into a DataFrame
def information_to_df(extracted_data, sr_no):
    """Build a one-row DataFrame from the extracted values.

    Every key of ``extracted_data`` becomes a column holding its value, and a
    leading "Sr. No" column is filled with ``sr_no``.
    """
    single_row = {name: [value] for name, value in extracted_data.items()}
    frame = pd.DataFrame(single_row)
    frame.insert(loc=0, column="Sr. No", value=sr_no)
    return frame

# Adjust Excel columns to fit content
def adjust_excel_columns(writer, df, max_width=255):
    """Size each column of 'Sheet1' to fit its longest value or its header.

    Args:
        writer: Excel writer whose ``sheets`` mapping contains 'Sheet1'.
        df: The DataFrame that was written to that sheet; column ``i`` of the
            frame is assumed to sit in worksheet column ``i`` (1-indexed).
        max_width: Upper bound for any column width. Defaults to 255, the
            maximum column width Excel supports.
    """
    worksheet = writer.sheets['Sheet1']
    for idx, col in enumerate(df.columns, 1):  # 1-indexed.
        # Positional access keeps this working when column labels repeat, and
        # default=0 covers a frame with no rows (Series.max() gave NaN there).
        longest_cell = max((len(str(value)) for value in df.iloc[:, idx - 1]), default=0)
        wanted = max(longest_cell, len(str(col))) + 2
        worksheet.column_dimensions[get_column_letter(idx)].width = min(wanted, max_width)

# Streamlit App
# Page flow: upload files -> enter comma-separated criteria -> extract the
# text of every file -> query the model per criterion -> show the results and
# offer them as a single Excel download.
st.title("File Information Extractor")
st.text("Upload PDFs, JPGs, or PNGs and specify criteria for information extraction")

uploaded_files = st.file_uploader("Upload your files (PDF, JPG, PNG)", type=["pdf", "jpg", "png"], accept_multiple_files=True)

if uploaded_files:
    user_input = st.text_area("Enter the criteria for extracting information, separated by commas.")

    # Split and clean criteria; blank entries (e.g. from a trailing comma) are
    # dropped so they do not cost a model call or create an unnamed column.
    criteria_list = [criterion.strip() for criterion in (user_input or "").split(',') if criterion.strip()]

    if criteria_list:
        # MIME type -> (text extractor, label used in the section heading)
        handlers = {
            "application/pdf": (input_pdf_text, "PDF"),
            "image/jpeg": (input_image_text, "Image"),
            "image/png": (input_image_text, "Image"),
        }
        all_dfs = []

        for i, uploaded_file in enumerate(uploaded_files, start=1):
            handler = handlers.get(uploaded_file.type)
            if handler is None:
                # Previously skipped without any feedback.
                st.warning(f"Skipping file {i}: unsupported type {uploaded_file.type}")
                continue

            extract_text, label = handler
            text = extract_text(uploaded_file)
            extracted_data = extract_information_per_criterion(text, criteria_list)

            st.subheader(f"Extracted Information from {label} File {i}")
            st.write(extracted_data)

            all_dfs.append(information_to_df(extracted_data, i))

        # pd.concat raises on an empty list, so only build the workbook when
        # at least one file was processed.
        if all_dfs:
            # Combine all DataFrames into one
            combined_df = pd.concat(all_dfs, ignore_index=True)

            # Build the workbook in memory: the previous delete=False temp
            # file was never removed, leaving one file behind per run.
            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                combined_df.to_excel(writer, index=False)
                adjust_excel_columns(writer, combined_df)

            st.download_button(
                label="Download Extracted Information as Excel",
                data=excel_buffer.getvalue(),
                file_name="extracted_information.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
            )