Spaces:
Sleeping
Sleeping
import io
import os
import tempfile

import easyocr
import google.generativeai as genai
import numpy as np  # Make sure to import numpy
import openpyxl
import pandas as pd
import PyPDF2 as pdf
import streamlit as st
from openpyxl.utils import get_column_letter
from PIL import Image
# Configure API key
# The key is read from the environment so that no credential is stored in the
# source file. Any key that was previously hard-coded here should be treated
# as leaked and rotated.
_api_key = os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    # Fail early with a readable message instead of an opaque API error later.
    st.error("GOOGLE_API_KEY environment variable is not set.")
    st.stop()
genai.configure(api_key=_api_key)

# Initialize the OCR reader
# Module-level so input_image_text() can reuse one English-language reader.
reader = easyocr.Reader(['en'])
# Function to get response from Generative AI model
def get_gemini_response(input):
    """Send ``input`` to the 'gemini-pro' model and return its raw response.

    Args:
        input: Content passed unchanged to ``generate_content``. The name
            shadows the builtin but is kept because it is part of the
            function's signature.

    Returns:
        The object returned by ``generate_content``; it is not unpacked here.
    """
    return genai.GenerativeModel('gemini-pro').generate_content(input)
# Convert PDF to text
def input_pdf_text(uploaded_file):
    """Return the text of every page of a PDF concatenated into one string.

    Args:
        uploaded_file: Object handed directly to ``pdf.PdfReader`` (the app
            passes a Streamlit upload).

    Returns:
        str: The page texts in page order, with no separator between pages.
    """
    reader_pdf = pdf.PdfReader(uploaded_file)
    # ``or ""`` prevents a page whose extraction yields None from adding the
    # literal string "None" to the result (which str(None) would do).
    return "".join(page.extract_text() or "" for page in reader_pdf.pages)
# Extract text from images using EasyOCR
def input_image_text(uploaded_file):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        uploaded_file: Path or file-like object that ``Image.open`` can read.

    Returns:
        str: All recognised text fragments joined by single spaces.
    """
    # Open the image using PIL; the context manager releases PIL's handle.
    with Image.open(uploaded_file) as opened:
        # Grayscale, RGB and RGBA images convert to arrays of real pixel
        # values. Other modes (palette "P", bilevel "1", "CMYK", ...) give
        # index, boolean or non-RGB channel data, so normalise those to RGB.
        if opened.mode in ("L", "RGB", "RGBA"):
            image_np = np.array(opened)
        else:
            image_np = np.array(opened.convert("RGB"))
    # detail=0 makes readtext return a plain list of strings.
    fragments = reader.readtext(image_np, detail=0)
    return ' '.join(fragments)
# Extract information based on each criterion
def extract_information_per_criterion(text, criteria_list):
    """Ask the model for the key points of ``text`` for each criterion.

    One request is made per criterion via ``get_gemini_response``.

    Args:
        text: Document text embedded in every prompt.
        criteria_list: Iterable of criterion names; each becomes a result key.

    Returns:
        dict: Maps each criterion to the model's answer with asterisks
        removed and surrounding whitespace stripped. A criterion whose
        response carries no candidate text maps to ``""``.
    """
    extracted_data = {}
    for criterion in criteria_list:
        prompt = f"Please analyze the following text and extract the key points related to '{criterion}'. Provide the output as a simple string without any extra formatting or labels. Here’s the text:\n{text}"
        response = get_gemini_response(prompt)
        extracted_data[criterion] = _first_candidate_text(response)
    return extracted_data


def _first_candidate_text(response):
    """Return the cleaned text of the first part of the first candidate, or ''."""
    try:
        raw = response.candidates[0].content.parts[0].text
    except (IndexError, AttributeError):
        # No candidates or no parts: record an empty answer for this
        # criterion instead of aborting the whole extraction run.
        return ""
    # Remove asterisks first so whitespace left behind by a leading "* "
    # bullet is stripped as well.
    return raw.replace('*', '').strip()
# Store extracted information into a DataFrame
def information_to_df(extracted_data, sr_no):
    """Build a DataFrame from one file's extracted data.

    Args:
        extracted_data: Mapping of criterion name to extracted text.
        sr_no: Value placed in the leading "Sr. No" column.

    Returns:
        pandas.DataFrame: One column per criterion, preceded by "Sr. No".
    """
    # Each value is wrapped in a one-element list so it forms a single row.
    columns = {name: [value] for name, value in extracted_data.items()}
    frame = pd.DataFrame(columns)
    frame.insert(loc=0, column="Sr. No", value=sr_no)
    return frame
# Adjust Excel columns to fit content
def adjust_excel_columns(writer, df, *, sheet_name='Sheet1', max_width=255):
    """Size each worksheet column to fit its longest value or header.

    Args:
        writer: Object exposing ``sheets[sheet_name]`` (the app passes a
            ``pd.ExcelWriter`` using the openpyxl engine).
        df: DataFrame that was written to the sheet; its column order
            determines which worksheet column each width applies to.
        sheet_name: Worksheet to adjust. Defaults to 'Sheet1', the sheet the
            original implementation always used.
        max_width: Upper bound for any column width. Defaults to 255, the
            largest column width Excel supports, so long extracted text
            cannot produce an out-of-range width.
    """
    worksheet = writer.sheets[sheet_name]
    for idx, col in enumerate(df.columns, 1):  # 1-indexed.
        # Positional access works with duplicate column labels, and
        # default=0 avoids a NaN width when the frame has no rows.
        longest_cell = max((len(str(value)) for value in df.iloc[:, idx - 1]), default=0)
        # str(col) keeps non-string column labels from raising in len().
        longest = max(longest_cell, len(str(col)))
        worksheet.column_dimensions[get_column_letter(idx)].width = min(longest + 2, max_width)
# Streamlit App
# Flow: upload files -> enter comma-separated criteria -> extract text per
# file -> query the model per criterion -> show results -> offer one combined
# Excel workbook for download.
st.title("File Information Extractor")
st.text("Upload PDFs, JPGs, or PNGs and specify criteria for information extraction")

uploaded_files = st.file_uploader(
    "Upload your files (PDF, JPG, PNG)",
    type=["pdf", "jpg", "png"],
    accept_multiple_files=True,
)

if uploaded_files:
    user_input = st.text_area("Enter the criteria for extracting information, separated by commas.")
    # Split and clean criteria; blank entries (e.g. from a trailing comma)
    # are dropped so no model request is made for an empty criterion.
    criteria_list = [
        criterion.strip()
        for criterion in (user_input or "").split(',')
        if criterion.strip()
    ]

    if criteria_list:
        # MIME type -> (label used in the subheader, text extraction function).
        extractors = {
            "application/pdf": ("PDF", input_pdf_text),
            "image/jpeg": ("Image", input_image_text),
            "image/png": ("Image", input_image_text),
        }

        all_dfs = []
        for i, uploaded_file in enumerate(uploaded_files, start=1):
            handler = extractors.get(uploaded_file.type)
            if handler is None:
                # Unrecognised MIME types are skipped, as before.
                continue
            label, extract_text = handler
            text = extract_text(uploaded_file)
            extracted_data = extract_information_per_criterion(text, criteria_list)
            st.subheader(f"Extracted Information from {label} File {i}")
            st.write(extracted_data)
            all_dfs.append(information_to_df(extracted_data, i))

        # pd.concat raises on an empty list, so only build the workbook when
        # at least one file was processed.
        if all_dfs:
            # Combine all DataFrames into one
            combined_df = pd.concat(all_dfs, ignore_index=True)

            # Write the workbook to memory: no temporary file is left on disk.
            excel_buffer = io.BytesIO()
            with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
                combined_df.to_excel(writer, index=False)
                adjust_excel_columns(writer, combined_df)

            st.download_button(
                label="Download Extracted Information as Excel",
                data=excel_buffer.getvalue(),
                file_name="extracted_information.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )