import os
import re
import json
from io import BytesIO

import streamlit as st
import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader, PdfWriter
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_KEY = os.getenv("AZURE_KEY")
if not OPENAI_API_KEY or not AZURE_KEY:
    # Fail fast with a clear message instead of a TypeError or auth error later
    raise EnvironmentError("OPENAI_API_KEY and AZURE_KEY must be set in .env")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

openaiClient = OpenAI()
# Initialize the DocumentAnalysisClient
document_analysis_client = DocumentAnalysisClient(
    endpoint="https://youdata-demo.cognitiveservices.azure.com/",  # Replace with your Azure endpoint
    credential=AzureKeyCredential(AZURE_KEY)
)
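# Sketch (assumption, not part of the original app): the endpoint could be read
# from the same .env file instead of being hard-coded above:
#
#     document_analysis_client = DocumentAnalysisClient(
#         endpoint=os.getenv("AZURE_ENDPOINT"),
#         credential=AzureKeyCredential(AZURE_KEY),
#     )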
# Function to split a PDF and keep only the first 4 pages.
# (Kept for reference; extract_text_from_pdf below uses the generic split_pdf instead.)
def split_pdf_to_first_4_pages(pdf_file, output_pdf_path):
    reader = PdfReader(pdf_file)
    writer = PdfWriter()
    # Only add the first 4 pages (or fewer, if the document is shorter)
    for i in range(min(4, len(reader.pages))):
        writer.add_page(reader.pages[i])
    # Write the small PDF to a file
    with open(output_pdf_path, 'wb') as output_pdf:
        writer.write(output_pdf)
# Generic splitter: extract pages start_page..end_page (1-indexed, inclusive)
def split_pdf(pdf_path, start_page, end_page, output_pdf_path):
    reader = PdfReader(pdf_path)
    writer = PdfWriter()
    # Clamp end_page so short documents don't raise an IndexError
    end_page = min(end_page, len(reader.pages))
    for i in range(start_page - 1, end_page):
        writer.add_page(reader.pages[i])
    with open(output_pdf_path, 'wb') as output_pdf:
        writer.write(output_pdf)
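# Example usage (illustrative): write pages 2-3 of a form to their own file.
#
#     split_pdf("form.pdf", 2, 3, "pages_2_3.pdf")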
# Function to extract text from the first 4 pages of a PDF
def extract_text_from_pdf(pdf_file):
    # Split the original PDF to get a smaller PDF with only the first 4 pages
    small_pdf_path = "small_document.pdf"
    split_pdf(pdf_file, 1, 4, small_pdf_path)

    extracted_text = ""

    # Check that the small PDF has the expected number of pages
    with open(small_pdf_path, "rb") as f:
        reader = PdfReader(f)
        number_of_pages = len(reader.pages)
        print(f"Number of pages in the small PDF: {number_of_pages}")

    # Read the smaller PDF for analysis
    with open(small_pdf_path, "rb") as f:
        document = f.read()

    # Analyze the smaller document
    poller = document_analysis_client.begin_analyze_document("prebuilt-document", document)
    result = poller.result()

    # Check how many pages were actually processed by Azure
    print(f"Number of pages processed: {len(result.pages)}")

    # Collect the text of every processed page; join lines with newlines so
    # words on adjacent lines don't run together
    for page in result.pages:
        for line in page.lines:
            extracted_text += line.content + "\n"

    # Optional: analyze each page separately if needed. Left disabled here,
    # because it re-extracts the same pages and would duplicate extracted_text
    # (and double the tokens sent to GPT later).
    # for i in range(1, number_of_pages + 1):
    #     split_pdf(pdf_file, i, i, f"page_{i}.pdf")
    #     with open(f"page_{i}.pdf", "rb") as f:
    #         document = f.read()
    #     poller = document_analysis_client.begin_analyze_document("prebuilt-document", document)
    #     result = poller.result()
    #     for page in result.pages:
    #         for line in page.lines:
    #             extracted_text += line.content + "\n"

    # Clean up the small PDF file if needed
    # os.remove(small_pdf_path)
    return extracted_text
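# Side note (sketch, not used by this app): the "prebuilt-document" analysis
# result also exposes detected key-value pairs directly, which could complement
# or cross-check the GPT extraction below:
#
#     analysis = document_analysis_client.begin_analyze_document(
#         "prebuilt-document", pdf_bytes).result()
#     for kv in analysis.key_value_pairs:
#         if kv.key and kv.value:
#             print(kv.key.content, "->", kv.value.content)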
output_structure = {
    "Name": "String",
    "Phone No": "List",
    "Designation": "String",
    "Date Of Joining": "String",
    "Present Address": "String",
    "Permanent Address": "String",
    "PAN No": "String",
    "UAN No": "String",
    "AADHAR No": "String",
    "Site Code": "String",
    "Is Mobile Linked with UAN": "String",
    "Uniform Type": "String",
    "Shoe Size": "String",
    "Height": "String",
    "Weight": "String",
    "Waist Size": "String",
    "Chest Size": "String",
    "Do you have any major/minor surgery?": "String",
    "Surgery Details": "String",
    "Identification Mark": "String",
    "Have you ever worked with Govt?": "String",
    "Have you ever worked with State Govt?": "String",
    "Have you ever worked with PSU?": "String",
    "Have you ever worked with Statutory Bodies?": "String",
    "Have you ever been convicted?": "String",
    "Details of the conviction": "String",
    "Father Details": "Dict",
    "Mother Details": "Dict",
    "Spouse Details": "Dict",
    "Brother/Sister Details": "Dict",
    "Children Details": "Dict",
    "Nominee 1": "String",
    "Nominee 2": "String",
    "Reference 1": "String",
    "Reference 2": "String",
    "Account Holder Name": "String",
    "Bank Account No": "String",
    "IFSC Code": "String",
    "Bank Name": "String",
    "Branch Location": "String",
    # "Is Signed?": "String"
}
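# Illustrative example (made-up values) of the JSON shape the prompt asks for:
#
#     {
#         "Name": "Jane Doe",
#         "Phone No": ["9876543210"],
#         "Designation": "Security Guard",
#         "Date Of Joining": "01/04/2024",
#         ...
#     }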
# Function to create key-value pairs using GPT-3.5 Turbo
def create_key_value_pairs_from_text(text):
    # Make a request to the OpenAI GPT-3.5 Turbo model
    response = openaiClient.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": f"""Extract the important details of a person from this text. Always return the response as JSON key-value pairs from the following text: {text}.
Here is the desired output structure:
{output_structure}. Always write "No" for the surgery and conviction questions. Only add the details that are present in the given text.""",
            }
        ]
    )
    # Extract the content from the response
    response_content = response.choices[0].message.content
    # Attempt to parse the response content as JSON
    try:
        key_value_pairs = json.loads(response_content)
    except json.JSONDecodeError:
        # If the response is not valid JSON, return the raw text instead
        key_value_pairs = response_content
    return key_value_pairs
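# Sketch (assumption, not what the call above uses): OpenAI models from
# gpt-3.5-turbo-1106 onward support JSON mode, which makes the json.loads()
# above far more reliable:
#
#     response = openaiClient.chat.completions.create(
#         model="gpt-3.5-turbo-1106",
#         response_format={"type": "json_object"},
#         messages=[...],  # same prompt as above
#     )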
# Function to extract a JSON block from text using regex
def extract_json_from_text(text):
    # Find a fenced block; allow an optional "json" language tag after the backticks
    json_match = re.search(r'```(?:json)?\s*(.*?)```', text, re.DOTALL)
    if json_match:
        json_str = json_match.group(1).strip()  # Extract the JSON string and strip surrounding whitespace
        try:
            # Parse the JSON string into a Python dictionary
            data = json.loads(json_str)
            return data
        except json.JSONDecodeError as e:
            print("Failed to decode JSON:", e)
            return None
    else:
        print("No JSON block found in the text.")
        return None
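# Example usage (illustrative):
#     extract_json_from_text('```json\n{"Name": "Jane"}\n```')  # -> {"Name": "Jane"}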
# Streamlit app interface
st.title("Joining Form Details Extractor")

uploaded_files = st.file_uploader("Upload PDF files", accept_multiple_files=True, type="pdf")

if uploaded_files:
    st.write("Processing...")
    all_data = []
    for pdf_file in uploaded_files:
        # Extract text from the PDF
        extracted_text = extract_text_from_pdf(pdf_file)
        # Get key-value pairs using OpenAI GPT-3.5 Turbo
        key_value_pairs = create_key_value_pairs_from_text(extracted_text)
        if isinstance(key_value_pairs, dict):
            all_data.append(key_value_pairs)
        elif key_value_pairs:
            # The model returned raw text; try to pull a fenced JSON block out of it
            data = extract_json_from_text(key_value_pairs)
            if data:
                all_data.append(data)

    # Convert the list of dictionaries to a DataFrame
    if all_data:
        df = pd.DataFrame(all_data)
        # Display the DataFrame in Streamlit
        st.write("Extracted Data:")
        st.dataframe(df)

        # Convert the DataFrame to Excel
        output = BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Extracted Data")

        # Download link for the Excel file
        st.download_button(
            label="Download Excel",
            data=output.getvalue(),
            file_name="extracted_data.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
    else:
        st.write("No data extracted.")