Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| import re | |
| import pandas as pd | |
| import gradio as gr | |
| def extract_data(pdf_file): | |
| """ | |
| Extract data from the uploaded PDF and return it as an Excel file. | |
| """ | |
| data = [] | |
| columns = ["SI No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value"] | |
| start_si, end_si = 10, 1150 # Define SI No range for processing | |
| with pdfplumber.open(pdf_file) as pdf: | |
| for page in pdf.pages: | |
| full_text = page.extract_text() # Get the text content for the page | |
| lines = full_text.splitlines() if full_text else [] | |
| for line in lines: | |
| try: | |
| # Parse the first column for SI No | |
| si_no_match = re.match(r"^\s*(\d+)\s", line) | |
| if not si_no_match: | |
| continue | |
| si_no = int(si_no_match.group(1)) | |
| if not (start_si <= si_no <= end_si): | |
| continue # Skip rows outside the range | |
| # Extract Material Description and details dynamically | |
| material_desc = extract_material_description(full_text, si_no) | |
| # Extract remaining fields | |
| parts = line.split() | |
| unit = parts[3] | |
| quantity = int(parts[4]) | |
| dely_qty = int(parts[5]) | |
| dely_date = parts[6] | |
| unit_rate = float(parts[7]) | |
| value = float(parts[8]) | |
| # Append row data | |
| data.append([si_no, material_desc, unit, quantity, dely_qty, dely_date, unit_rate, value]) | |
| except (ValueError, IndexError): | |
| # Skip invalid rows or rows with missing data | |
| continue | |
| # Convert data to DataFrame and save as Excel | |
| df = pd.DataFrame(data, columns=columns) | |
| excel_path = "/tmp/Extracted_PO_Data_Accurate.xlsx" | |
| df.to_excel(excel_path, index=False) | |
| return excel_path | |
| def extract_material_description(full_text, si_no): | |
| """ | |
| Extract Material Description, including Material Number, HSN Code, and IGST, using unique patterns. | |
| """ | |
| material_desc = "" | |
| # Match the specific SI No row to extract details | |
| si_no_pattern = rf"{si_no}\s+(BPS\s+\d+).*?Material\s+Number:\s+(\d+)" | |
| match = re.search(si_no_pattern, full_text, re.DOTALL) | |
| if match: | |
| bps_code = match.group(1) | |
| material_number = match.group(2) | |
| material_desc += f"{bps_code}\nMaterial Number: {material_number}\n" | |
| # Extract HSN Code | |
| hsn_code_match = re.search(r"HSN\s+Code:\s*(\d+)", full_text) | |
| if hsn_code_match: | |
| hsn_code = hsn_code_match.group(1) | |
| material_desc += f"HSN Code: {hsn_code}\n" | |
| else: | |
| material_desc += "HSN Code: Not Found\n" | |
| # Extract IGST | |
| igst_match = re.search(r"IGST\s*:\s*(\d+)\s*%", full_text) | |
| if igst_match: | |
| igst = igst_match.group(1) | |
| material_desc += f"IGST: {igst} %" | |
| else: | |
| material_desc += "IGST: Not Found" | |
| return material_desc.strip() | |
| # Gradio Interface function | |
| def gradio_interface(pdf_file): | |
| """ | |
| Interface function for Gradio to process the PDF and return the Excel file. | |
| """ | |
| return process_po(pdf_file.name) | |
| # Define Gradio interface | |
| interface = gr.Interface( | |
| fn=gradio_interface, | |
| inputs=gr.File(label="Upload PDF"), | |
| outputs=gr.File(label="Download Accurate Excel"), | |
| title="BHEL PO Data Extractor", | |
| description="Upload a BHEL Purchase Order (PO) PDF to extract material numbers, descriptions, and IGST information into an Excel file." | |
| ) | |
| if __name__ == "__main__": | |
| interface.launch() | |