import pdfplumber import pandas as pd import gradio as gr import tempfile def extract_data(pdf_file, company): # Open PDF with pdfplumber.open(pdf_file) as pdf: pages = pdf.pages data_rows = [] for page in pages: text = page.extract_text().splitlines() if company == 'Toshiba': # Parse Toshiba format for line in text: if line.startswith("Pos."): # Extract primary data line parts = line.split() pos = parts[1] item_code = parts[2] unit = parts[3] delivery_date = parts[4] quantity = parts[5] basic_price = parts[6] discount = parts[7] currency = parts[8] amount = parts[9] # Extract additional description and calculation details description = "" calc_method = "" for i, l in enumerate(text): if "TERMINAL MARKING" in l or "Calculation Method:" in l: description = text[i] calc_method = text[i + 1] if "Calculation Method:" in text[i + 1] else "" break # Append row to data_rows data_rows.append({ "Pos.": pos, "Item Code": item_code, "Unit": unit, "Delivery Date": delivery_date, "Quantity": quantity, "Basic Price": basic_price, "Discount": discount, "Cur.": currency, "Amount": amount, "Description": description, "Calculation Method": calc_method }) # Convert to DataFrame df = pd.DataFrame(data_rows, columns=["Pos.", "Item Code", "Unit", "Delivery Date", "Quantity", "Basic Price", "Discount", "Cur.", "Amount", "Description", "Calculation Method"]) elif company == 'BHEL': # Parse BHEL format for line in text: if line.startswith("Sl No"): parts = line.split() sl_no = parts[2] material_desc = " ".join(parts[3:6]) # Assuming fixed-length split for description unit = parts[6] quantity = parts[7] dely_qty = parts[8] dely_date = parts[9] unit_rate = parts[10] value = parts[11] # Additional data such as material number, HSN code, IGST material_number = "" hsn_code = "" igst = "" for i, l in enumerate(text): if "Material Number:" in l: material_number = l.split(":")[1].strip() if "HSN Code:" in l: hsn_code = l.split(":")[1].strip() if "IGST" in l: igst = l.split(":")[1].strip() # Append row to data_rows data_rows.append({ "Sl No": sl_no, "Material Description": material_desc, "Unit": unit, "Quantity": quantity, "Dely Qty": dely_qty, "Dely Date": dely_date, "Unit Rate": unit_rate, "Value": value, "Material Number": material_number, "HSN Code": hsn_code, "IGST": igst }) # Convert to DataFrame df = pd.DataFrame(data_rows, columns=["Sl No", "Material Description", "Unit", "Quantity", "Dely Qty", "Dely Date", "Unit Rate", "Value", "Material Number", "HSN Code", "IGST"]) # Save as Excel file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") with pd.ExcelWriter(temp_file.name, engine='xlsxwriter') as writer: df.to_excel(writer, index=False) return temp_file.name # Set up Gradio interface company_options = ['Toshiba', 'BHEL'] interface = gr.Interface( fn=extract_data, inputs=[gr.File(label="Upload PDF"), gr.Dropdown(choices=company_options, label="Select Company")], outputs=gr.File(label="Download Extracted Data as Excel"), title="PDF Data Extractor for Toshiba and BHEL", description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements." ) if __name__ == "__main__": interface.launch()