Spaces:
Sleeping
Sleeping
| import pdfplumber | |
| import pandas as pd | |
| import gradio as gr | |
| import tempfile | |
| def extract_data(pdf_file, company): | |
| # Open PDF | |
| with pdfplumber.open(pdf_file) as pdf: | |
| pages = pdf.pages | |
| data_rows = [] | |
| for page in pages: | |
| text = page.extract_text().splitlines() | |
| if company == 'Toshiba': | |
| # Parse Toshiba format | |
| for line in text: | |
| if line.startswith("Pos."): | |
| # Extract primary data line | |
| parts = line.split() | |
| pos = parts[1] | |
| item_code = parts[2] | |
| unit = parts[3] | |
| delivery_date = parts[4] | |
| quantity = parts[5] | |
| basic_price = parts[6] | |
| discount = parts[7] | |
| currency = parts[8] | |
| amount = parts[9] | |
| # Extract additional description and calculation details | |
| description = "" | |
| calc_method = "" | |
| for i, l in enumerate(text): | |
| if "TERMINAL MARKING" in l or "Calculation Method:" in l: | |
| description = text[i] | |
| calc_method = text[i + 1] if "Calculation Method:" in text[i + 1] else "" | |
| break | |
| # Append row to data_rows | |
| data_rows.append({ | |
| "Pos.": pos, | |
| "Item Code": item_code, | |
| "Unit": unit, | |
| "Delivery Date": delivery_date, | |
| "Quantity": quantity, | |
| "Basic Price": basic_price, | |
| "Discount": discount, | |
| "Cur.": currency, | |
| "Amount": amount, | |
| "Description": description, | |
| "Calculation Method": calc_method | |
| }) | |
| # Convert to DataFrame | |
| df = pd.DataFrame(data_rows, columns=["Pos.", "Item Code", "Unit", "Delivery Date", "Quantity", | |
| "Basic Price", "Discount", "Cur.", "Amount", "Description", | |
| "Calculation Method"]) | |
| elif company == 'BHEL': | |
| # Parse BHEL format | |
| for line in text: | |
| if line.startswith("Sl No"): | |
| parts = line.split() | |
| sl_no = parts[2] | |
| material_desc = " ".join(parts[3:6]) # Assuming fixed-length split for description | |
| unit = parts[6] | |
| quantity = parts[7] | |
| dely_qty = parts[8] | |
| dely_date = parts[9] | |
| unit_rate = parts[10] | |
| value = parts[11] | |
| # Additional data such as material number, HSN code, IGST | |
| material_number = "" | |
| hsn_code = "" | |
| igst = "" | |
| for i, l in enumerate(text): | |
| if "Material Number:" in l: | |
| material_number = l.split(":")[1].strip() | |
| if "HSN Code:" in l: | |
| hsn_code = l.split(":")[1].strip() | |
| if "IGST" in l: | |
| igst = l.split(":")[1].strip() | |
| # Append row to data_rows | |
| data_rows.append({ | |
| "Sl No": sl_no, | |
| "Material Description": material_desc, | |
| "Unit": unit, | |
| "Quantity": quantity, | |
| "Dely Qty": dely_qty, | |
| "Dely Date": dely_date, | |
| "Unit Rate": unit_rate, | |
| "Value": value, | |
| "Material Number": material_number, | |
| "HSN Code": hsn_code, | |
| "IGST": igst | |
| }) | |
| # Convert to DataFrame | |
| df = pd.DataFrame(data_rows, columns=["Sl No", "Material Description", "Unit", "Quantity", | |
| "Dely Qty", "Dely Date", "Unit Rate", "Value", | |
| "Material Number", "HSN Code", "IGST"]) | |
| # Save as Excel file | |
| temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") | |
| with pd.ExcelWriter(temp_file.name, engine='xlsxwriter') as writer: | |
| df.to_excel(writer, index=False) | |
| return temp_file.name | |
| # Set up Gradio interface | |
| company_options = ['Toshiba', 'BHEL'] | |
| interface = gr.Interface( | |
| fn=extract_data, | |
| inputs=[gr.File(label="Upload PDF"), gr.Dropdown(choices=company_options, label="Select Company")], | |
| outputs=gr.File(label="Download Extracted Data as Excel"), | |
| title="PDF Data Extractor for Toshiba and BHEL", | |
| description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements." | |
| ) | |
| if __name__ == "__main__": | |
| interface.launch() | |