# Twopos / app.py
# jithenderchoudary's picture
# Update app.py
# c992709 verified
import pdfplumber
import pandas as pd
import gradio as gr
import tempfile
# Output column order for each supported company; also used as the header
# row when no item rows are found.
_TOSHIBA_COLUMNS = [
    "Pos.", "Item Code", "Unit", "Delivery Date", "Quantity",
    "Basic Price", "Discount", "Cur.", "Amount", "Description",
    "Calculation Method",
]
_BHEL_COLUMNS = [
    "Sl No", "Material Description", "Unit", "Quantity",
    "Dely Qty", "Dely Date", "Unit Rate", "Value",
    "Material Number", "HSN Code", "IGST",
]


def _item_spans(lines, prefix):
    """Yield ``(row_line, detail_lines)`` for each line starting with *prefix*.

    ``detail_lines`` are the lines between that row and the next row (or the
    end of the page), so per-item details are never borrowed from another
    item on the same page.
    """
    starts = [i for i, line in enumerate(lines) if line.startswith(prefix)]
    for start, stop in zip(starts, starts[1:] + [len(lines)]):
        yield lines[start], lines[start + 1:stop]


def _labelled_value(lines, label):
    """Return the text after the first colon on the first usable *label* line.

    Lines that contain *label* but no colon are ignored instead of raising.
    Returns ``""`` when no usable line exists.
    """
    for line in lines:
        if label in line:
            pieces = line.split(":")
            if len(pieces) > 1:
                return pieces[1].strip()
    return ""


def _parse_toshiba_page(lines):
    """Build one row dict per ``Pos.`` line found in *lines* (one page of text)."""
    rows = []
    for row_line, detail_lines in _item_spans(lines, "Pos."):
        parts = row_line.split()
        # A data row needs the "Pos." token plus nine whitespace-separated
        # fields; anything shorter cannot be mapped onto the columns.
        if len(parts) < 10:
            continue

        description = ""
        calc_method = ""
        for i, detail in enumerate(detail_lines):
            if "TERMINAL MARKING" in detail or "Calculation Method:" in detail:
                description = detail
                # The calculation method is only taken from the line directly
                # after the description; guard against the end of the page.
                following = detail_lines[i + 1] if i + 1 < len(detail_lines) else ""
                if "Calculation Method:" in following:
                    calc_method = following
                break

        rows.append({
            "Pos.": parts[1],
            "Item Code": parts[2],
            "Unit": parts[3],
            "Delivery Date": parts[4],
            "Quantity": parts[5],
            "Basic Price": parts[6],
            "Discount": parts[7],
            "Cur.": parts[8],
            "Amount": parts[9],
            "Description": description,
            "Calculation Method": calc_method,
        })
    return rows


def _parse_bhel_page(lines):
    """Build one row dict per ``Sl No`` line found in *lines* (one page of text)."""
    rows = []
    for row_line, detail_lines in _item_spans(lines, "Sl No"):
        parts = row_line.split()
        # "Sl", "No" and ten further tokens are required to fill every column.
        if len(parts) < 12:
            continue

        rows.append({
            "Sl No": parts[2],
            # Assumes the description is exactly three tokens long, as the
            # original parser did -- TODO confirm against real documents.
            "Material Description": " ".join(parts[3:6]),
            "Unit": parts[6],
            "Quantity": parts[7],
            "Dely Qty": parts[8],
            "Dely Date": parts[9],
            "Unit Rate": parts[10],
            "Value": parts[11],
            "Material Number": _labelled_value(detail_lines, "Material Number:"),
            "HSN Code": _labelled_value(detail_lines, "HSN Code:"),
            "IGST": _labelled_value(detail_lines, "IGST"),
        })
    return rows


# Maps each supported company name to its page parser and output columns.
_PARSERS = {
    "Toshiba": (_parse_toshiba_page, _TOSHIBA_COLUMNS),
    "BHEL": (_parse_bhel_page, _BHEL_COLUMNS),
}


def extract_data(pdf_file, company):
    """Extract line items from a PDF and write them to a temporary Excel file.

    Args:
        pdf_file: The PDF to read; passed unchanged to ``pdfplumber.open``.
        company: ``'Toshiba'`` or ``'BHEL'``; selects the line format to parse.

    Returns:
        Path of the ``.xlsx`` file that was written. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If ``company`` is not a supported name or ``pdf_file`` is
            ``None``. Both checks run before the PDF is opened.
    """
    if company not in _PARSERS:
        supported = ", ".join(_PARSERS)
        raise ValueError(
            f"Unsupported company {company!r}; expected one of: {supported}"
        )
    if pdf_file is None:
        raise ValueError("No PDF file was provided")

    parse_page, columns = _PARSERS[company]

    data_rows = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_text() may yield None for a page without text.
            lines = (page.extract_text() or "").splitlines()
            data_rows.extend(parse_page(lines))

    # Built once after all pages; an empty result still gets the header row.
    df = pd.DataFrame(data_rows, columns=columns)

    # Reserve a path and close the handle first, so the Excel writer is not
    # competing with an open file object for the same path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)

    return output_path
# Gradio UI: a PDF upload plus a company selector go in, the generated Excel
# file comes out.
company_options = ['Toshiba', 'BHEL']

interface = gr.Interface(
    fn=extract_data,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Dropdown(choices=company_options, label="Select Company"),
    ],
    outputs=gr.File(label="Download Extracted Data as Excel"),
    title="PDF Data Extractor for Toshiba and BHEL",
    description=(
        "Upload a PDF file and select the company to extract and format "
        "data into an Excel file according to specific requirements."
    ),
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()