Spaces:

jithenderchoudary
/

Twopos

Sleeping

App Files Files Community

jithenderchoudary committed on Nov 7, 2024

Commit

c992709

verified ·

1 Parent(s): 0d09737

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -36

app.py CHANGED Viewed

@@ -1,45 +1,125 @@
-import gradio as gr
-import pandas as pd
 import pdfplumber
 import tempfile
-def process_pdfs(pdf1, pdf2):
-    data = []
-    files = [pdf1, pdf2]
-    for file in files:
-        with pdfplumber.open(file.name) as pdf:
-            for page in pdf.pages:
-                table = page.extract_table()
-                if table:
-                    df = pd.DataFrame(table[1:], columns=table[0])  # Assumes first row is headers
-                    data.append(df)
-    combined_data = pd.concat(data, ignore_index=True)
-    # Rename columns if needed to match the sample format
-    combined_data.columns = [
-        "Sl No", "Pos.", "Item code", "Unit", "Delivery Date",
-        "Quantity", "Basic Price", "Discount", "Cur.",
-        "Amount", "Central GST 9%", "State GST%", "SUB TOTAL"
-    ]
-    # Save to Excel
-    output_file = tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False)
-    combined_data.to_excel(output_file.name, index=False)
-    return output_file.name
-iface = gr.Interface(
-    fn=process_pdfs,
-    inputs=[
-        gr.inputs.File(label="Toshiba PO PDF"),
-        gr.inputs.File(label="BHEL PO PDF")
-    ],
-    outputs=gr.outputs.File(label="Excel Sheet"),
-    title="PDF to Excel PO Extractor",
-    description="Upload two PO PDF files to extract data and download as an Excel file."
 )
-iface.launch()

 import pdfplumber
+import pandas as pd
+import gradio as gr
 import tempfile
# Output column order for each supported purchase-order layout.
_COLUMNS_BY_COMPANY = {
    "Toshiba": [
        "Pos.", "Item Code", "Unit", "Delivery Date", "Quantity",
        "Basic Price", "Discount", "Cur.", "Amount", "Description",
        "Calculation Method",
    ],
    "BHEL": [
        "Sl No", "Material Description", "Unit", "Quantity",
        "Dely Qty", "Dely Date", "Unit Rate", "Value",
        "Material Number", "HSN Code", "IGST",
    ],
}


def _text_after_first_colon(line):
    """Return the stripped text between the first and second colon, or None."""
    pieces = line.split(":")
    return pieces[1].strip() if len(pieces) > 1 else None


def _parse_toshiba_page(lines):
    """Return one row dict per usable "Pos." line in a page's text lines."""
    # NOTE(review): the description / calculation method are looked up once
    # per page (first matching line), so every row on the same page shares
    # them. That mirrors the previous behaviour; confirm against real POs.
    description = ""
    calc_method = ""
    for index, line in enumerate(lines):
        if "TERMINAL MARKING" in line or "Calculation Method:" in line:
            description = line
            # Guard the look-ahead: the match may be the last line on the page.
            if index + 1 < len(lines) and "Calculation Method:" in lines[index + 1]:
                calc_method = lines[index + 1]
            break

    rows = []
    for line in lines:
        if not line.startswith("Pos."):
            continue
        parts = line.split()
        # A row needs tokens 1..9; skip shorter lines instead of crashing.
        if len(parts) < 10:
            continue
        rows.append({
            "Pos.": parts[1],
            "Item Code": parts[2],
            "Unit": parts[3],
            "Delivery Date": parts[4],
            "Quantity": parts[5],
            "Basic Price": parts[6],
            "Discount": parts[7],
            "Cur.": parts[8],
            "Amount": parts[9],
            "Description": description,
            "Calculation Method": calc_method,
        })
    return rows


def _parse_bhel_page(lines):
    """Return one row dict per usable "Sl No" line in a page's text lines."""
    # NOTE(review): as before, these details are collected page-wide (the last
    # matching line wins) and shared by every row on the page.
    material_number = ""
    hsn_code = ""
    igst = ""
    for line in lines:
        if "Material Number:" in line:
            material_number = _text_after_first_colon(line)
        if "HSN Code:" in line:
            hsn_code = _text_after_first_colon(line)
        if "IGST" in line:
            # "IGST" may appear on a line without any colon; ignore those.
            igst_value = _text_after_first_colon(line)
            if igst_value is not None:
                igst = igst_value

    rows = []
    for line in lines:
        if not line.startswith("Sl No"):
            continue
        parts = line.split()
        # A row needs tokens 2..11; skip shorter lines instead of crashing.
        if len(parts) < 12:
            continue
        rows.append({
            "Sl No": parts[2],
            # Assumes the description is exactly three whitespace-separated tokens.
            "Material Description": " ".join(parts[3:6]),
            "Unit": parts[6],
            "Quantity": parts[7],
            "Dely Qty": parts[8],
            "Dely Date": parts[9],
            "Unit Rate": parts[10],
            "Value": parts[11],
            "Material Number": material_number,
            "HSN Code": hsn_code,
            "IGST": igst,
        })
    return rows


def extract_data(pdf_file, company):
    """Extract purchase-order rows from a PDF and save them as an Excel file.

    Every page's text is split into lines and handed to the parser for the
    selected company layout; the collected rows are written to a temporary
    ``.xlsx`` file that is left on disk for the caller to serve.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts for the uploaded PDF.
        company: Layout to parse, either ``"Toshiba"`` or ``"BHEL"``.

    Returns:
        Path of the generated ``.xlsx`` file. The sheet contains only the
        header row when no matching lines were found.

    Raises:
        ValueError: If ``company`` is not a supported layout (including
            ``None`` when nothing was selected).
    """
    parsers = {"Toshiba": _parse_toshiba_page, "BHEL": _parse_bhel_page}
    if company not in parsers:
        raise ValueError(
            f"Unsupported company: {company!r}; expected one of {sorted(parsers)}"
        )
    parse_page = parsers[company]

    data_rows = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_text() can yield None for pages without a text layer.
            page_text = page.extract_text() or ""
            data_rows.extend(parse_page(page_text.splitlines()))

    # Build the frame once, after all pages, so it exists even for empty PDFs.
    df = pd.DataFrame(data_rows, columns=_COLUMNS_BY_COMPANY[company])

    # Reserve a path and close the handle before the Excel writer reopens it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        output_path = temp_file.name
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False)
    return output_path
# Gradio UI wiring: one PDF upload plus a company selector go in, and the
# generated Excel file comes back out as a download.
company_options = ['Toshiba', 'BHEL']

pdf_input = gr.File(label="Upload PDF")
company_input = gr.Dropdown(choices=company_options, label="Select Company")
excel_output = gr.File(label="Download Extracted Data as Excel")

interface = gr.Interface(
    fn=extract_data,
    inputs=[pdf_input, company_input],
    outputs=excel_output,
    title="PDF Data Extractor for Toshiba and BHEL",
    description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements.",
)

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    interface.launch()