jithenderchoudary committed on
Commit
c992709
·
verified ·
1 Parent(s): 0d09737

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -36
app.py CHANGED
@@ -1,45 +1,125 @@
1
- import gradio as gr
2
- import pandas as pd
3
  import pdfplumber
 
 
4
  import tempfile
5
 
6
- def process_pdfs(pdf1, pdf2):
7
- data = []
8
- files = [pdf1, pdf2]
9
-
10
- for file in files:
11
- with pdfplumber.open(file.name) as pdf:
12
- for page in pdf.pages:
13
- table = page.extract_table()
14
- if table:
15
- df = pd.DataFrame(table[1:], columns=table[0]) # Assumes first row is headers
16
- data.append(df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- combined_data = pd.concat(data, ignore_index=True)
19
-
20
- # Rename columns if needed to match the sample format
21
- combined_data.columns = [
22
- "Sl No", "Pos.", "Item code", "Unit", "Delivery Date",
23
- "Quantity", "Basic Price", "Discount", "Cur.",
24
- "Amount", "Central GST 9%", "State GST%", "SUB TOTAL"
25
- ]
26
 
27
- # Save to Excel
28
- output_file = tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False)
29
- combined_data.to_excel(output_file.name, index=False)
30
-
31
- return output_file.name
32
 
33
- iface = gr.Interface(
34
- fn=process_pdfs,
35
- inputs=[
36
- gr.inputs.File(label="Toshiba PO PDF"),
37
- gr.inputs.File(label="BHEL PO PDF")
38
- ],
39
- outputs=gr.outputs.File(label="Excel Sheet"),
40
- title="PDF to Excel PO Extractor",
41
- description="Upload two PO PDF files to extract data and download as an Excel file."
42
  )
43
 
44
- iface.launch()
 
45
 
 
 
 
1
  import pdfplumber
2
+ import pandas as pd
3
+ import gradio as gr
4
  import tempfile
5
 
6
+ def extract_data(pdf_file, company):
7
+ # Open PDF
8
+ with pdfplumber.open(pdf_file) as pdf:
9
+ pages = pdf.pages
10
+ data_rows = []
11
+
12
+ for page in pages:
13
+ text = page.extract_text().splitlines()
14
+
15
+ if company == 'Toshiba':
16
+ # Parse Toshiba format
17
+ for line in text:
18
+ if line.startswith("Pos."):
19
+ # Extract primary data line
20
+ parts = line.split()
21
+ pos = parts[1]
22
+ item_code = parts[2]
23
+ unit = parts[3]
24
+ delivery_date = parts[4]
25
+ quantity = parts[5]
26
+ basic_price = parts[6]
27
+ discount = parts[7]
28
+ currency = parts[8]
29
+ amount = parts[9]
30
+
31
+ # Extract additional description and calculation details
32
+ description = ""
33
+ calc_method = ""
34
+ for i, l in enumerate(text):
35
+ if "TERMINAL MARKING" in l or "Calculation Method:" in l:
36
+ description = text[i]
37
+ calc_method = text[i + 1] if "Calculation Method:" in text[i + 1] else ""
38
+ break
39
+
40
+ # Append row to data_rows
41
+ data_rows.append({
42
+ "Pos.": pos,
43
+ "Item Code": item_code,
44
+ "Unit": unit,
45
+ "Delivery Date": delivery_date,
46
+ "Quantity": quantity,
47
+ "Basic Price": basic_price,
48
+ "Discount": discount,
49
+ "Cur.": currency,
50
+ "Amount": amount,
51
+ "Description": description,
52
+ "Calculation Method": calc_method
53
+ })
54
+
55
+ # Convert to DataFrame
56
+ df = pd.DataFrame(data_rows, columns=["Pos.", "Item Code", "Unit", "Delivery Date", "Quantity",
57
+ "Basic Price", "Discount", "Cur.", "Amount", "Description",
58
+ "Calculation Method"])
59
+
60
+ elif company == 'BHEL':
61
+ # Parse BHEL format
62
+ for line in text:
63
+ if line.startswith("Sl No"):
64
+ parts = line.split()
65
+ sl_no = parts[2]
66
+ material_desc = " ".join(parts[3:6]) # Assuming fixed-length split for description
67
+ unit = parts[6]
68
+ quantity = parts[7]
69
+ dely_qty = parts[8]
70
+ dely_date = parts[9]
71
+ unit_rate = parts[10]
72
+ value = parts[11]
73
+
74
+ # Additional data such as material number, HSN code, IGST
75
+ material_number = ""
76
+ hsn_code = ""
77
+ igst = ""
78
+ for i, l in enumerate(text):
79
+ if "Material Number:" in l:
80
+ material_number = l.split(":")[1].strip()
81
+ if "HSN Code:" in l:
82
+ hsn_code = l.split(":")[1].strip()
83
+ if "IGST" in l:
84
+ igst = l.split(":")[1].strip()
85
+
86
+ # Append row to data_rows
87
+ data_rows.append({
88
+ "Sl No": sl_no,
89
+ "Material Description": material_desc,
90
+ "Unit": unit,
91
+ "Quantity": quantity,
92
+ "Dely Qty": dely_qty,
93
+ "Dely Date": dely_date,
94
+ "Unit Rate": unit_rate,
95
+ "Value": value,
96
+ "Material Number": material_number,
97
+ "HSN Code": hsn_code,
98
+ "IGST": igst
99
+ })
100
+
101
+ # Convert to DataFrame
102
+ df = pd.DataFrame(data_rows, columns=["Sl No", "Material Description", "Unit", "Quantity",
103
+ "Dely Qty", "Dely Date", "Unit Rate", "Value",
104
+ "Material Number", "HSN Code", "IGST"])
105
 
106
+ # Save as Excel file
107
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
108
+ with pd.ExcelWriter(temp_file.name, engine='xlsxwriter') as writer:
109
+ df.to_excel(writer, index=False)
 
 
 
 
110
 
111
+ return temp_file.name
 
 
 
 
112
 
113
+ # Set up Gradio interface
114
+ company_options = ['Toshiba', 'BHEL']
115
+ interface = gr.Interface(
116
+ fn=extract_data,
117
+ inputs=[gr.File(label="Upload PDF"), gr.Dropdown(choices=company_options, label="Select Company")],
118
+ outputs=gr.File(label="Download Extracted Data as Excel"),
119
+ title="PDF Data Extractor for Toshiba and BHEL",
120
+ description="Upload a PDF file and select the company to extract and format data into an Excel file according to specific requirements."
 
121
  )
122
 
123
+ if __name__ == "__main__":
124
+ interface.launch()
125