Update app.py
Browse files
app.py
CHANGED
|
@@ -1,144 +1,109 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import pandas as pd
|
| 3 |
import pdfplumber
|
|
|
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
|
| 6 |
-
|
| 7 |
def extract_text_from_pdf(pdf_file):
    """
    Extracts text from an uploaded PDF file.

    Args:
        pdf_file: The uploaded PDF file. Either an object exposing its path
            through a ``name`` attribute, or the path itself.

    Returns:
        str: The text of all pages, joined with newlines so that the last
        line of one page never fuses with the first line of the next.
    """
    # Fall back to the argument itself when it carries no ``name`` attribute.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text may yield None; treat it as empty
        # instead of failing on string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    text = "\n".join(page_texts)
    print("\nExtracted Text:\n", text)  # Debugging: Print
    return text
|
| 21 |
|
| 22 |
-
|
| 23 |
-
def
|
| 24 |
-
"""
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
"""
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
for line in lines:
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
if current_row:
|
| 43 |
-
combined_rows.append(current_row.strip())
|
| 44 |
-
|
| 45 |
-
return combined_rows
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
def parse_po_items(rows):
|
| 49 |
-
"""
|
| 50 |
-
Parses purchase order items from reconstructed rows.
|
| 51 |
-
Args:
|
| 52 |
-
rows (list): List of reconstructed rows.
|
| 53 |
-
Returns:
|
| 54 |
-
tuple: DataFrame with extracted data and a status message.
|
| 55 |
-
"""
|
| 56 |
-
data = []
|
| 57 |
-
for row in rows:
|
| 58 |
-
try:
|
| 59 |
-
# Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
|
| 60 |
-
match = re.match(
|
| 61 |
-
r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$",
|
| 62 |
-
row,
|
| 63 |
-
)
|
| 64 |
-
if match:
|
| 65 |
-
data.append(
|
| 66 |
-
{
|
| 67 |
-
"ITEM": match.group("Item"),
|
| 68 |
-
"DESCRIPTION": match.group("Description"),
|
| 69 |
-
"QTY": match.group("Qty"),
|
| 70 |
-
"UNIT": match.group("Unit"),
|
| 71 |
-
"UNIT PRICE": match.group("UnitPrice"),
|
| 72 |
-
"TOTAL PRICE": match.group("TotalPrice"),
|
| 73 |
-
}
|
| 74 |
)
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
if not data:
|
| 81 |
-
|
|
|
|
| 82 |
return pd.DataFrame(data), "Data extracted successfully."
|
| 83 |
|
| 84 |
-
|
| 85 |
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Write the extracted data to an Excel workbook.

    Args:
        df (pd.DataFrame): Structured data to write; the index is not written.
        output_path (str): Destination path of the workbook.

    Returns:
        str: The same ``output_path`` that was written to.
    """
    destination = output_path
    df.to_excel(destination, index=False)
    return destination
|
| 96 |
|
| 97 |
-
|
| 98 |
def process_pdf(file):
|
| 99 |
-
"""
|
| 100 |
-
Processes the uploaded PDF file, extracts data, and saves it to an Excel file.
|
| 101 |
-
Args:
|
| 102 |
-
file: The uploaded PDF file.
|
| 103 |
-
Returns:
|
| 104 |
-
tuple: Path to the saved Excel file and a status message.
|
| 105 |
-
"""
|
| 106 |
try:
|
| 107 |
-
# Extract text from the uploaded PDF
|
| 108 |
text = extract_text_from_pdf(file)
|
| 109 |
-
|
| 110 |
-
lines = text.splitlines()
|
| 111 |
-
# Preprocess lines to reconstruct rows
|
| 112 |
-
rows = preprocess_lines(lines)
|
| 113 |
-
# Parse reconstructed rows
|
| 114 |
-
df, status = parse_po_items(rows)
|
| 115 |
if df is not None:
|
| 116 |
output_path = save_to_excel(df)
|
| 117 |
return output_path, status
|
| 118 |
return None, status
|
| 119 |
except Exception as e:
|
| 120 |
-
return None, f"Error: {str(e)}"
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
"""
|
| 126 |
-
Creates a Gradio interface for processing PO data from PDF files.
|
| 127 |
-
"""
|
| 128 |
-
interface = gr.Interface(
|
| 129 |
fn=process_pdf,
|
| 130 |
inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
|
| 131 |
outputs=[
|
| 132 |
-
gr.File(label="Download Extracted
|
| 133 |
gr.Textbox(label="Status"),
|
| 134 |
],
|
| 135 |
title="PO Data Extraction",
|
| 136 |
-
description="Upload a
|
| 137 |
)
|
| 138 |
-
return interface
|
| 139 |
-
|
| 140 |
|
| 141 |
if __name__ == "__main__":
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
app.launch()
|
|
|
|
|
|
|
|
|
|
| 1 |
import pdfplumber
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import re
|
| 4 |
import gradio as gr
|
| 5 |
|
| 6 |
+
# Function: Extract Text from PDF
|
| 7 |
def extract_text_from_pdf(pdf_file):
    """
    Extracts the text of every page of an uploaded PDF file.

    Args:
        pdf_file: The uploaded PDF file. Either an object exposing its path
            through a ``name`` attribute, or the path itself.

    Returns:
        str: The text of all pages, joined with newlines so that the last
        line of one page never fuses with the first line of the next.
    """
    # Fall back to the argument itself when it carries no ``name`` attribute.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text may yield None; treat it as empty
        # instead of failing on string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    text = "\n".join(page_texts)
    print("\nExtracted Text:\n", text)  # Debugging: Print extracted text
    return text
|
| 14 |
|
| 15 |
+
# Function: Clean Description
|
| 16 |
+
def clean_description(description, item_number=None):
    """
    Strip non-descriptive fragments from an item description.

    The patterns are applied in order; each one deletes its match (and, for
    the open-ended ones, everything after it) from the running result.

    Args:
        description (str): Raw description text, typically several PDF lines
            joined with spaces.
        item_number: Accepted for the caller's convenience; not used here.

    Returns:
        str: The description with the matched fragments removed and
        surrounding whitespace stripped.
    """
    noise_patterns = (
        r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+",  # Qty + Unit + Price
        r"Page \d+ of \d+.*",  # page references
        r"\(Q\. No:.*?\)",  # Q.No-related data
        r"TOTAL EX-WORK.*",  # EX-WORK-related text
        r"NOTES:.*",  # notes section
        r"HS CODE.*",  # HS CODE-related data
        r"DELIVERY:.*",  # delivery instructions
    )
    cleaned = description
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()
|
| 25 |
+
|
| 26 |
+
# Function: Parse PO Items with Filters
|
| 27 |
+
def parse_po_items_with_filters(text):
    """
    Parse purchase-order line items out of raw PDF text.

    A line that starts with a number followed by text opens a new item. The
    lines after it are accumulated as description continuation until the next
    item starts. Quantity/unit ("<n> Nos." or "<n> Set") and a trailing pair of
    numeric tokens (unit price, total price) are picked up from any line that
    belongs to the current item; a later match overwrites an earlier one.
    Lines that appear before the first item are ignored.

    Args:
        text (str): Text extracted from the PDF.

    Returns:
        tuple: ``(DataFrame, status message)``. The first element is ``None``
        when no item was found.
    """
    item_pattern = re.compile(r"^\s*(?P<Item>\d+)\s+(?P<Description>.+)")
    qty_pattern = re.compile(r"(?P<Qty>\d+)\s+(Nos\.|Set)")
    price_pattern = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")

    data = []
    current_item = {}
    description_accumulator = []

    for line in text.splitlines():
        print(f"Processing Line: {line}")  # Debugging
        item_match = item_pattern.match(line)
        if item_match:
            if current_item:
                # Close the previous item before opening the next one.
                current_item["Description"] = clean_description(
                    " ".join(description_accumulator).strip(),
                    item_number=int(current_item["Item"]),
                )
                data.append(current_item)

            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_accumulator = [item_match.group("Description")]
        elif current_item:
            description_accumulator.append(line.strip())
        else:
            # No item has been opened yet. Recording a quantity or price here
            # would create a partial record without an "Item" key, which
            # fails later when the record is finalized.
            continue

        qty_match = qty_pattern.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group(2)

        price_match = price_pattern.search(line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    if current_item:
        # Flush the last open item.
        current_item["Description"] = clean_description(
            " ".join(description_accumulator).strip(),
            item_number=int(current_item["Item"]),
        )
        data.append(current_item)

    if not data:
        print("No items found. Check PDF format.")  # Debugging
        return None, "No items found. Please check the PDF file format."
    return pd.DataFrame(data), "Data extracted successfully."
|
| 76 |
|
| 77 |
+
# Function: Save to Excel
|
| 78 |
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Write the extracted data to an Excel workbook.

    Args:
        df (pd.DataFrame): Structured data to write; the index is not written.
        output_path (str): Destination path of the workbook.

    Returns:
        str: The same ``output_path`` that was written to.
    """
    destination = output_path
    df.to_excel(destination, index=False)
    return destination
|
| 81 |
|
| 82 |
+
# Gradio Interface Function
|
| 83 |
def process_pdf(file):
    """
    Run the full pipeline for one uploaded PDF: extract text, parse the
    items, and save them to an Excel file.

    Args:
        file: The uploaded PDF file, passed through to the text extractor.

    Returns:
        tuple: ``(path to the saved Excel file or None, status message)``.
        Any exception raised along the way is reported in the status message
        instead of propagating.
    """
    try:
        extracted = extract_text_from_pdf(file)
        frame, message = parse_po_items_with_filters(extracted)
        # Only write a workbook when the parser produced a table.
        saved_path = None if frame is None else save_to_excel(frame)
        return saved_path, message
    except Exception as exc:
        return None, f"Error during processing: {str(exc)}"
|
| 93 |
|
| 94 |
+
# Gradio Interface Setup
|
| 95 |
+
def create_gradio_interface():
    """
    Build the Gradio interface for extracting PO data from PDF files.

    Returns:
        gr.Interface: An interface that takes a PDF upload, runs
        ``process_pdf`` on it, and shows a downloadable file plus a status
        text box.
    """
    upload_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_outputs = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=upload_input,
        outputs=result_outputs,
        title="PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# Script entry point: build the interface and start it only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()
|
|
|