Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

a715551

verified ·

1 Parent(s): a537fa5

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -106

app.py CHANGED Viewed

@@ -1,144 +1,109 @@
-import re
-import pandas as pd
 import pdfplumber
 import gradio as gr
 def extract_text_from_pdf(pdf_file):
-    """
-    Extracts text from an uploaded PDF file.
-    Args:
-        pdf_file: The uploaded PDF file.
-    Returns:
-        str: The extracted text from the PDF.
-    """
     with pdfplumber.open(pdf_file.name) as pdf:
         text = ""
         for page in pdf.pages:
-            text += page.extract_text() + "\n"
-    print("\nExtracted Text:\n", text)  # Debugging: Print the extracted text
     return text
-def preprocess_lines(lines):
-    """
-    Combines multi-line rows into single rows for better parsing.
-    Args:
-        lines (list): List of text lines from the PDF.
-    Returns:
-        list: Preprocessed list of single-row strings.
-    """
-    combined_rows = []
-    current_row = ""
     for line in lines:
-        if re.match(r"^\d+\s", line):  # If line starts with an item number
-            if current_row:
-                combined_rows.append(current_row.strip())
-            current_row = line
-        else:
-            current_row += " " + line.strip()
-    if current_row:
-        combined_rows.append(current_row.strip())
-    return combined_rows
-def parse_po_items(rows):
-    """
-    Parses purchase order items from reconstructed rows.
-    Args:
-        rows (list): List of reconstructed rows.
-    Returns:
-        tuple: DataFrame with extracted data and a status message.
-    """
-    data = []
-    for row in rows:
-        try:
-            # Match ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
-            match = re.match(
-                r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$",
-                row,
-            )
-            if match:
-                data.append(
-                    {
-                        "ITEM": match.group("Item"),
-                        "DESCRIPTION": match.group("Description"),
-                        "QTY": match.group("Qty"),
-                        "UNIT": match.group("Unit"),
-                        "UNIT PRICE": match.group("UnitPrice"),
-                        "TOTAL PRICE": match.group("TotalPrice"),
-                    }
                 )
-            else:
-                print(f"Skipped row: {row}")  # Log skipped rows
-        except Exception as e:
-            print(f"Error parsing row: {row}, Error: {e}")
     if not data:
-        return None, "No valid data found in the provided text."
     return pd.DataFrame(data), "Data extracted successfully."
 def save_to_excel(df, output_path="extracted_po_data.xlsx"):
-    """
-    Saves the extracted data to an Excel file.
-    Args:
-        df (pd.DataFrame): DataFrame containing the structured data.
-        output_path (str): Path to save the Excel file.
-    Returns:
-        str: Path to the saved file.
-    """
     df.to_excel(output_path, index=False)
     return output_path
 def process_pdf(file):
-    """
-    Processes the uploaded PDF file, extracts data, and saves it to an Excel file.
-    Args:
-        file: The uploaded PDF file.
-    Returns:
-        tuple: Path to the saved Excel file and a status message.
-    """
     try:
-        # Extract text from the uploaded PDF
         text = extract_text_from_pdf(file)
-        # Split text into lines
-        lines = text.splitlines()
-        # Preprocess lines to reconstruct rows
-        rows = preprocess_lines(lines)
-        # Parse reconstructed rows
-        df, status = parse_po_items(rows)
         if df is not None:
             output_path = save_to_excel(df)
             return output_path, status
         return None, status
     except Exception as e:
-        return None, f"Error: {str(e)}"
-# Gradio Interface
-def create_interface():
-    """
-    Creates a Gradio interface for processing PO data from PDF files.
-    """
-    interface = gr.Interface(
         fn=process_pdf,
         inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
         outputs=[
-            gr.File(label="Download Extracted Excel"),
             gr.Textbox(label="Status"),
         ],
         title="PO Data Extraction",
-        description="Upload a purchase order PDF file to extract data into an Excel file.",
     )
-    return interface
 if __name__ == "__main__":
-    # Run the Gradio app
-    app = create_interface()
-    app.launch()

 import pdfplumber
+import pandas as pd
+import re
 import gradio as gr
# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, one newline-terminated chunk per page.

    Args:
        pdf_file: Either a filesystem path, or an upload object that exposes
            the path through a ``name`` attribute.

    Returns:
        str: The text of all pages in order. Each page's text is followed by
        a newline so the last line of one page never fuses with the first
        line of the next (the parser downstream works line by line).
    """
    # Accept a bare path as well as an upload wrapper carrying ``.name``.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    with pdfplumber.open(pdf_path) as pdf:
        # A page without a text layer may not yield a string; treat it as
        # empty instead of letting string concatenation raise.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    text = "".join(f"{page_text}\n" for page_text in page_texts)
    print("\nExtracted Text:\n", text)  # Debugging: Print extracted text
    return text
# Function: Clean Description
def clean_description(description, item_number=None):
    """Strip quantity/price columns and PO boilerplate from an item description.

    Args:
        description: Raw description text accumulated for one item. ``None``
            or an empty string yields ``""``.
        item_number: Accepted for caller compatibility; not used.

    Returns:
        str: The description with the noise removed and every run of
        whitespace collapsed to a single space.
    """
    if not description:
        return ""

    # Applied in order; each pattern removes one kind of non-description text.
    noise_patterns = (
        r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+",  # Qty + Unit + Unit Price + Total Price
        r"Page \d+ of \d+.*",  # page references and everything after them
        r"\(Q\. No:.*?\)",  # parenthesised Q.No data
        r"TOTAL EX-WORK.*",  # EX-WORK totals
        r"NOTES:.*",  # notes section
        r"HS CODE.*",  # HS CODE data
        r"DELIVERY:.*",  # delivery instructions
    )
    for pattern in noise_patterns:
        description = re.sub(pattern, "", description)

    # Removing a span from the middle leaves doubled spaces behind; normalise
    # them (this also trims both ends).
    return " ".join(description.split())
# Function: Parse PO Items with Filters
def parse_po_items_with_filters(text):
    """Parse purchase-order line items out of extracted PDF text.

    A line that starts with a number followed by text opens a new item; the
    lines after it are appended to that item's description until the next
    item starts. Quantity/unit and unit-price/total-price are captured from
    any line belonging to the open item (a later match overwrites an earlier
    one).

    Args:
        text: Extracted PDF text. ``None`` is treated as empty.

    Returns:
        tuple: ``(DataFrame, status)`` with columns Item, Description, Qty,
        Unit, Unit Price, Total Price when at least one item was found,
        otherwise ``(None, status)``.
    """
    item_re = re.compile(r"^\s*(?P<Item>\d+)\s+(?P<Description>.+)")
    # A line that is nothing but "<qty> <unit> <unit price> <total>" is a
    # quantity row of the current item, not the header of a new item.
    qty_row_re = re.compile(r"^\s*\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+\s*$")
    # Word boundaries keep "M12 Nos." and "2 Settings" from matching.
    qty_re = re.compile(r"\b(?P<Qty>\d+)\s+(?P<Unit>Nos\.|Set\b)")
    # Tolerate trailing whitespace after the price columns.
    price_re = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)\s*$")

    def finalize(item, fragments):
        # Attach the cleaned, joined description to a finished item.
        item["Description"] = clean_description(
            " ".join(fragments).strip(), item_number=int(item["Item"])
        )
        return item

    data = []
    current_item = None
    description_accumulator = []

    for line in (text or "").splitlines():
        print(f"Processing Line: {line}")  # Debugging
        item_match = None if qty_row_re.match(line) else item_re.match(line)
        if item_match:
            if current_item is not None:
                data.append(finalize(current_item, description_accumulator))
            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_accumulator = [item_match.group("Description")]
        elif current_item is not None:
            stripped = line.strip()
            if stripped:
                description_accumulator.append(stripped)
        else:
            # Text before the first item (headers, addresses) has no item to
            # attach quantities or prices to; capturing them here used to
            # create a phantom item without an "Item" key.
            continue

        qty_match = qty_re.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group("Unit")

        price_match = price_re.search(line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    if current_item is not None:
        data.append(finalize(current_item, description_accumulator))

    if not data:
        print("No items found. Check PDF format.")  # Debugging
        return None, "No items found. Please check the PDF file format."
    return pd.DataFrame(data), "Data extracted successfully."
# Function: Save to Excel
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """Write *df* to an Excel file without the index column.

    Args:
        df: DataFrame holding the extracted rows.
        output_path: Destination file path.

    Returns:
        str: The path that was written (``output_path`` unchanged).
    """
    destination = output_path
    df.to_excel(destination, index=False)
    return destination
# Gradio Interface Function
def process_pdf(file):
    """Run the full pipeline for one upload: extract text, parse items, save Excel.

    Args:
        file: The uploaded PDF as handed over by the interface, or ``None``
            when nothing was uploaded.

    Returns:
        tuple: ``(path_to_excel, status)`` on success, ``(None, status)``
        when nothing could be extracted or an error occurred.
    """
    # Submitting without a file would otherwise surface a raw attribute
    # error as the status text.
    if file is None:
        return None, "No file uploaded. Please upload a PDF file."
    try:
        text = extract_text_from_pdf(file)
        df, status = parse_po_items_with_filters(text)
        if df is None:
            return None, status
        return save_to_excel(df), status
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error during processing: {str(e)}"
# Gradio Interface Setup
def create_gradio_interface():
    """Build the Gradio interface: one PDF upload in, an Excel file and a status text out."""
    upload_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_outputs = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=upload_input,
        outputs=result_outputs,
        title="PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )


if __name__ == "__main__":
    # Start the app only when executed as a script.
    create_gradio_interface().launch()