Spaces:

dschandra
/

ToshibaPO

Sleeping

App Files Files Community

dschandra committed on Dec 12, 2024

Commit

f705371

verified ·

1 Parent(s): b61deb6

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -123

app.py CHANGED Viewed

@@ -3,150 +3,104 @@ import pdfplumber
 import pandas as pd
 import re
def extract_item_code(lines, start_index):
    """
    Extract the numeric part of the Item Code from the lines after a row.

    Lines from ``start_index`` onward are collected until a line made up
    solely of digits is reached (treated as the start of the next row).
    Lines containing tax/total keywords are ignored. The first run of 6 to
    12 digits in the collected text is returned.

    Args:
        lines: Sequence of text lines.
        start_index: Index of the first line to examine.

    Returns:
        The matched digit string, or "MISSING" when no such run is found.
    """
    skip_keywords = (
        "Calculation Method",
        "Landed Cost",
        "SUB TOTAL",
        "Central GST",
        "State GST",
    )
    fragments = []
    for line in lines[start_index:]:
        stripped = line.strip()
        # A digits-only line marks the beginning of the next row.
        if stripped.isdigit():
            break
        if any(keyword in line for keyword in skip_keywords):
            continue
        fragments.append(stripped)

    # Join once instead of growing a string inside the loop.
    candidate = " ".join(fragments)
    match = re.search(r"(\d{6,12})", candidate)
    if match:
        return match.group(1)
    print(f"Failed to extract numeric Item Code from: {candidate}")
    return "MISSING"
def extract_row_fields(line):
    """
    Extract fields like Unit, Delivery Date, Quantity, Basic Price, etc.

    The line is split on whitespace. ``pos`` is the first token; the other
    fields are read by position from the end of the line, so any free text
    in the middle of the row does not shift them. Numeric tokens may contain
    thousands separators (e.g. "1,250.00").

    Args:
        line: One text line of the table.

    Returns:
        Tuple ``(pos, unit, delivery_date, quantity, basic_price, discount,
        cur, amount)``. Fields the line is too short to hold default to
        ``""`` or ``0.0``. If a numeric token cannot be parsed, every field
        is returned as its default.
    """
    def _to_float(token):
        # float() rejects "1,000.00", so drop the grouping commas first.
        return float(token.replace(",", ""))

    parts = line.split()
    count = len(parts)
    try:
        pos = parts[0] if count > 0 else ""
        unit = parts[-7] if count > 6 else ""
        delivery_date = parts[-6] if count > 5 else ""
        quantity = _to_float(parts[-5]) if count > 4 else 0.0
        basic_price = _to_float(parts[-4]) if count > 3 else 0.0
        discount = _to_float(parts[-3]) if count > 2 else 0.0
        cur = parts[-2] if count > 1 else ""
        amount = _to_float(parts[-1]) if count > 0 else 0.0
    except ValueError as e:
        # Every index above is length-guarded, so only float parsing can fail.
        print(f"Error extracting row fields: {e}")
        return "", "", "", 0.0, 0.0, 0.0, "", 0.0
    return pos, unit, delivery_date, quantity, basic_price, discount, cur, amount
def calculate_totals(amount, cgst_rate=0.09, sgst_rate=0.09):
    """
    Calculate CGST, SGST, and Sub Total.

    Args:
        amount: Pre-tax amount.
        cgst_rate: Central GST rate as a fraction (default 0.09, i.e. 9%).
        sgst_rate: State GST rate as a fraction (default 0.09, i.e. 9%).

    Returns:
        Tuple ``(cgst, sgst, sub_total)`` where ``sub_total`` is the amount
        plus both taxes.
    """
    cgst = amount * cgst_rate
    sgst = amount * sgst_rate
    sub_total = amount + cgst + sgst
    return cgst, sgst, sub_total
def extract_data(pdf_file):
    """
    Extract data from the uploaded PDF and write it to an Excel workbook.

    Every text line whose first token is an integer from 10 to 450 is
    treated as a table row. Its Item Code is looked up in the lines that
    follow, the remaining fields are read from the line itself, and the
    GST totals are computed from the row amount.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        Path of the written workbook ("/tmp/Extracted_PO_Data.xlsx").
    """
    data = []
    skipped_rows = []  # Lines that raised while being processed
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without a text layer may yield no text at all.
            text = (page.extract_text() or "").splitlines()
            print(f"Page {page.page_number} Text: {text}")  # Debug raw text
            for i, line in enumerate(text):
                parts = line.split()
                if not parts:
                    # Blank lines are not rows and are not worth logging.
                    continue
                try:
                    pos = int(parts[0]) if parts[0].isdigit() else None
                    if pos and 10 <= pos <= 450:
                        # Item Code is located in the lines after the row
                        item_code = extract_item_code(text, i + 1)
                        # Remaining fields come from the row line itself
                        (pos, unit, delivery_date, quantity, basic_price,
                         discount, cur, amount) = extract_row_fields(line)
                        cgst, sgst, sub_total = calculate_totals(amount)
                        data.append([
                            pos, item_code, unit, delivery_date, quantity,
                            basic_price, discount, cur, amount,
                            cgst, sgst, sub_total,
                        ])
                except Exception as e:
                    # Best effort: one bad line must not abort the whole file.
                    print(f"Error processing line: {line} | Error: {e}")
                    skipped_rows.append(line)
                    continue

    # Create DataFrame
    df = pd.DataFrame(data, columns=["Pos", "Item Code", "Unit", "Delivery Date",
                                     "Quantity", "Basic Price", "Discount", "Cur", "Amount",
                                     "Central GST", "State GST", "Sub Total"])
    # Log skipped rows for debugging
    if skipped_rows:
        print(f"Skipped Rows: {skipped_rows}")
    # Save to Excel
    excel_path = "/tmp/Extracted_PO_Data.xlsx"
    df.to_excel(excel_path, index=False)
    return excel_path
 # Gradio interface
def run_gradio_interface():
    """
    Build the PDF-upload / Excel-download Gradio app and start serving it.
    """
    pdf_input = gr.File(label="Upload PDF")
    excel_output = gr.File(label="Download Excel")
    gr.Interface(
        fn=extract_data,
        inputs=pdf_input,
        outputs=excel_output,
        title="PO Data Extractor",
        description="Upload a PDF file to extract Purchase Order data.",
    ).launch()


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    run_gradio_interface()

 import pandas as pd
 import re
def extract_cleaned_po_data(pdf_file):
    """
    Extract and clean data from a Toshiba PO PDF file.

    A line that starts with a POS number followed by a 12-digit Item Code
    opens a new row. Following lines are cleaned of tax/calculation noise
    and collected as that row's description until a "SUB TOTAL" line closes
    the row. The description is then folded into the "Item Code" cell on a
    second line. Rows with POS outside 10..450 are dropped, and the POS
    numbers in that range that were not found are printed.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        Path of the written workbook ("cleaned_extracted_po_data.xlsx",
        relative to the current working directory).
    """
    # Compiled once; these run against every line of every page.
    row_start = re.compile(r"^\d+\s+\d{12}\s+")
    # Accepts "1234.56", "1,234.56" and "1234".
    sub_total_value = re.compile(r"SUB TOTAL\s*:\s*(\d[\d,]*(?:\.\d+)?)")
    description_noise = re.compile(
        r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|\s{2,}|[A-Za-z]+:[0-9\.]+)"
    )

    extracted_data = []
    current_row = {}
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if row_start.match(line):
                    # Nine fields are needed; anything after them is ignored.
                    parts = re.split(r"\s+", line, maxsplit=9)
                    if len(parts) >= 9:
                        # A new row closes a previous one left without SUB TOTAL.
                        if current_row:
                            extracted_data.append(current_row)
                        current_row = {
                            "Pos": parts[0],
                            "Item Code": parts[1],
                            "Description": "",
                            "Unit": parts[2],
                            "Delivery Date": parts[3],
                            "Quantity": parts[4],
                            "Basic Price": parts[5],
                            "Discount": parts[6],
                            "Cur": parts[7],
                            "Amount": parts[8],
                            "Sub Total": "",
                        }
                elif "SUB TOTAL" in line and current_row:
                    sub_total_match = sub_total_value.search(line)
                    if sub_total_match:
                        current_row["Sub Total"] = sub_total_match.group(1)
                    extracted_data.append(current_row)
                    current_row = {}
                elif current_row:
                    clean_line = description_noise.sub("", line).strip()
                    if clean_line:
                        # Keep a space between fragments from separate lines.
                        current_row["Description"] = (
                            f"{current_row['Description']} {clean_line}".strip()
                        )

    # Add the last row if it was never closed by a SUB TOTAL line
    if current_row:
        extracted_data.append(current_row)

    # Combine Item Code and Description into a single two-line cell
    for row in extracted_data:
        description = row.pop("Description", "")
        row["Item Code"] = f"{row['Item Code']}\n{description}".strip()

    columns = [
        "Pos", "Item Code", "Unit", "Delivery Date", "Quantity",
        "Basic Price", "Discount", "Cur", "Amount", "Sub Total",
    ]
    df = pd.DataFrame(extracted_data, columns=columns)

    # Ensure Pos is numeric and keep only POS 10 to POS 450
    df["Pos"] = pd.to_numeric(df["Pos"], errors="coerce")
    df = df[df["Pos"].between(10, 450)]

    # Report POS numbers in the expected range that were not extracted
    expected_pos = set(range(10, 451))
    extracted_pos = set(df["Pos"].dropna().astype(int))
    missing_pos = sorted(expected_pos - extracted_pos)
    print("Missing POS numbers:", missing_pos)  # Debug output to identify skipped POS numbers

    # Save as Excel for download
    output_path = "cleaned_extracted_po_data.xlsx"
    df.to_excel(output_path, index=False)
    return output_path
 # Gradio interface
def process_pdf(file):
    """
    Gradio callback: convert the uploaded PO PDF into an Excel workbook.

    Args:
        file: The upload as delivered by ``gr.File``: either an object with
            a ``name`` attribute holding the path, or the path itself.

    Returns:
        Path of the generated Excel file.

    Raises:
        ValueError: If no file was uploaded.
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")
    # Use .name when the upload object has it, otherwise treat it as the path.
    pdf_path = getattr(file, "name", file)
    return extract_cleaned_po_data(pdf_path)
# Create the two file widgets first so the Interface call is pure wiring.
_pdf_upload = gr.File(label="Upload Toshiba PO PDF")
_excel_download = gr.File(label="Download Cleaned Extracted Excel")
iface = gr.Interface(
    fn=process_pdf,
    inputs=_pdf_upload,
    outputs=_excel_download,
    title="Toshiba PO Data Extraction",
    description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.",
)

# Serve the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()