Spaces:

jithenderchoudary
/

poext

Sleeping

App Files Files Community

jithenderchoudary committed on Nov 4, 2024

Commit

8ff450e

verified ·

1 Parent(s): 47d2da2

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -6

app.py CHANGED Viewed

@@ -2,28 +2,30 @@ import fitz  # PyMuPDF
 import pandas as pd
 import gradio as gr
 import tempfile
 def extract_po_to_excel(pdf_file):
     try:
-        # Load PDF and extract text
         with fitz.open(pdf_file.name) as pdf:
             data = []
             for page_num in range(pdf.page_count):
                 page = pdf[page_num]
                 text = page.get_text("text")
                 # Simple example of extraction (customize parsing as needed)
                 lines = text.splitlines()
                 for line in lines:
-                    # Only extract lines with known keywords (sample logic; adjust as necessary)
                     if "Pos." in line or "Item Code" in line:
                         data.append(line)
         # Example structure, parse `data` into structured format
         structured_data = []
         for line in data:
-            # Custom parsing logic goes here; here's a basic split by spaces
-            # Adjust parsing to match your actual data needs
             parts = line.split()
             if len(parts) > 1:
                 structured_data.append({
@@ -34,16 +36,20 @@ def extract_po_to_excel(pdf_file):
         # Create DataFrame and export to Excel
         df = pd.DataFrame(structured_data)
         # Save to temporary file
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
         df.to_excel(temp_file.name, index=False)
         temp_file.close()
         return temp_file.name
     except Exception as e:
-        print(f"Error: {e}")
         return None
 def main(pdf_file):
@@ -66,3 +72,4 @@ if __name__ == "__main__":
     interface.launch()

 import pandas as pd
 import gradio as gr
 import tempfile
+import traceback
 def extract_po_to_excel(pdf_file):
     try:
+        # Attempt to open and read the PDF file
+        print("Starting PDF extraction process.")
         with fitz.open(pdf_file.name) as pdf:
             data = []
+            print("PDF opened successfully.")
             for page_num in range(pdf.page_count):
                 page = pdf[page_num]
                 text = page.get_text("text")
+                print(f"Extracted text from page {page_num + 1}")
                 # Simple example of extraction (customize parsing as needed)
                 lines = text.splitlines()
                 for line in lines:
                     if "Pos." in line or "Item Code" in line:
                         data.append(line)
         # Example structure, parse `data` into structured format
         structured_data = []
         for line in data:
             parts = line.split()
             if len(parts) > 1:
                 structured_data.append({
         # Create DataFrame and export to Excel
         df = pd.DataFrame(structured_data)
+        print("DataFrame created successfully.")
         # Save to temporary file
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
         df.to_excel(temp_file.name, index=False)
         temp_file.close()
+        print(f"Excel file saved at {temp_file.name}")
         return temp_file.name
     except Exception as e:
+        # Capture and print the full traceback for debugging
+        print("An error occurred during PDF to Excel conversion.")
+        traceback.print_exc()
         return None
 def main(pdf_file):
     interface.launch()