Spaces:

jithenderchoudary
/

pdfextract1

Sleeping

App Files Community

jithenderchoudary committed on Nov 7, 2024

Commit

fc6645c

verified ·

1 Parent(s): 9c8fd6c

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -65

app.py CHANGED Viewed

@@ -1,14 +1,13 @@
 import gradio as gr
 import pdfplumber
 import pandas as pd
-import openpyxl  # Explicitly import openpyxl for Excel support
 import os
-import pandas as pd
-# Define the path
 input_file_path = '/mnt/data/extracted_data (1).xlsx'
-# Check if the directory exists and create it if it doesn't
 if not os.path.exists('/mnt/data/'):
     os.makedirs('/mnt/data/')
@@ -20,53 +19,19 @@ if os.path.exists(input_file_path):
 else:
     print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")
 directory_path = "/mnt/data/"
 if os.path.exists(directory_path):
-    # List files if the directory exists
     files = os.listdir(directory_path)
     print("Files in /mnt/data/:", files)
 else:
-    print("Directory does not exist:", directory_path)
-    # Optionally, create the directory if needed
     os.makedirs(directory_path)
     print("Directory created:", directory_path)
-# Primary directory
-directory_path = "/mnt/data/"
-# Check if primary directory exists; use current directory as fallback
-if not os.path.exists(directory_path):
-    print(f"Directory {directory_path} does not exist. Using current directory instead.")
-    directory_path = "."
-# List files in the confirmed directory path
-files = os.listdir(directory_path)
-print("Files in directory:", files)
-directory_path = "/mnt/data/"
-try:
-    files = os.listdir(directory_path)
-    print("Files in /mnt/data/:", files)
-except FileNotFoundError:
-    print(f"Error: The directory {directory_path} does not exist.")
-    # Handle the error by either using another directory or creating the directory
-    os.makedirs(directory_path)
-    print("Created directory:", directory_path)
-# List all files in the /mnt/data/ directory
-files = os.listdir("/mnt/data/")
-print("Files in /mnt/data/:", files)
 def extract_data(pdf_file_path, start_pos, end_pos):
     try:
-        # Attempt to load and process the PDF
         with pdfplumber.open(pdf_file_path) as pdf:
             data = []
             for page in pdf.pages:
@@ -74,15 +39,11 @@ def extract_data(pdf_file_path, start_pos, end_pos):
                 if text is None:
                     return "Error: Could not extract text from the PDF. Please check the file format."
-                # Print text for debugging
                 print("Extracted Text:", text)  # Debugging line
-                # Placeholder data extraction logic; replace with actual extraction
-                # Append rows within start_pos to end_pos range
-            # Example data structure to simulate output
             extracted_data = {
-                "Pos": [10, 20, 30],  # Replace with actual data
                 "Item Code": ["155569003011", "155569003012", "155569003013"],
                 "Quantity": [10, 10, 10],
                 "Basic Price": [57.66, 57.66, 57.66],
@@ -91,11 +52,9 @@ def extract_data(pdf_file_path, start_pos, end_pos):
             # Convert to DataFrame and save to Excel
             df = pd.DataFrame(extracted_data)
-            output_path = "/tmp/extracted_data.xlsx"
             df.to_excel(output_path, index=False)
-            # Verify that file exists
-            import os
             if os.path.exists(output_path):
                 print("File saved successfully:", output_path)
                 return output_path
@@ -103,11 +62,10 @@ def extract_data(pdf_file_path, start_pos, end_pos):
                 return "Error: Failed to save the Excel file."
     except Exception as e:
-        # Log and return any exceptions
         print("Error encountered:", str(e))
         return f"Error: {e}"
-# Gradio interface setup
 interface = gr.Interface(
     fn=extract_data,
     inputs=[
@@ -117,19 +75,15 @@ interface = gr.Interface(
     ],
     outputs=gr.File(label="Download Extracted Excel")
 )
-# Load the uploaded Excel file
-input_file_path = '/mnt/data/extracted_data (1).xlsx'  # Update with actual path if needed
-output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'
-# Read the Excel file into a DataFrame
-df = pd.read_excel(input_file_path)
-# Filter the DataFrame for positions between 10 and 450
-filtered_df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]
-# Save the filtered data to a new Excel file
-filtered_df.to_excel(output_file_path, index=False)
-output_file_path  # Path to the filtered Excel file
 interface.launch()

 import gradio as gr
 import pdfplumber
 import pandas as pd
 import os
+# Define the path for input and output
 input_file_path = '/mnt/data/extracted_data (1).xlsx'
+output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'
+# Ensure the directory exists
 if not os.path.exists('/mnt/data/'):
     os.makedirs('/mnt/data/')
 else:
     print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")
+# Check if directory exists and list files
 directory_path = "/mnt/data/"
 if os.path.exists(directory_path):
     files = os.listdir(directory_path)
     print("Files in /mnt/data/:", files)
 else:
     os.makedirs(directory_path)
     print("Directory created:", directory_path)
+# Define the function to extract data from PDF
 def extract_data(pdf_file_path, start_pos, end_pos):
     try:
+        # Load and process the PDF
         with pdfplumber.open(pdf_file_path) as pdf:
             data = []
             for page in pdf.pages:
                 if text is None:
                     return "Error: Could not extract text from the PDF. Please check the file format."
                 print("Extracted Text:", text)  # Debugging line
+            # Example extracted data structure
             extracted_data = {
+                "Pos": [10, 20, 30],
                 "Item Code": ["155569003011", "155569003012", "155569003013"],
                 "Quantity": [10, 10, 10],
                 "Basic Price": [57.66, 57.66, 57.66],
             # Convert to DataFrame and save to Excel
             df = pd.DataFrame(extracted_data)
+            output_path = "/mnt/data/extracted_data.xlsx"
             df.to_excel(output_path, index=False)
             if os.path.exists(output_path):
                 print("File saved successfully:", output_path)
                 return output_path
                 return "Error: Failed to save the Excel file."
     except Exception as e:
         print("Error encountered:", str(e))
         return f"Error: {e}"
+# Set up Gradio interface
 interface = gr.Interface(
     fn=extract_data,
     inputs=[
     ],
     outputs=gr.File(label="Download Extracted Excel")
 )
+# Additional Excel filtering logic
+if os.path.exists(input_file_path):
+    df = pd.read_excel(input_file_path)
+    # Filter for positions between 10 and 450
+    filtered_df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]
+    filtered_df.to_excel(output_file_path, index=False)
+    print(f"Filtered data saved to: {output_file_path}")
+else:
+    print(f"Input file not found: {input_file_path}. Skipping filtering.")
 interface.launch()