Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pdfplumber
|
| 3 |
import pandas as pd
|
| 4 |
-
import openpyxl # Explicitly import openpyxl for Excel support
|
| 5 |
import os
|
| 6 |
-
import pandas as pd
|
| 7 |
|
| 8 |
-
# Define the path to the input Excel file
|
| 9 |
input_file_path = '/mnt/data/extracted_data (1).xlsx'
|
|
|
|
| 10 |
|
| 11 |
-
# Ensure the directory exists
|
| 12 |
if not os.path.exists('/mnt/data/'):
|
| 13 |
os.makedirs('/mnt/data/')
|
| 14 |
|
|
@@ -20,53 +19,19 @@ if os.path.exists(input_file_path):
|
|
| 20 |
else:
|
| 21 |
print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")
|
| 22 |
|
| 23 |
-
|
| 24 |
directory_path = "/mnt/data/"
|
| 25 |
-
|
| 26 |
if os.path.exists(directory_path):
|
| 27 |
-
# List files if the directory exists
|
| 28 |
files = os.listdir(directory_path)
|
| 29 |
print("Files in /mnt/data/:", files)
|
| 30 |
else:
|
| 31 |
-
print("Directory does not exist:", directory_path)
|
| 32 |
-
# Optionally, create the directory if needed
|
| 33 |
os.makedirs(directory_path)
|
| 34 |
print("Directory created:", directory_path)
|
| 35 |
|
| 36 |
-
|
| 37 |
-
# Primary directory
|
| 38 |
-
directory_path = "/mnt/data/"
|
| 39 |
-
|
| 40 |
-
# Check if primary directory exists; use current directory as fallback
|
| 41 |
-
if not os.path.exists(directory_path):
|
| 42 |
-
print(f"Directory {directory_path} does not exist. Using current directory instead.")
|
| 43 |
-
directory_path = "."
|
| 44 |
-
|
| 45 |
-
# List files in the confirmed directory path
|
| 46 |
-
files = os.listdir(directory_path)
|
| 47 |
-
print("Files in directory:", files)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
directory_path = "/mnt/data/"
|
| 51 |
-
|
| 52 |
-
try:
|
| 53 |
-
files = os.listdir(directory_path)
|
| 54 |
-
print("Files in /mnt/data/:", files)
|
| 55 |
-
except FileNotFoundError:
|
| 56 |
-
print(f"Error: The directory {directory_path} does not exist.")
|
| 57 |
-
# Handle the error by either using another directory or creating the directory
|
| 58 |
-
os.makedirs(directory_path)
|
| 59 |
-
print("Created directory:", directory_path)
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
# List all files in the /mnt/data/ directory
|
| 63 |
-
files = os.listdir("/mnt/data/")
|
| 64 |
-
print("Files in /mnt/data/:", files)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
def extract_data(pdf_file_path, start_pos, end_pos):
|
| 68 |
try:
|
| 69 |
-
# Load and process the PDF
|
| 70 |
with pdfplumber.open(pdf_file_path) as pdf:
|
| 71 |
data = []
|
| 72 |
for page in pdf.pages:
|
|
@@ -74,15 +39,11 @@ def extract_data(pdf_file_path, start_pos, end_pos):
|
|
| 74 |
if text is None:
|
| 75 |
return "Error: Could not extract text from the PDF. Please check the file format."
|
| 76 |
|
| 77 |
-
# Print text for debugging
|
| 78 |
print("Extracted Text:", text) # Debugging line
|
| 79 |
|
| 80 |
-
|
| 81 |
-
# Append rows within start_pos to end_pos range
|
| 82 |
-
|
| 83 |
-
# Example data structure to simulate output
|
| 84 |
extracted_data = {
|
| 85 |
-
"Pos": [10, 20, 30],
|
| 86 |
"Item Code": ["155569003011", "155569003012", "155569003013"],
|
| 87 |
"Quantity": [10, 10, 10],
|
| 88 |
"Basic Price": [57.66, 57.66, 57.66],
|
|
@@ -91,11 +52,9 @@ def extract_data(pdf_file_path, start_pos, end_pos):
|
|
| 91 |
|
| 92 |
# Convert to DataFrame and save to Excel
|
| 93 |
df = pd.DataFrame(extracted_data)
|
| 94 |
-
output_path = "/
|
| 95 |
df.to_excel(output_path, index=False)
|
| 96 |
|
| 97 |
-
# Verify that file exists
|
| 98 |
-
import os
|
| 99 |
if os.path.exists(output_path):
|
| 100 |
print("File saved successfully:", output_path)
|
| 101 |
return output_path
|
|
@@ -103,11 +62,10 @@ def extract_data(pdf_file_path, start_pos, end_pos):
|
|
| 103 |
return "Error: Failed to save the Excel file."
|
| 104 |
|
| 105 |
except Exception as e:
|
| 106 |
-
# Log and return any exceptions
|
| 107 |
print("Error encountered:", str(e))
|
| 108 |
return f"Error: {e}"
|
| 109 |
|
| 110 |
-
# Gradio interface
|
| 111 |
interface = gr.Interface(
|
| 112 |
fn=extract_data,
|
| 113 |
inputs=[
|
|
@@ -117,19 +75,15 @@ interface = gr.Interface(
|
|
| 117 |
],
|
| 118 |
outputs=gr.File(label="Download Extracted Excel")
|
| 119 |
)
|
| 120 |
-
# Load the uploaded Excel file
|
| 121 |
-
input_file_path = '/mnt/data/extracted_data (1).xlsx' # Update with actual path if needed
|
| 122 |
-
output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'
|
| 123 |
|
| 124 |
-
#
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
# Filter for positions between 10 and 450
|
| 128 |
-
filtered_df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
output_file_path # Path to the filtered Excel file
|
| 134 |
|
| 135 |
interface.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pdfplumber
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
+
# Define the path for input and output
|
| 7 |
input_file_path = '/mnt/data/extracted_data (1).xlsx'
|
| 8 |
+
output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'
|
| 9 |
|
| 10 |
+
# Ensure the directory exists
|
| 11 |
if not os.path.exists('/mnt/data/'):
|
| 12 |
os.makedirs('/mnt/data/')
|
| 13 |
|
|
|
|
| 19 |
else:
|
| 20 |
print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")
|
| 21 |
|
| 22 |
+
# Check if directory exists and list files
|
| 23 |
directory_path = "/mnt/data/"
|
|
|
|
| 24 |
if os.path.exists(directory_path):
|
|
|
|
| 25 |
files = os.listdir(directory_path)
|
| 26 |
print("Files in /mnt/data/:", files)
|
| 27 |
else:
|
|
|
|
|
|
|
| 28 |
os.makedirs(directory_path)
|
| 29 |
print("Directory created:", directory_path)
|
| 30 |
|
| 31 |
+
# Define the function to extract data from PDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def extract_data(pdf_file_path, start_pos, end_pos):
|
| 33 |
try:
|
| 34 |
+
# Load and process the PDF
|
| 35 |
with pdfplumber.open(pdf_file_path) as pdf:
|
| 36 |
data = []
|
| 37 |
for page in pdf.pages:
|
|
|
|
| 39 |
if text is None:
|
| 40 |
return "Error: Could not extract text from the PDF. Please check the file format."
|
| 41 |
|
|
|
|
| 42 |
print("Extracted Text:", text) # Debugging line
|
| 43 |
|
| 44 |
+
# Example extracted data structure
|
|
|
|
|
|
|
|
|
|
| 45 |
extracted_data = {
|
| 46 |
+
"Pos": [10, 20, 30],
|
| 47 |
"Item Code": ["155569003011", "155569003012", "155569003013"],
|
| 48 |
"Quantity": [10, 10, 10],
|
| 49 |
"Basic Price": [57.66, 57.66, 57.66],
|
|
|
|
| 52 |
|
| 53 |
# Convert to DataFrame and save to Excel
|
| 54 |
df = pd.DataFrame(extracted_data)
|
| 55 |
+
output_path = "/mnt/data/extracted_data.xlsx"
|
| 56 |
df.to_excel(output_path, index=False)
|
| 57 |
|
|
|
|
|
|
|
| 58 |
if os.path.exists(output_path):
|
| 59 |
print("File saved successfully:", output_path)
|
| 60 |
return output_path
|
|
|
|
| 62 |
return "Error: Failed to save the Excel file."
|
| 63 |
|
| 64 |
except Exception as e:
|
|
|
|
| 65 |
print("Error encountered:", str(e))
|
| 66 |
return f"Error: {e}"
|
| 67 |
|
| 68 |
+
# Set up Gradio interface
|
| 69 |
interface = gr.Interface(
|
| 70 |
fn=extract_data,
|
| 71 |
inputs=[
|
|
|
|
| 75 |
],
|
| 76 |
outputs=gr.File(label="Download Extracted Excel")
|
| 77 |
)
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
+
# Additional Excel filtering logic
|
| 80 |
+
if os.path.exists(input_file_path):
|
| 81 |
+
df = pd.read_excel(input_file_path)
|
| 82 |
+
# Filter for positions between 10 and 450
|
| 83 |
+
filtered_df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]
|
| 84 |
+
filtered_df.to_excel(output_file_path, index=False)
|
| 85 |
+
print(f"Filtered data saved to: {output_file_path}")
|
| 86 |
+
else:
|
| 87 |
+
print(f"Input file not found: {input_file_path}. Skipping filtering.")
|
|
|
|
| 88 |
|
| 89 |
interface.launch()
|