# Page header captured with this file: Spaces: Sleeping; File size: 3,046 Bytes
import gradio as gr
import pdfplumber
import pandas as pd
import os
# Locations of the optional input workbook, the filtered output workbook, and
# the directory that holds both.
input_file_path = '/mnt/data/extracted_data (1).xlsx'
output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'
directory_path = "/mnt/data/"

# Create the data directory up front. exist_ok=True replaces the separate
# exists()/makedirs() pair, and makes a second "does the directory exist"
# check further down unnecessary: after this call it always exists.
os.makedirs(directory_path, exist_ok=True)

# Load the input workbook when it is present; a missing file is reported
# rather than treated as fatal.
if os.path.exists(input_file_path):
    df = pd.read_excel(input_file_path)
    print("File loaded successfully.")
else:
    print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")

# Show what is actually in the data directory, to help diagnose a missing
# or misnamed input file.
files = os.listdir(directory_path)
print("Files in /mnt/data/:", files)
# Define the function to extract data from PDF
def extract_data(pdf_file_path, start_pos, end_pos):
    """Read a PDF and save the rows whose "Pos" lies in a range to Excel.

    Each page of the PDF is opened with pdfplumber and its text is printed
    for debugging. The rows written to the workbook are still the hard-coded
    example table below; deriving them from the page text is not implemented.

    Args:
        pdf_file_path: Path of the PDF file to open.
        start_pos: Lowest "Pos" value to keep (inclusive), or None for no
            lower bound.
        end_pos: Highest "Pos" value to keep (inclusive), or None for no
            upper bound.

    Returns:
        The path of the written Excel file on success; otherwise a string
        beginning with "Error:" that describes what went wrong.
    """
    # NOTE(review): failures are reported as "Error: ..." strings rather than
    # raised. A caller that expects a file path will receive that string
    # instead -- confirm this is handled where the result is consumed.
    output_path = "/mnt/data/extracted_data.xlsx"
    try:
        # Load the PDF and make sure text can be extracted from every page.
        with pdfplumber.open(pdf_file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text is None:
                    return "Error: Could not extract text from the PDF. Please check the file format."
                print("Extracted Text:", text)  # Debugging line

        # Example extracted data structure.
        # NOTE(review): placeholder rows; they do not depend on the PDF text.
        extracted_data = {
            "Pos": [10, 20, 30],
            "Item Code": ["155569003011", "155569003012", "155569003013"],
            "Quantity": [10, 10, 10],
            "Basic Price": [57.66, 57.66, 57.66],
            "Sub Total": [576.60, 576.60, 576.60],
        }
        df = pd.DataFrame(extracted_data)

        # Honor the requested position range (inclusive on both ends).
        # A bound of None means that side of the range is left open.
        if start_pos is not None:
            df = df[df["Pos"] >= start_pos]
        if end_pos is not None:
            df = df[df["Pos"] <= end_pos]

        # Make sure the target directory exists so the save cannot fail
        # merely because it is missing.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_excel(output_path, index=False)

        if os.path.exists(output_path):
            print("File saved successfully:", output_path)
            return output_path
        return "Error: Failed to save the Excel file."
    except Exception as e:
        # Boundary handler: report any failure as a message instead of
        # letting it propagate out of the callback.
        print("Error encountered:", str(e))
        return f"Error: {e}"
# Build the Gradio UI: a PDF upload plus a position range as inputs, and a
# downloadable Excel file as the output.
pdf_upload = gr.File(type="filepath", label="Upload PDF File")
start_position = gr.Number(value=10, label="Start Position")
end_position = gr.Number(value=450, label="End Position")
excel_download = gr.File(label="Download Extracted Excel")

interface = gr.Interface(
    fn=extract_data,
    inputs=[pdf_upload, start_position, end_position],
    outputs=excel_download,
)
# One-off export run at start-up: when the input workbook is present, write a
# copy containing only the rows whose "Pos" is between 10 and 450 (inclusive).
if not os.path.exists(input_file_path):
    print(f"Input file not found: {input_file_path}. Skipping filtering.")
else:
    df = pd.read_excel(input_file_path)
    in_range = df['Pos'].between(10, 450)
    filtered_df = df[in_range]
    filtered_df.to_excel(output_file_path, index=False)
    print(f"Filtered data saved to: {output_file_path}")

# Start serving the web UI.
interface.launch()
|