Spaces:

jithenderchoudary
/

pdfextract1

Sleeping

File size: 3,046 Bytes

e19bc17
 
 
69affcd
9c8fd6c
fc6645c
9c8fd6c
fc6645c
9c8fd6c
fc6645c
9c8fd6c
 
 
 
 
 
 
 
 
 
 
fc6645c
b9170e5
 
 
 
 
 
 
 
fc6645c
c2f4a04
18bd25a
fc6645c
18bd25a
 
 
 
 
 
 
e0379be
e19bc17
fc6645c
18bd25a
fc6645c
18bd25a
 
 
 
 
 
 
 
fc6645c
18bd25a
e0379be
 
 
 
 
 
 
18bd25a
e0379be
18bd25a
e19bc17
fc6645c
e19bc17
 
 
c2f4a04
2c84c62
 
e19bc17
2c84c62
e19bc17
3e6ed9b
fc6645c
 
 
 
 
 
 
 
 
e19bc17

import gradio as gr
import pdfplumber
import pandas as pd
import os

# Input spreadsheet that is filtered at startup, and where its filtered copy goes.
input_file_path = '/mnt/data/extracted_data (1).xlsx'
output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'

# Directory that holds the input and the generated spreadsheets.
directory_path = "/mnt/data/"

# Create the directory if it is missing. exist_ok=True replaces the earlier
# "check, then create" sequence (which could raise if the directory appeared
# between the two calls) and makes a second existence check unnecessary.
os.makedirs(directory_path, exist_ok=True)

# Load the input spreadsheet when it is present; a missing file is only reported.
if os.path.exists(input_file_path):
    df = pd.read_excel(input_file_path)
    print("File loaded successfully.")
else:
    print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")

# Print the directory contents.
files = os.listdir(directory_path)
print("Files in /mnt/data/:", files)

# Define the function to extract data from PDF
def extract_data(pdf_file_path, start_pos, end_pos):
    """Read a PDF and write the rows in the requested position range to Excel.

    The PDF is opened with pdfplumber and the text of every page is read and
    printed. The table that gets written is still the hard-coded example
    below: the extracted text is not parsed yet.

    Args:
        pdf_file_path: Path of the PDF file to open.
        start_pos: Lowest "Pos" value to keep (inclusive). None disables the
            lower bound.
        end_pos: Highest "Pos" value to keep (inclusive). None disables the
            upper bound.

    Returns:
        The path of the written Excel file on success, otherwise a message
        string that starts with "Error:".
    """
    # NOTE(review): error messages are returned as plain strings, exactly as
    # before; confirm how the UI's file output presents such a value.
    output_path = "/mnt/data/extracted_data.xlsx"

    try:
        # Load the PDF and pull the text of every page.
        with pdfplumber.open(pdf_file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        # Pages for which no text came back (None or an empty string) are
        # skipped instead of aborting the whole run; it is only an error when
        # no page at all produced text.
        readable_texts = [text for text in page_texts if text]
        if not readable_texts:
            return "Error: Could not extract text from the PDF. Please check the file format."

        for text in readable_texts:
            print("Extracted Text:", text)  # Debugging line

        # Example extracted data structure
        extracted_data = {
            "Pos": [10, 20, 30],
            "Item Code": ["155569003011", "155569003012", "155569003013"],
            "Quantity": [10, 10, 10],
            "Basic Price": [57.66, 57.66, 57.66],
            "Sub Total": [576.60, 576.60, 576.60],
        }
        df = pd.DataFrame(extracted_data)

        # Keep only the rows inside the requested, inclusive position range.
        in_range = pd.Series(True, index=df.index)
        if start_pos is not None:
            in_range &= df["Pos"] >= start_pos
        if end_pos is not None:
            in_range &= df["Pos"] <= end_pos
        df = df[in_range]

        # Make sure the target directory exists before writing the workbook.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df.to_excel(output_path, index=False)

        if os.path.exists(output_path):
            print("File saved successfully:", output_path)
            return output_path
        return "Error: Failed to save the Excel file."

    except Exception as e:
        # Boundary handler: any failure is logged and reported as a message
        # rather than propagated.
        print("Error encountered:", str(e))
        return f"Error: {e}"

# Set up Gradio interface: a PDF upload plus the position range, returning
# the generated Excel file for download.
interface = gr.Interface(
    fn=extract_data,
    inputs=[
        gr.File(type="filepath", label="Upload PDF File"),
        gr.Number(value=10, label="Start Position"),
        gr.Number(value=450, label="End Position")
    ],
    outputs=gr.File(label="Download Extracted Excel")
)

# Additional Excel filtering logic, run once at startup before the UI launches.
if os.path.exists(input_file_path):
    df = pd.read_excel(input_file_path)
    if 'Pos' in df.columns:
        # Coerce to numbers so that text cells cannot raise during the
        # comparison; values that are not numeric become NaN and are dropped.
        positions = pd.to_numeric(df['Pos'], errors='coerce')
        # Filter for positions between 10 and 450 (both bounds included)
        filtered_df = df[positions.between(10, 450)]
        filtered_df.to_excel(output_file_path, index=False)
        print(f"Filtered data saved to: {output_file_path}")
    else:
        # A spreadsheet without the column must not stop the app from starting.
        print(f"Column 'Pos' not found in {input_file_path}. Skipping filtering.")
else:
    print(f"Input file not found: {input_file_path}. Skipping filtering.")

interface.launch()