"""Gradio app that converts an uploaded PDF into a downloadable Excel file.

On start-up the script also filters an existing workbook down to the default
position range and saves the result next to it, then launches the web UI.
"""

import os

import gradio as gr
import pandas as pd
import pdfplumber

# Define the path for input and output
directory_path = "/mnt/data/"
input_file_path = '/mnt/data/extracted_data (1).xlsx'
output_file_path = '/mnt/data/filtered_positions_10_to_450.xlsx'

# Default inclusive position range, shared by the UI fields and the start-up
# filtering step so the two cannot drift apart.
DEFAULT_START_POS = 10
DEFAULT_END_POS = 450

# Ensure the directory exists (no-op when it is already there)
os.makedirs(directory_path, exist_ok=True)

# Read the Excel file once if it exists; it is reused by the filtering step
# further down instead of being loaded a second time.
if os.path.exists(input_file_path):
    df = pd.read_excel(input_file_path)
    print("File loaded successfully.")
else:
    df = None
    print(f"File not found: {input_file_path}. Please ensure the file is in the directory.")

# List the files currently available in the data directory
files = os.listdir(directory_path)
print("Files in /mnt/data/:", files)


def _filter_positions(frame, start_pos, end_pos):
    """Return the rows of *frame* whose 'Pos' value lies in the given range.

    Both bounds are inclusive. A bound of ``None`` leaves that side of the
    range open.

    Raises:
        KeyError: if *frame* has no 'Pos' column.
    """
    positions = frame['Pos']
    mask = pd.Series(True, index=frame.index)
    if start_pos is not None:
        mask &= positions >= start_pos
    if end_pos is not None:
        mask &= positions <= end_pos
    return frame[mask]


# Define the function to extract data from PDF
def extract_data(pdf_file_path, start_pos, end_pos):
    """Extract data from a PDF and save the selected positions to Excel.

    Args:
        pdf_file_path: Path of the uploaded PDF file.
        start_pos: Lowest 'Pos' value to keep (inclusive); ``None`` for no
            lower bound.
        end_pos: Highest 'Pos' value to keep (inclusive); ``None`` for no
            upper bound.

    Returns:
        The path of the Excel file that was written.

    Raises:
        gr.Error: if no file was supplied, no text could be extracted from
            any page, or reading/writing failed. The output component is a
            ``gr.File``, so failures are raised rather than returned as a
            string that would be mistaken for a file path.
    """
    if not pdf_file_path:
        raise gr.Error("Error: No PDF file was uploaded.")

    output_path = "/mnt/data/extracted_data.xlsx"
    try:
        # Load and process the PDF
        page_texts = []
        with pdfplumber.open(pdf_file_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    # A page without a text layer should not abort the
                    # whole document; only fail if every page is empty.
                    continue
                print("Extracted Text:", text)  # Debugging line
                page_texts.append(text)

        if not page_texts:
            raise ValueError(
                "Could not extract text from the PDF. Please check the file format."
            )

        # Example extracted data structure
        # TODO: replace this placeholder with rows parsed from page_texts.
        extracted_data = {
            "Pos": [10, 20, 30],
            "Item Code": ["155569003011", "155569003012", "155569003013"],
            "Quantity": [10, 10, 10],
            "Basic Price": [57.66, 57.66, 57.66],
            "Sub Total": [576.60, 576.60, 576.60],
        }

        # Convert to DataFrame, keep the requested range and save to Excel
        result = _filter_positions(pd.DataFrame(extracted_data), start_pos, end_pos)
        result.to_excel(output_path, index=False)
    except Exception as e:
        print("Error encountered:", str(e))
        raise gr.Error(f"Error: {e}") from e

    print("File saved successfully:", output_path)
    return output_path


# Set up Gradio interface
interface = gr.Interface(
    fn=extract_data,
    inputs=[
        gr.File(type="filepath", label="Upload PDF File"),
        gr.Number(value=DEFAULT_START_POS, label="Start Position"),
        gr.Number(value=DEFAULT_END_POS, label="End Position"),
    ],
    outputs=gr.File(label="Download Extracted Excel"),
)

# Additional Excel filtering logic
if df is None:
    print(f"Input file not found: {input_file_path}. Skipping filtering.")
elif 'Pos' not in df.columns:
    # A workbook with an unexpected layout should not stop the UI from starting.
    print(f"Column 'Pos' not found in: {input_file_path}. Skipping filtering.")
else:
    # Filter for positions between 10 and 450
    filtered_df = _filter_positions(df, DEFAULT_START_POS, DEFAULT_END_POS)
    filtered_df.to_excel(output_file_path, index=False)
    print(f"Filtered data saved to: {output_file_path}")

interface.launch()