Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import pandas as pd | |
| from transformers import pipeline | |
| import base64 | |
# Convert an uploaded PDF into a DataFrame of text lines.
def pdf_to_dataframe(uploaded_file):
    """Extract the text of a PDF into a one-column DataFrame.

    The text is read with PyMuPDF (``fitz``) page by page, joined with
    newlines, and then split into individual lines.

    Args:
        uploaded_file: The uploaded PDF. May be ``None`` (nothing uploaded),
            a filesystem path string, an object whose ``name`` attribute is
            a path string (e.g. a temp-file wrapper), or a binary file-like
            object exposing ``read()``.

    Returns:
        A ``pandas.DataFrame`` with a single ``"Text"`` column holding one
        row per line of extracted text. The frame is empty when
        ``uploaded_file`` is ``None``.
    """
    # Always hand back a frame with the expected column so callers can
    # write it out without special-casing a missing upload.
    if uploaded_file is None:
        return pd.DataFrame(columns=["Text"])

    # Resolve a path when one is available; otherwise fall back to the
    # in-memory bytes of a file-like object.
    if isinstance(uploaded_file, str):
        pdf_path = uploaded_file
    else:
        pdf_path = getattr(uploaded_file, "name", None)

    if isinstance(pdf_path, str):
        doc = fitz.open(pdf_path)
    else:
        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")

    # NOTE(review): this reads the PDF's text layer only; image-only
    # (scanned) pages presumably yield empty text -- confirm whether real
    # OCR is required.
    with doc:
        page_texts = [page.get_text("text") for page in doc]

    lines = "\n".join(page_texts).split("\n")
    return pd.DataFrame({"Text": lines})
# Save a DataFrame to an Excel file.
def dataframe_to_excel(df, excel_path):
    """Write ``df`` to an Excel workbook at ``excel_path``.

    Args:
        df: The DataFrame to export.
        excel_path: Destination path of the Excel file.

    The row index is not written to the sheet (``index=False``).
    """
    df.to_excel(excel_path, index=False)
# Application entry point: build and launch the Gradio interface.
def main():
    """Build the PDF-to-Excel Gradio interface and launch it."""

    def pdf_to_excel_function(pdf_file):
        """Convert the uploaded PDF and return the path of the Excel file.

        Args:
            pdf_file: The value Gradio passes for the uploaded file, or
                ``None`` when the form is submitted without an upload.

        Returns:
            The path of the generated Excel workbook.

        Raises:
            gr.Error: If no file was uploaded.
        """
        # Show a readable message in the UI instead of failing somewhere
        # inside the conversion when nothing was uploaded.
        if pdf_file is None:
            raise gr.Error("Please upload a PDF file.")

        excel_path = "output.xlsx"
        dataframe_to_excel(pdf_to_dataframe(pdf_file), excel_path)
        return excel_path

    # Create the Gradio interface
    iface = gr.Interface(
        fn=pdf_to_excel_function,
        inputs=gr.File(label="Upload PDF File"),
        outputs=gr.File(label="Download Excel File"),
        title="PDF to Excel Converter",
        description="Convert a PDF file to an Excel file.",
    )

    # Launch the interface
    iface.launch()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()