Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
import time
|
| 4 |
-
from
|
| 5 |
import spaces
|
| 6 |
|
| 7 |
def extract_text_with_langchain_pdf(pdf_file):
|
| 8 |
"""Extract text from a PDF using LangChain's UnstructuredFileLoader."""
|
| 9 |
-
loader = UnstructuredFileLoader(pdf_file
|
| 10 |
documents = loader.load()
|
| 11 |
|
| 12 |
# Collect text per page and return as a list of tuples (page_num, paragraph)
|
|
@@ -23,7 +23,7 @@ def extract_text_with_langchain_pdf(pdf_file):
|
|
| 23 |
def process_pdf_with_batches(pdf_file, batch_size, wait_time):
|
| 24 |
"""Extract text, split into batches, and store in a DataFrame."""
|
| 25 |
extracted_data = extract_text_with_langchain_pdf(pdf_file)
|
| 26 |
-
doc_name = pdf_file.
|
| 27 |
|
| 28 |
# Create a DataFrame from the extracted data
|
| 29 |
df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
|
|
@@ -50,7 +50,7 @@ with gr.Blocks() as demo:
|
|
| 50 |
gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
|
| 51 |
|
| 52 |
with gr.Row():
|
| 53 |
-
pdf_file = gr.File(label="Upload PDF", type="
|
| 54 |
|
| 55 |
with gr.Row():
|
| 56 |
batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
|
|
@@ -62,7 +62,7 @@ with gr.Blocks() as demo:
|
|
| 62 |
with gr.Row():
|
| 63 |
output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
|
| 64 |
download_button = gr.File(label="Download Extracted CSV")
|
| 65 |
-
|
| 66 |
@spaces.GPU
|
| 67 |
def on_extract(pdf_file, batch_size, wait_time):
|
| 68 |
"""Callback function to extract text, display batches, and save CSV."""
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
import time
|
| 4 |
+
from langchain_community.document_loaders import UnstructuredFileLoader # Updated import
|
| 5 |
import spaces
|
| 6 |
|
| 7 |
def extract_text_with_langchain_pdf(pdf_file):
|
| 8 |
"""Extract text from a PDF using LangChain's UnstructuredFileLoader."""
|
| 9 |
+
loader = UnstructuredFileLoader(pdf_file) # Pass the filepath directly
|
| 10 |
documents = loader.load()
|
| 11 |
|
| 12 |
# Collect text per page and return as a list of tuples (page_num, paragraph)
|
|
|
|
| 23 |
def process_pdf_with_batches(pdf_file, batch_size, wait_time):
|
| 24 |
"""Extract text, split into batches, and store in a DataFrame."""
|
| 25 |
extracted_data = extract_text_with_langchain_pdf(pdf_file)
|
| 26 |
+
doc_name = pdf_file.split("/")[-1]
|
| 27 |
|
| 28 |
# Create a DataFrame from the extracted data
|
| 29 |
df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
|
|
|
|
| 50 |
gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
|
| 51 |
|
| 52 |
with gr.Row():
|
| 53 |
+
pdf_file = gr.File(label="Upload PDF", type="filepath") # Updated type to 'filepath'
|
| 54 |
|
| 55 |
with gr.Row():
|
| 56 |
batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
|
|
|
|
| 62 |
with gr.Row():
|
| 63 |
output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
|
| 64 |
download_button = gr.File(label="Download Extracted CSV")
|
| 65 |
+
|
| 66 |
@spaces.GPU
|
| 67 |
def on_extract(pdf_file, batch_size, wait_time):
|
| 68 |
"""Callback function to extract text, display batches, and save CSV."""
|