sadickam committed on
Commit
3e9e19b
·
verified ·
1 Parent(s): c6d5ac7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import time
4
- from langchain.document_loaders import UnstructuredFileLoader
5
  import spaces
6
 
7
  def extract_text_with_langchain_pdf(pdf_file):
8
  """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
9
- loader = UnstructuredFileLoader(pdf_file.name)
10
  documents = loader.load()
11
 
12
  # Collect text per page and return as a list of tuples (page_num, paragraph)
@@ -23,7 +23,7 @@ def extract_text_with_langchain_pdf(pdf_file):
23
  def process_pdf_with_batches(pdf_file, batch_size, wait_time):
24
  """Extract text, split into batches, and store in a DataFrame."""
25
  extracted_data = extract_text_with_langchain_pdf(pdf_file)
26
- doc_name = pdf_file.name.split("/")[-1]
27
 
28
  # Create a DataFrame from the extracted data
29
  df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
@@ -50,7 +50,7 @@ with gr.Blocks() as demo:
50
  gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
51
 
52
  with gr.Row():
53
- pdf_file = gr.File(label="Upload PDF", type="file")
54
 
55
  with gr.Row():
56
  batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
@@ -62,7 +62,7 @@ with gr.Blocks() as demo:
62
  with gr.Row():
63
  output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
64
  download_button = gr.File(label="Download Extracted CSV")
65
-
66
  @spaces.GPU
67
  def on_extract(pdf_file, batch_size, wait_time):
68
  """Callback function to extract text, display batches, and save CSV."""
 
1
  import gradio as gr
2
  import pandas as pd
3
  import time
4
+ from langchain_community.document_loaders import UnstructuredFileLoader # Updated import
5
  import spaces
6
 
7
  def extract_text_with_langchain_pdf(pdf_file):
8
  """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
9
+ loader = UnstructuredFileLoader(pdf_file) # Pass the filepath directly
10
  documents = loader.load()
11
 
12
  # Collect text per page and return as a list of tuples (page_num, paragraph)
 
23
  def process_pdf_with_batches(pdf_file, batch_size, wait_time):
24
  """Extract text, split into batches, and store in a DataFrame."""
25
  extracted_data = extract_text_with_langchain_pdf(pdf_file)
26
+ doc_name = pdf_file.split("/")[-1]
27
 
28
  # Create a DataFrame from the extracted data
29
  df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
 
50
  gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
51
 
52
  with gr.Row():
53
+ pdf_file = gr.File(label="Upload PDF", type="filepath") # Updated type to 'filepath'
54
 
55
  with gr.Row():
56
  batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
 
62
  with gr.Row():
63
  output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
64
  download_button = gr.File(label="Download Extracted CSV")
65
+
66
  @spaces.GPU
67
  def on_extract(pdf_file, batch_size, wait_time):
68
  """Callback function to extract text, display batches, and save CSV."""