sadickam committed on
Commit
9847598
·
verified ·
1 Parent(s): 542ad54

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -18
app.py CHANGED
@@ -3,14 +3,14 @@ import pandas as pd
3
  import io
4
  import tempfile
5
  import os
6
- from langchain_community.document_loaders import UnstructuredFileLoader
7
 
8
  # Create a temporary directory for storing download files
9
  temp_dir = tempfile.TemporaryDirectory()
10
 
11
  def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=None):
12
  """
13
- Extract text from a PDF page by page using LangChain's UnstructuredFileLoader.
14
 
15
  Args:
16
  pdf_file_path (str): The file path to the uploaded PDF.
@@ -21,7 +21,8 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
21
  tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
22
  """
23
  try:
24
- loader = UnstructuredFileLoader(pdf_file_path)
 
25
  documents = loader.load()
26
 
27
  total_pages = len(documents)
@@ -29,23 +30,35 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
29
 
30
  # Validate and adjust page range
31
  if start_page is not None and end_page is not None:
 
 
 
 
 
32
  if start_page < 1:
33
  start_page = 1
34
  if end_page > total_pages:
35
  end_page = total_pages
36
  if start_page > end_page:
37
  start_page, end_page = end_page, start_page # Swap if out of order
 
 
38
  selected_docs = documents[start_page - 1:end_page]
39
  else:
40
- selected_docs = documents # Extract all pages
 
 
41
 
42
  # Concatenate selected page contents into a single string
43
  pdf_pages_content = '\n'.join(doc.page_content for doc in selected_docs)
44
 
45
  extracted_data = []
46
 
47
- for idx, doc in enumerate(selected_docs, start=1): # Page numbering starts at 1
48
- page_num = idx # Assigning sequential page numbers based on selection
 
 
 
49
  paragraphs = doc.page_content.split("\n\n") # Split into paragraphs
50
 
51
  for paragraph in paragraphs:
@@ -105,8 +118,8 @@ def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
105
  Args:
106
  pdf_file_path (str): The file path to the uploaded PDF.
107
  extraction_mode (str): "All Pages" or "Range of Pages".
108
- start_page (int): Starting page number for extraction.
109
- end_page (int): Ending page number for extraction.
110
 
111
  Returns:
112
  tuple: Paths to CSV and TXT files, Status message.
@@ -169,7 +182,7 @@ with gr.Blocks() as demo:
169
  type="filepath", # Ensure type is set to "filepath"
170
  interactive=True
171
  )
172
-
173
  with gr.Row():
174
  extraction_mode = gr.Radio(
175
  label="Extraction Mode",
@@ -177,7 +190,7 @@ with gr.Blocks() as demo:
177
  value="All Pages",
178
  interactive=True
179
  )
180
-
181
  with gr.Row():
182
  start_page = gr.Number(
183
  label="Start Page",
@@ -193,18 +206,20 @@ with gr.Blocks() as demo:
193
  interactive=True,
194
  visible=False # Initially hidden
195
  )
196
-
197
  # Toggle visibility of start_page and end_page based on extraction_mode
198
  extraction_mode.change(
199
- fn=lambda mode: (gr.update(visible=(mode == "Range of Pages")),
200
- gr.update(visible=(mode == "Range of Pages"))),
 
 
201
  inputs=[extraction_mode],
202
  outputs=[start_page, end_page]
203
  )
204
-
205
  with gr.Row():
206
  extract_button = gr.Button("Extract and Download")
207
-
208
  with gr.Row():
209
  csv_download = gr.File(
210
  label="Download Extracted CSV",
@@ -214,20 +229,20 @@ with gr.Blocks() as demo:
214
  label="Download Full Text",
215
  interactive=False
216
  )
217
-
218
  with gr.Row():
219
  status_output = gr.Textbox(
220
  label="Status",
221
  interactive=False,
222
  lines=2
223
  )
224
-
225
  extract_button.click(
226
  fn=on_extract,
227
  inputs=[pdf_input, extraction_mode, start_page, end_page],
228
  outputs=[csv_download, txt_download, status_output]
229
  )
230
-
231
  gr.Markdown("""
232
  ---
233
  Developed with ❤️ using Gradio and LangChain.
 
3
  import io
4
  import tempfile
5
  import os
6
+ from langchain_community.document_loaders import UnstructuredPDFLoader
7
 
8
  # Create a temporary directory for storing download files
9
  temp_dir = tempfile.TemporaryDirectory()
10
 
11
  def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=None):
12
  """
13
+ Extract text from a PDF page by page using LangChain's UnstructuredPDFLoader.
14
 
15
  Args:
16
  pdf_file_path (str): The file path to the uploaded PDF.
 
21
  tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
22
  """
23
  try:
24
+ # Initialize the loader with split_pages=True to ensure each page is a separate document
25
+ loader = UnstructuredPDFLoader(pdf_file_path, split_pages=True)
26
  documents = loader.load()
27
 
28
  total_pages = len(documents)
 
30
 
31
  # Validate and adjust page range
32
  if start_page is not None and end_page is not None:
33
+ # Convert to integers to avoid slicing issues
34
+ start_page = int(start_page)
35
+ end_page = int(end_page)
36
+
37
+ # Adjust to valid range
38
  if start_page < 1:
39
  start_page = 1
40
  if end_page > total_pages:
41
  end_page = total_pages
42
  if start_page > end_page:
43
  start_page, end_page = end_page, start_page # Swap if out of order
44
+
45
+ # Select the subset of documents based on user input
46
  selected_docs = documents[start_page - 1:end_page]
47
  else:
48
+ selected_docs = documents
49
+ start_page = 1
50
+ end_page = total_pages
51
 
52
  # Concatenate selected page contents into a single string
53
  pdf_pages_content = '\n'.join(doc.page_content for doc in selected_docs)
54
 
55
  extracted_data = []
56
 
57
+ for idx, doc in enumerate(selected_docs, start=1):
58
+ # Assign the actual page number
59
+ page_num = start_page + idx - 1
60
+
61
+ # Split content into paragraphs
62
  paragraphs = doc.page_content.split("\n\n") # Split into paragraphs
63
 
64
  for paragraph in paragraphs:
 
118
  Args:
119
  pdf_file_path (str): The file path to the uploaded PDF.
120
  extraction_mode (str): "All Pages" or "Range of Pages".
121
+ start_page (float): Starting page number for extraction.
122
+ end_page (float): Ending page number for extraction.
123
 
124
  Returns:
125
  tuple: Paths to CSV and TXT files, Status message.
 
182
  type="filepath", # Ensure type is set to "filepath"
183
  interactive=True
184
  )
185
+
186
  with gr.Row():
187
  extraction_mode = gr.Radio(
188
  label="Extraction Mode",
 
190
  value="All Pages",
191
  interactive=True
192
  )
193
+
194
  with gr.Row():
195
  start_page = gr.Number(
196
  label="Start Page",
 
206
  interactive=True,
207
  visible=False # Initially hidden
208
  )
209
+
210
  # Toggle visibility of start_page and end_page based on extraction_mode
211
  extraction_mode.change(
212
+ fn=lambda mode: (
213
+ gr.update(visible=(mode == "Range of Pages")),
214
+ gr.update(visible=(mode == "Range of Pages"))
215
+ ),
216
  inputs=[extraction_mode],
217
  outputs=[start_page, end_page]
218
  )
219
+
220
  with gr.Row():
221
  extract_button = gr.Button("Extract and Download")
222
+
223
  with gr.Row():
224
  csv_download = gr.File(
225
  label="Download Extracted CSV",
 
229
  label="Download Full Text",
230
  interactive=False
231
  )
232
+
233
  with gr.Row():
234
  status_output = gr.Textbox(
235
  label="Status",
236
  interactive=False,
237
  lines=2
238
  )
239
+
240
  extract_button.click(
241
  fn=on_extract,
242
  inputs=[pdf_input, extraction_mode, start_page, end_page],
243
  outputs=[csv_download, txt_download, status_output]
244
  )
245
+
246
  gr.Markdown("""
247
  ---
248
  Developed with ❤️ using Gradio and LangChain.