sadickam committed on
Commit
7ae30ee
·
verified ·
1 Parent(s): 7f934fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -10
app.py CHANGED
@@ -1,8 +1,13 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import io
 
 
4
  from langchain_community.document_loaders import UnstructuredFileLoader
5
 
 
 
 
6
  def extract_text_with_langchain_pdf(pdf_file_path):
7
  """
8
  Extract text from a PDF page by page using LangChain's UnstructuredFileLoader.
@@ -18,7 +23,7 @@ def extract_text_with_langchain_pdf(pdf_file_path):
18
  documents = loader.load()
19
 
20
  extracted_data = []
21
- doc_name = pdf_file_path.split("/")[-1] # Extract document name
22
 
23
  # Concatenate all page contents into a single string
24
  pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
@@ -80,12 +85,12 @@ def text_to_txt_bytes(text):
80
  def on_extract(pdf_file_path):
81
  """
82
  Callback function to extract text from PDF and return CSV and TXT data.
83
-
84
  Args:
85
  pdf_file_path (str): The file path to the uploaded PDF.
86
-
87
  Returns:
88
- tuple: CSV download object, TXT download object, Status message.
89
  """
90
  if not pdf_file_path:
91
  return None, None, "No file uploaded."
@@ -96,16 +101,28 @@ def on_extract(pdf_file_path):
96
 
97
  # Convert DataFrame to CSV bytes
98
  csv_bytes = df_to_csv_bytes(df)
99
- csv_filename = f"{pdf_file_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}_extracted.csv"
100
 
101
  # Convert full text to TXT bytes
102
  txt_bytes = text_to_txt_bytes(full_text)
103
- txt_filename = f"{pdf_file_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}_full_text.txt"
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- # Return CSV and TXT files along with a success message
106
  return (
107
- (csv_bytes, csv_filename),
108
- (txt_bytes, txt_filename),
109
  "Extraction successful!"
110
  )
111
  except Exception as e:
@@ -118,7 +135,7 @@ with gr.Blocks() as demo:
118
  pdf_input = gr.File(
119
  label="Upload PDF",
120
  file_types=[".pdf"],
121
- type="filepath", # Using "filepath" as per Gradio's valid options
122
  interactive=True
123
  )
124
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import io
4
+ import tempfile
5
+ import os
6
  from langchain_community.document_loaders import UnstructuredFileLoader
7
 
8
+ # Create a temporary directory for storing download files
9
+ temp_dir = tempfile.TemporaryDirectory()
10
+
11
  def extract_text_with_langchain_pdf(pdf_file_path):
12
  """
13
  Extract text from a PDF page by page using LangChain's UnstructuredFileLoader.
 
23
  documents = loader.load()
24
 
25
  extracted_data = []
26
+ doc_name = os.path.basename(pdf_file_path) # Extract document name
27
 
28
  # Concatenate all page contents into a single string
29
  pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
 
85
  def on_extract(pdf_file_path):
86
  """
87
  Callback function to extract text from PDF and return CSV and TXT data.
88
+
89
  Args:
90
  pdf_file_path (str): The file path to the uploaded PDF.
91
+
92
  Returns:
93
+ tuple: Paths to CSV and TXT files, Status message.
94
  """
95
  if not pdf_file_path:
96
  return None, None, "No file uploaded."
 
101
 
102
  # Convert DataFrame to CSV bytes
103
  csv_bytes = df_to_csv_bytes(df)
104
+ csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_extracted.csv"
105
 
106
  # Convert full text to TXT bytes
107
  txt_bytes = text_to_txt_bytes(full_text)
108
+ txt_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_full_text.txt"
109
+
110
+ # Define full paths within the temporary directory
111
+ csv_tmp_path = os.path.join(temp_dir.name, csv_filename)
112
+ txt_tmp_path = os.path.join(temp_dir.name, txt_filename)
113
+
114
+ # Write CSV bytes to temporary file
115
+ with open(csv_tmp_path, 'wb') as csv_tmp:
116
+ csv_tmp.write(csv_bytes)
117
+
118
+ # Write TXT bytes to temporary file
119
+ with open(txt_tmp_path, 'wb') as txt_tmp:
120
+ txt_tmp.write(txt_bytes)
121
 
122
+ # Return the paths to the temporary files and a success message
123
  return (
124
+ csv_tmp_path,
125
+ txt_tmp_path,
126
  "Extraction successful!"
127
  )
128
  except Exception as e:
 
135
  pdf_input = gr.File(
136
  label="Upload PDF",
137
  file_types=[".pdf"],
138
+ type="filepath", # Ensure type is set to "filepath"
139
  interactive=True
140
  )
141