sadickam committed on
Commit
c0ce244
·
verified ·
1 Parent(s): de88355

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -19
app.py CHANGED
@@ -1,45 +1,58 @@
1
  import gradio as gr
 
2
  from langchain_community.document_loaders import UnstructuredFileLoader
3
 
def extract_text_with_langchain_pdf(pdf_file):
    """Extract the text of a PDF as one string with a banner per document.

    Args:
        pdf_file: Filesystem path of the PDF, handed straight to
            UnstructuredFileLoader.

    Returns:
        A single string in which the stripped content of every document
        returned by the loader is preceded by a "--- Page N ---" banner.
        N is the document's "page_number" metadata entry, or "Unknown"
        when that entry is missing. An empty string is returned when the
        loader yields no documents.
    """
    loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
    documents = loader.load()

    # Collect the sections and join once instead of growing a string with
    # "+=" inside the loop, which can degrade to quadratic time.
    sections = []
    for doc in documents:
        page_num = doc.metadata.get("page_number", "Unknown")
        sections.append(
            f"\n\n--- Page {page_num} ---\n{doc.page_content.strip()}\n"
        )
    return "".join(sections)
16
-
def save_text_to_file(text, output_filename="extracted_content.txt"):
    """Write ``text`` to ``output_filename`` as UTF-8 and return that path.

    Args:
        text: The string to store.
        output_filename: Destination path; an existing file is overwritten.

    Returns:
        The ``output_filename`` that was written.
    """
    with open(output_filename, mode="w", encoding="utf-8") as sink:
        sink.write(text)

    return output_filename
22
 
def on_extract(pdf_file):
    """Gradio callback: turn the uploaded PDF into a downloadable .txt file.

    Args:
        pdf_file: Path of the uploaded PDF, as supplied by the upload widget.

    Returns:
        The path of the text file holding the extracted content.
    """
    return save_text_to_file(extract_text_with_langchain_pdf(pdf_file))


# Layout: title, upload widget, trigger button and download slot, one per row.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# PDF Text Extractor with Page Numbers")

    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="filepath")

    with gr.Row():
        extract_button = gr.Button("Extract and Download Text")

    with gr.Row():
        download_button = gr.File(label="Download Extracted Text")

    # Clicking the button runs the extraction and fills the download slot.
    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])

# Enable request queuing and start the Gradio app.
demo.queue().launch()
 
1
  import gradio as gr
2
+ import pandas as pd
3
  from langchain_community.document_loaders import UnstructuredFileLoader
4
 
def extract_text_with_langchain_pdf(pdf_file):
    """Extract a PDF's text into a DataFrame with one row per paragraph.

    Args:
        pdf_file: Filesystem path of the PDF, handed straight to
            UnstructuredFileLoader.

    Returns:
        A pandas DataFrame with the columns "Document", "Page" and
        "Paragraph". "Document" is the file name taken from ``pdf_file``,
        "Page" is the loaded document's "page_number" metadata entry (or
        "Unknown" when it is missing), and "Paragraph" is one non-empty,
        stripped chunk of the content split on blank lines. The three
        columns are present even when nothing was extracted, so the
        exported CSV always has a header row.
    """
    # Local import keeps this block self-contained.
    import os

    loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
    documents = loader.load()

    # NOTE(review): the loader is built with its default settings; whether the
    # returned documents carry a "page_number" metadata entry depends on the
    # loader's mode. Confirm, otherwise every row falls back to "Unknown".

    # basename understands the platform's path separators, unlike a manual
    # split on "/" which leaves the directories in place on Windows paths.
    doc_name = os.path.basename(pdf_file)

    extracted_data = []
    for doc in documents:
        page_num = doc.metadata.get("page_number", "Unknown")
        for paragraph in doc.page_content.split("\n\n"):
            cleaned = paragraph.strip()
            if not cleaned:
                continue  # Skip empty paragraphs
            extracted_data.append(
                {
                    "Document": doc_name,
                    "Page": page_num,
                    "Paragraph": cleaned,
                }
            )

    # Fixing the columns keeps the schema stable for an empty result.
    return pd.DataFrame(extracted_data, columns=["Document", "Page", "Paragraph"])
30
+
def save_df_to_csv(df, output_filename="extracted_content.csv"):
    """Write ``df`` to ``output_filename`` as CSV and return that path.

    Args:
        df: The DataFrame to export; its index is not written.
        output_filename: Destination path of the CSV file.

    Returns:
        The ``output_filename`` that was written.
    """
    df.to_csv(path_or_buf=output_filename, index=False)

    return output_filename
35
 
def on_extract(pdf_file):
    """Gradio callback: turn the uploaded PDF into a downloadable CSV file.

    Args:
        pdf_file: Path of the uploaded PDF, as supplied by the upload widget.

    Returns:
        The path of the CSV file holding the extracted paragraphs.
    """
    return save_df_to_csv(extract_text_with_langchain_pdf(pdf_file))


# Layout: title, upload widget, trigger button and download slot, one per row.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")

    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="filepath")

    with gr.Row():
        extract_button = gr.Button("Extract and Download CSV")

    with gr.Row():
        download_button = gr.File(label="Download Extracted CSV")

    # Clicking the button runs the extraction and fills the download slot.
    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])

# Enable request queuing and start the Gradio app.
demo.queue().launch()