sadickam committed on
Commit
ff0f9fc
·
verified ·
1 Parent(s): 24333e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -153
app.py CHANGED
@@ -1,191 +1,83 @@
1
  import gradio as gr
2
  import pandas as pd
 
3
  import os
4
  from langchain_community.document_loaders import UnstructuredPDFLoader
5
  from PyPDF2 import PdfReader
6
 
7
def extract_text_by_page(pdf_file_path, page_num):
    """
    Extract the paragraphs of a single PDF page as a list of row dictionaries.

    Parameters:
    - pdf_file_path (str): Path to the uploaded PDF file.
    - page_num (int): Page number to extract (1-based indexing).

    Returns:
    - list of dict: One row per non-empty paragraph, with the keys
      "Document", "Page" and "Paragraph". The list is empty when the loader
      yields no documents. If anything raises while loading or splitting,
      a row whose "Paragraph" carries the error text is appended instead of
      propagating, so one bad page does not abort a multi-page run.
    """
    import re  # local import keeps this function self-contained

    doc_name = os.path.basename(pdf_file_path)
    extracted_data = []

    try:
        # NOTE(review): `page_numbers` is passed through to the loader as an
        # extra keyword; confirm the installed loader really limits parsing
        # to this page instead of ignoring the argument.
        loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num - 1])  # Zero-based indexing
        documents = loader.load()

        if not documents:
            print(f"No content found on Page {page_num}.")
            return extracted_data  # Empty list

        # Concatenate all text from the page to preserve column integrity
        pdf_pages_content = '\n'.join(doc.page_content for doc in documents)

        # Paragraphs are separated by blank lines. A "blank" line may still
        # hold spaces, tabs or a carriage return, which a literal "\n\n"
        # split would miss and silently glue two paragraphs together.
        for para in re.split(r"\n\s*\n", pdf_pages_content):
            para = para.strip()
            if para:  # Skip empty paragraphs
                extracted_data.append({
                    "Document": doc_name,
                    "Page": page_num,
                    "Paragraph": para
                })

    except Exception as e:
        # Best-effort boundary: record the failure as a row for this page.
        print(f"Error processing Page {page_num}: {e}")
        extracted_data.append({
            "Document": doc_name,
            "Page": page_num,
            "Paragraph": f"Error extracting this page: {e}"
        })

    return extracted_data
51
 
def save_to_csv(data, output_filename="extracted_content.csv"):
    """
    Save extracted data to a CSV file.

    Parameters:
    - data (list of dict): Extracted rows; the dictionary keys become the
      CSV columns.
    - output_filename (str): Path of the CSV file to write. Missing parent
      directories are created first.

    Returns:
    - str: Path to the saved CSV file (``output_filename`` unchanged).
    """
    # Only path-like targets have a parent directory to prepare; anything
    # else is handed to pandas untouched.
    if isinstance(output_filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(output_filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df = pd.DataFrame(data)
    # Spell out the encoding so non-ASCII paragraph text round-trips the
    # same way regardless of platform defaults.
    df.to_csv(output_filename, index=False, encoding="utf-8")
    return output_filename
66
 
67
def extract_and_save(pdf_file, extraction_option, start_page, end_page):
    """
    Main function to extract text based on user options and save to CSV.

    Parameters:
    - pdf_file (File or str): Uploaded PDF, either as a path string or as an
      object exposing the path through ``.name``.
    - extraction_option (str): 'All Pages' or 'Page Range'.
    - start_page (int): Starting page number (if applicable).
    - end_page (int): Ending page number (if applicable).

    Returns:
    - tuple: (csv_path, message). ``csv_path`` is None whenever extraction
      did not produce a CSV; ``message`` then explains why.
    """
    if pdf_file is None:
        return None, "No file uploaded."

    # Accept both a plain path string and a wrapper object with `.name`.
    pdf_file_path = getattr(pdf_file, "name", pdf_file)

    # Initialize PDF reader to get total pages
    try:
        reader = PdfReader(pdf_file_path)
        total_pages = len(reader.pages)
    except Exception as e:
        return None, f"❌ Error reading PDF: {e}"
    if total_pages == 0:
        return None, "❌ The uploaded PDF has no pages."

    # Determine extraction parameters
    if extraction_option == "All Pages":
        pages_to_extract = list(range(1, total_pages + 1))
    else:
        # Validate start and end pages
        if start_page is None or end_page is None:
            return None, "❌ Please specify both start and end pages."
        if start_page < 1 or end_page > total_pages:
            return None, f"❌ Page range must be between 1 and {total_pages}."
        if start_page > end_page:
            return None, "❌ Start page cannot be greater than end page."
        pages_to_extract = list(range(int(start_page), int(end_page) + 1))

    extracted_data = []
    selected_count = len(pages_to_extract)

    try:
        # Report progress as position within the selection; pairing the
        # absolute page number with the selection size would print
        # nonsense such as "5/3" for a page range.
        for position, page_num in enumerate(pages_to_extract, start=1):
            print(f"Processing Page {page_num} ({position}/{selected_count})")
            extracted_data.extend(extract_text_by_page(pdf_file_path, page_num))
    except Exception as e:
        return None, f"❌ An error occurred during extraction: {e}"

    if not extracted_data:
        return None, "No text extracted from the specified pages."

    # Save to CSV
    try:
        csv_filename = "extracted_content.csv"
        csv_path = save_to_csv(extracted_data, csv_filename)
    except Exception as e:
        return None, f"❌ Error saving CSV: {e}"

    return csv_path, "✅ Extraction successful! Download your CSV file below."
128
 
# Gradio Interface
with gr.Blocks() as demo:
    # Introductory help text shown at the top of the page.
    gr.Markdown("""
    # 📄 PDF Text Extractor with Page Range Selection and CSV Export

    Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.

    ## How It Works

    1. **Upload PDF**: Select and upload your PDF file.

    2. **Choose Extraction Option**:
    - **All Pages**: Extract text from every page in the PDF.
    - **Page Range**: Specify the start and end pages to extract text from.

    3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.

    4. **Download**: Once complete, download the CSV file containing the extracted data.
    """)

    with gr.Row():
        pdf_input = gr.File(label="📁 Upload PDF", type="filepath")

    with gr.Row():
        extraction_option = gr.Radio(
            choices=["All Pages", "Page Range"],
            value="All Pages",
            label="Extraction Option"
        )

    # Both page inputs start hidden because "All Pages" is the default.
    with gr.Row():
        start_page = gr.Number(label="📝 Start Page", value=1, precision=0, visible=False)
        end_page = gr.Number(label="📝 End Page", value=1, precision=0, visible=False)

    def toggle_page_range(option):
        """Return visibility updates for the start/end inputs: shown only for "Page Range"."""
        show_inputs = option == "Page Range"
        return gr.update(visible=show_inputs), gr.update(visible=show_inputs)

    extraction_option.change(
        fn=toggle_page_range,
        inputs=[extraction_option],
        outputs=[start_page, end_page]
    )

    with gr.Row():
        extract_button = gr.Button("🟢 Extract and Download CSV")

    with gr.Row():
        download_csv = gr.File(label="📥 Download Extracted CSV")
        message = gr.Textbox(label="Message", interactive=False, lines=2)

    # The handler returns (csv_path, message), matching the two outputs.
    extract_button.click(
        fn=extract_and_save,
        inputs=[pdf_input, extraction_option, start_page, end_page],
        outputs=[download_csv, message],
        show_progress=False  # Progress tracking removed
    )

# Launch the Gradio app
demo.queue().launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import time
4
  import os
5
  from langchain_community.document_loaders import UnstructuredPDFLoader
6
  from PyPDF2 import PdfReader
7
 
8
def extract_text_by_page(pdf_file_path):
    """Extract text from each page of the PDF and return as a list of dictionaries.

    Parameters:
    - pdf_file_path (str): Path to the PDF file.

    Returns:
    - list of dict: One row per non-empty paragraph, with the keys
      "Document" (file base name), "Page" (1-based page number) and
      "Paragraph" (stripped paragraph text). Pages for which the loader
      yields no documents contribute no rows.
    """
    import re  # local import keeps this function self-contained

    # The reader is only needed for the page count.
    num_pages = len(PdfReader(pdf_file_path).pages)
    doc_name = os.path.basename(pdf_file_path)

    # Paragraphs are separated by blank lines. A "blank" line may still hold
    # spaces, tabs or a carriage return, which a literal "\n\n" split misses.
    paragraph_break = re.compile(r"\n\s*\n")

    extracted_data = []

    for page_num in range(1, num_pages + 1):
        print(f"Processing Page {page_num}...")
        # NOTE(review): `page_numbers` is passed through to the loader as an
        # extra keyword; confirm the installed loader really limits parsing
        # to this page instead of ignoring the argument.
        loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num - 1])  # Zero-based indexing
        documents = loader.load()

        if not documents:
            print(f"No content found on Page {page_num}.")
            continue

        for doc in documents:
            for para in paragraph_break.split(doc.page_content):
                para = para.strip()
                if para:  # Skip empty paragraphs
                    extracted_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Paragraph": para
                    })
        # No pause between pages: the work is local, so sleeping here would
        # only add one second of latency per page.

    return extracted_data
40
 
def save_to_csv(data, output_filename="extracted_content.csv"):
    """Save extracted data to a CSV file.

    Parameters:
    - data (list of dict): Extracted rows; the dictionary keys become the
      CSV columns.
    - output_filename (str): Path of the CSV file to write. Missing parent
      directories are created first.

    Returns:
    - str: Path to the saved CSV file (``output_filename`` unchanged).
    """
    # Only path-like targets have a parent directory to prepare; anything
    # else is handed to pandas untouched.
    if isinstance(output_filename, (str, os.PathLike)):
        target_dir = os.path.dirname(output_filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    frame = pd.DataFrame(data)
    # Spell out the encoding so non-ASCII paragraph text round-trips the
    # same way regardless of platform defaults.
    frame.to_csv(output_filename, index=False, encoding="utf-8")
    return output_filename
46
 
47
def extract_and_save(pdf_file):
    """Main function to extract text and save to CSV.

    Parameters:
    - pdf_file (File or str): Uploaded PDF, either as a path string or as an
      object exposing the path through ``.name``; None when nothing was
      uploaded.

    Returns:
    - str: Path to the CSV file containing the extracted paragraphs.

    Raises:
    - gr.Error: If no file was uploaded or no text could be extracted. The
      result of this function feeds a file output, so a plain message string
      would be mistaken for a file path; raising surfaces the message to the
      user instead.
    """
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # Accept both a plain path string and a wrapper object with `.name`.
    pdf_file_path = getattr(pdf_file, "name", pdf_file)

    # Extract text by page
    extracted_data = extract_text_by_page(pdf_file_path)

    if not extracted_data:
        raise gr.Error("No text extracted from the PDF.")

    # Save to CSV
    return save_to_csv(extracted_data)
62
 
# Gradio Interface
with gr.Blocks() as demo:
    # Page title.
    gr.Markdown("# PDF Text Extractor with Page Tracking and CSV Export")

    # Three stacked rows: upload, trigger, download.
    with gr.Row():
        pdf_input = gr.File(label="Upload PDF", type="filepath")
    with gr.Row():
        extract_button = gr.Button("Extract and Download CSV")
    with gr.Row():
        download_csv = gr.File(label="Download Extracted CSV")

    # Clicking the button runs the extraction on the uploaded file and
    # places the handler's return value in the download component.
    extract_button.click(fn=extract_and_save, inputs=pdf_input, outputs=download_csv)

# Launch the Gradio app
demo.queue().launch()