bluenevus committed on
Commit
373964e
·
verified ·
1 Parent(s): decbda0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -53
app.py CHANGED
@@ -8,113 +8,178 @@ import tempfile
8
  import shutil
9
  import gradio as gr
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def download_pdfs_from_page(url, progress=gr.Progress()):
12
  """
13
- Download all PDFs from a webpage and return as a zip file.
14
 
15
  Args:
16
- url: The webpage URL to scrape
17
  progress: Gradio progress tracker
18
 
19
  Returns:
20
  tuple of (zip_file_path, summary_message)
21
  """
22
 
23
- # Set headers to mimic a browser request
24
  headers = {
25
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
26
  }
27
 
28
  try:
29
- # Fetch the webpage
30
- progress(0, desc="Fetching webpage...")
31
- response = requests.get(url, headers=headers, timeout=30)
32
- response.raise_for_status()
33
 
34
- # Parse HTML
35
- soup = BeautifulSoup(response.content, 'html.parser')
36
 
37
- # Find all links
38
- all_links = soup.find_all('a', href=True)
39
 
40
- # Filter for PDF links (including those with query parameters)
41
- pdf_links = []
42
- for link in all_links:
43
- href = link['href']
44
- if '.pdf' in href.lower():
45
- full_url = urljoin(url, href)
46
- pdf_links.append(full_url)
 
 
 
 
47
 
48
- if len(pdf_links) == 0:
49
- return None, "❌ No PDF links found on the page."
50
 
51
- progress(0.1, desc=f"Found {len(pdf_links)} PDF links")
 
52
 
53
- # Create temporary directory for downloads
 
 
54
  temp_dir = tempfile.mkdtemp()
55
 
56
- # Download each PDF
57
  successful = 0
58
  failed = 0
59
  failed_urls = []
60
 
61
- for idx, pdf_url in enumerate(pdf_links, 1):
62
  try:
63
- # Extract filename from URL (remove query parameters)
64
  parsed_url = urlparse(pdf_url)
65
  path_without_query = parsed_url.path
66
  filename = os.path.basename(path_without_query)
67
 
68
- # Create full file path in temp directory
 
 
 
69
  filepath = os.path.join(temp_dir, filename)
70
 
71
  # Skip if file already exists
72
  if os.path.exists(filepath):
73
- progress((0.1 + (0.8 * idx / len(pdf_links))),
74
- desc=f"[{idx}/{len(pdf_links)}] Skipping (already exists): {filename}")
75
  successful += 1
76
  continue
77
 
78
- # Update progress
79
- progress((0.1 + (0.8 * idx / len(pdf_links))),
80
- desc=f"[{idx}/{len(pdf_links)}] Downloading: {filename}")
81
 
82
  # Download PDF
83
  pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
84
  pdf_response.raise_for_status()
85
 
 
 
 
 
 
 
86
  # Save PDF
87
  with open(filepath, 'wb') as f:
88
  f.write(pdf_response.content)
89
 
90
  successful += 1
91
-
92
- # Be polite - add a small delay between downloads
93
- time.sleep(1)
94
 
95
  except Exception as e:
96
  failed += 1
97
  failed_urls.append(f"{filename}: {str(e)}")
98
  continue
99
 
100
- # Generate summary message
101
  summary = f"""
102
  ✅ **Download Complete!**
103
 
104
  📊 **Summary:**
105
- - Total PDFs found: {len(pdf_links)}
 
106
  - Successfully downloaded: {successful}
107
  - Failed: {failed}
108
  """
109
 
110
  if failed > 0:
111
  summary += f"\n\n⚠️ **Failed Downloads:**\n"
112
- for fail in failed_urls[:10]: # Show first 10 failures
113
  summary += f"- {fail}\n"
114
  if len(failed_urls) > 10:
115
  summary += f"- ... and {len(failed_urls) - 10} more\n"
116
 
117
- # Create zip file
118
  progress(0.9, desc="Creating zip file...")
119
 
120
  zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
@@ -124,7 +189,7 @@ def download_pdfs_from_page(url, progress=gr.Progress()):
124
  file_path = os.path.join(root, file)
125
  zipf.write(file_path, arcname=file)
126
 
127
- # Clean up temp directory
128
  shutil.rmtree(temp_dir)
129
 
130
  progress(1.0, desc="Complete!")
@@ -140,22 +205,22 @@ def create_interface():
140
  with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
141
  gr.Markdown(
142
  """
143
- # 📥 PDF Downloader
144
- Download all PDFs from any webpage as a ZIP file!
145
 
146
  **Instructions:**
147
- 1. Enter the URL of the webpage containing PDF links
148
  2. Click "Download PDFs"
149
- 3. Wait for the download to complete
150
- 4. Download your ZIP file
151
  """
152
  )
153
 
154
  with gr.Row():
155
  with gr.Column():
156
  url_input = gr.Textbox(
157
- label="Webpage URL",
158
- placeholder="https://example.com/pdfs",
159
  lines=1
160
  )
161
 
@@ -166,7 +231,6 @@ def create_interface():
166
  output_file = gr.File(label="Download ZIP")
167
  summary_output = gr.Markdown(label="Summary")
168
 
169
- # Handle download button click
170
  download_btn.click(
171
  fn=download_pdfs_from_page,
172
  inputs=[url_input],
@@ -176,17 +240,17 @@ def create_interface():
176
  gr.Markdown(
177
  """
178
  ---
179
- ### 💡 Tips:
180
- - The script will find all PDF links on the page, including those with query parameters
181
- - Downloads include a 1-second delay between requests to be respectful to servers
182
- - ZIP files are automatically named with a timestamp
183
- - All PDFs are packaged into a single downloadable ZIP file
 
184
  """
185
  )
186
 
187
  return demo
188
 
189
- # Launch the interface
190
  if __name__ == "__main__":
191
  demo = create_interface()
192
  demo.launch(share=True)
 
8
  import shutil
9
  import gradio as gr
10
 
11
def extract_detail_page_links(url, headers):
    """
    Collect the detail-page URLs linked from a listing page.

    Args:
        url: Listing page URL; also used as the base when resolving
            relative hrefs
        headers: HTTP headers sent with the request

    Returns:
        list of absolute detail page URLs, de-duplicated, in the order
        they first appear on the page

    Note:
        Errors from requests.get / raise_for_status are not caught here;
        they propagate to the caller.
    """
    page = requests.get(url, headers=headers, timeout=30)
    page.raise_for_status()
    anchors = BeautifulSoup(page.content, 'html.parser').find_all('a', href=True)

    # Substrings that mark an href as a detail page (adjust as needed
    # for the site being scraped).
    markers = ('Details.aspx', 'PUB_ID=')
    candidates = (
        urljoin(url, anchor['href'])
        for anchor in anchors
        if any(marker in anchor['href'] for marker in markers)
    )

    # dict.fromkeys keeps the first occurrence of each URL and preserves
    # insertion order, so duplicates are dropped without reordering.
    return list(dict.fromkeys(candidates))
36
+
37
def extract_pdf_links_from_page(url, headers):
    """
    Collect the PDF URLs linked from a single page.

    Best-effort: any exception raised while fetching or parsing the page
    is printed and an empty list is returned instead of propagating.

    Args:
        url: Page URL to scrape; also used as the base when resolving
            relative hrefs
        headers: HTTP headers sent with the request

    Returns:
        list of absolute PDF URLs, de-duplicated, in the order they first
        appear on the page (empty on any error)
    """
    try:
        page = requests.get(url, headers=headers, timeout=30)
        page.raise_for_status()
        anchors = BeautifulSoup(page.content, 'html.parser').find_all('a', href=True)

        # Substring match (not endswith) so links that carry query
        # parameters after ".pdf" are still picked up.
        candidates = (
            urljoin(url, anchor['href'])
            for anchor in anchors
            if '.pdf' in anchor['href'].lower()
        )

        # dict.fromkeys keeps the first occurrence of each URL and
        # preserves insertion order.
        return list(dict.fromkeys(candidates))
    except Exception as e:
        print(f"Error extracting PDFs from {url}: {str(e)}")
        return []
65
+
66
  def download_pdfs_from_page(url, progress=gr.Progress()):
67
  """
68
+ Download all PDFs from a webpage by navigating through detail pages.
69
 
70
  Args:
71
+ url: The main webpage URL to scrape
72
  progress: Gradio progress tracker
73
 
74
  Returns:
75
  tuple of (zip_file_path, summary_message)
76
  """
77
 
 
78
  headers = {
79
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
80
  }
81
 
82
  try:
83
+ # Step 1: Extract detail page links from main page
84
+ progress(0, desc="Fetching main page and extracting detail links...")
85
+ detail_page_links = extract_detail_page_links(url, headers)
 
86
 
87
+ if len(detail_page_links) == 0:
88
+ return None, "❌ No detail page links found on the main page."
89
 
90
+ progress(0.1, desc=f"Found {len(detail_page_links)} detail pages to process")
 
91
 
92
+ # Step 2: Visit each detail page and collect PDF links
93
+ all_pdf_links = []
94
+ for idx, detail_url in enumerate(detail_page_links, 1):
95
+ progress(0.1 + (0.3 * idx / len(detail_page_links)),
96
+ desc=f"[{idx}/{len(detail_page_links)}] Scanning detail page...")
97
+
98
+ pdf_links = extract_pdf_links_from_page(detail_url, headers)
99
+ all_pdf_links.extend(pdf_links)
100
+
101
+ # Be polite - small delay between page requests
102
+ time.sleep(0.5)
103
 
104
+ # Remove duplicates
105
+ all_pdf_links = list(set(all_pdf_links))
106
 
107
+ if len(all_pdf_links) == 0:
108
+ return None, f"❌ No PDF links found across {len(detail_page_links)} detail pages."
109
 
110
+ progress(0.4, desc=f"Found {len(all_pdf_links)} unique PDFs to download")
111
+
112
+ # Step 3: Create temporary directory for downloads
113
  temp_dir = tempfile.mkdtemp()
114
 
115
+ # Step 4: Download each PDF
116
  successful = 0
117
  failed = 0
118
  failed_urls = []
119
 
120
+ for idx, pdf_url in enumerate(all_pdf_links, 1):
121
  try:
 
122
  parsed_url = urlparse(pdf_url)
123
  path_without_query = parsed_url.path
124
  filename = os.path.basename(path_without_query)
125
 
126
+ # Handle empty filenames
127
+ if not filename or filename == '':
128
+ filename = f"document_{idx}.pdf"
129
+
130
  filepath = os.path.join(temp_dir, filename)
131
 
132
  # Skip if file already exists
133
  if os.path.exists(filepath):
134
+ progress(0.4 + (0.5 * idx / len(all_pdf_links)),
135
+ desc=f"[{idx}/{len(all_pdf_links)}] Skipping: {filename}")
136
  successful += 1
137
  continue
138
 
139
+ progress(0.4 + (0.5 * idx / len(all_pdf_links)),
140
+ desc=f"[{idx}/{len(all_pdf_links)}] Downloading: {filename}")
 
141
 
142
  # Download PDF
143
  pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
144
  pdf_response.raise_for_status()
145
 
146
+ # Verify it's actually a PDF
147
+ if pdf_response.headers.get('content-type', '').lower() not in ['application/pdf', 'application/octet-stream']:
148
+ failed += 1
149
+ failed_urls.append(f"{filename}: Not a valid PDF file")
150
+ continue
151
+
152
  # Save PDF
153
  with open(filepath, 'wb') as f:
154
  f.write(pdf_response.content)
155
 
156
  successful += 1
157
+ time.sleep(1) # Be polite
 
 
158
 
159
  except Exception as e:
160
  failed += 1
161
  failed_urls.append(f"{filename}: {str(e)}")
162
  continue
163
 
164
+ # Step 5: Generate summary
165
  summary = f"""
166
  ✅ **Download Complete!**
167
 
168
  📊 **Summary:**
169
+ - Detail pages scanned: {len(detail_page_links)}
170
+ - Total PDFs found: {len(all_pdf_links)}
171
  - Successfully downloaded: {successful}
172
  - Failed: {failed}
173
  """
174
 
175
  if failed > 0:
176
  summary += f"\n\n⚠️ **Failed Downloads:**\n"
177
+ for fail in failed_urls[:10]:
178
  summary += f"- {fail}\n"
179
  if len(failed_urls) > 10:
180
  summary += f"- ... and {len(failed_urls) - 10} more\n"
181
 
182
+ # Step 6: Create zip file
183
  progress(0.9, desc="Creating zip file...")
184
 
185
  zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
 
189
  file_path = os.path.join(root, file)
190
  zipf.write(file_path, arcname=file)
191
 
192
+ # Clean up
193
  shutil.rmtree(temp_dir)
194
 
195
  progress(1.0, desc="Complete!")
 
205
  with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
206
  gr.Markdown(
207
  """
208
+ # 📥 Two-Level PDF Downloader
209
+ Download all PDFs from webpages with intermediate detail pages!
210
 
211
  **Instructions:**
212
+ 1. Enter the URL of the main listing page
213
  2. Click "Download PDFs"
214
+ 3. The tool will navigate through all detail pages
215
+ 4. Download your ZIP file with all PDFs
216
  """
217
  )
218
 
219
  with gr.Row():
220
  with gr.Column():
221
  url_input = gr.Textbox(
222
+ label="Main Page URL",
223
+ placeholder="https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",
224
  lines=1
225
  )
226
 
 
231
  output_file = gr.File(label="Download ZIP")
232
  summary_output = gr.Markdown(label="Summary")
233
 
 
234
  download_btn.click(
235
  fn=download_pdfs_from_page,
236
  inputs=[url_input],
 
240
  gr.Markdown(
241
  """
242
  ---
243
+ ### 💡 Features:
244
+ - **Two-level navigation**: Scans main page visits detail pages downloads PDFs
245
+ - **Duplicate removal**: Ensures each PDF is downloaded only once
246
+ - **Polite scraping**: Includes delays between requests
247
+ - **Error handling**: Continues even if some downloads fail
248
+ - **Progress tracking**: Real-time updates on scanning and downloading
249
  """
250
  )
251
 
252
  return demo
253
 
 
254
  if __name__ == "__main__":
255
  demo = create_interface()
256
  demo.launch(share=True)