Spaces:

MicroHealth
/

Bulk-PDF-download

Runtime error

App Files Community

bluenevus committed on Oct 9, 2025

Commit

373964e

verified ·

1 Parent(s): decbda0

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -53

app.py CHANGED Viewed

@@ -8,113 +8,178 @@ import tempfile
 import shutil
 import gradio as gr
 def download_pdfs_from_page(url, progress=gr.Progress()):
     """
-    Download all PDFs from a webpage and return as a zip file.
     Args:
-        url: The webpage URL to scrape
         progress: Gradio progress tracker
     Returns:
         tuple of (zip_file_path, summary_message)
     """
-    # Set headers to mimic a browser request
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
     }
     try:
-        # Fetch the webpage
-        progress(0, desc="Fetching webpage...")
-        response = requests.get(url, headers=headers, timeout=30)
-        response.raise_for_status()
-        # Parse HTML
-        soup = BeautifulSoup(response.content, 'html.parser')
-        # Find all links
-        all_links = soup.find_all('a', href=True)
-        # Filter for PDF links (including those with query parameters)
-        pdf_links = []
-        for link in all_links:
-            href = link['href']
-            if '.pdf' in href.lower():
-                full_url = urljoin(url, href)
-                pdf_links.append(full_url)
-        if len(pdf_links) == 0:
-            return None, "❌ No PDF links found on the page."
-        progress(0.1, desc=f"Found {len(pdf_links)} PDF links")
-        # Create temporary directory for downloads
         temp_dir = tempfile.mkdtemp()
-        # Download each PDF
         successful = 0
         failed = 0
         failed_urls = []
-        for idx, pdf_url in enumerate(pdf_links, 1):
             try:
-                # Extract filename from URL (remove query parameters)
                 parsed_url = urlparse(pdf_url)
                 path_without_query = parsed_url.path
                 filename = os.path.basename(path_without_query)
-                # Create full file path in temp directory
                 filepath = os.path.join(temp_dir, filename)
                 # Skip if file already exists
                 if os.path.exists(filepath):
-                    progress((0.1 + (0.8 * idx / len(pdf_links))),
-                            desc=f"[{idx}/{len(pdf_links)}] Skipping (already exists): {filename}")
                     successful += 1
                     continue
-                # Update progress
-                progress((0.1 + (0.8 * idx / len(pdf_links))),
-                        desc=f"[{idx}/{len(pdf_links)}] Downloading: {filename}")
                 # Download PDF
                 pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                 pdf_response.raise_for_status()
                 # Save PDF
                 with open(filepath, 'wb') as f:
                     f.write(pdf_response.content)
                 successful += 1
-                # Be polite - add a small delay between downloads
-                time.sleep(1)
             except Exception as e:
                 failed += 1
                 failed_urls.append(f"{filename}: {str(e)}")
                 continue
-        # Generate summary message
         summary = f"""
 ✅ **Download Complete!**
 📊 **Summary:**
-- Total PDFs found: {len(pdf_links)}
 - Successfully downloaded: {successful}
 - Failed: {failed}
 """
         if failed > 0:
             summary += f"\n\n⚠️ **Failed Downloads:**\n"
-            for fail in failed_urls[:10]:  # Show first 10 failures
                 summary += f"- {fail}\n"
             if len(failed_urls) > 10:
                 summary += f"- ... and {len(failed_urls) - 10} more\n"
-        # Create zip file
         progress(0.9, desc="Creating zip file...")
         zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
@@ -124,7 +189,7 @@ def download_pdfs_from_page(url, progress=gr.Progress()):
                     file_path = os.path.join(root, file)
                     zipf.write(file_path, arcname=file)
-        # Clean up temp directory
         shutil.rmtree(temp_dir)
         progress(1.0, desc="Complete!")
@@ -140,22 +205,22 @@ def create_interface():
     with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
-            # 📥 PDF Downloader
-            Download all PDFs from any webpage as a ZIP file!
             **Instructions:**
-            1. Enter the URL of the webpage containing PDF links
             2. Click "Download PDFs"
-            3. Wait for the download to complete
-            4. Download your ZIP file
             """
         )
         with gr.Row():
             with gr.Column():
                 url_input = gr.Textbox(
-                    label="Webpage URL",
-                    placeholder="https://example.com/pdfs",
                     lines=1
                 )
@@ -166,7 +231,6 @@ def create_interface():
                 output_file = gr.File(label="Download ZIP")
                 summary_output = gr.Markdown(label="Summary")
-        # Handle download button click
         download_btn.click(
             fn=download_pdfs_from_page,
             inputs=[url_input],
@@ -176,17 +240,17 @@ def create_interface():
         gr.Markdown(
             """
             ---
-            ### 💡 Tips:
-            - The script will find all PDF links on the page, including those with query parameters
-            - Downloads include a 1-second delay between requests to be respectful to servers
-            - ZIP files are automatically named with a timestamp
-            - All PDFs are packaged into a single downloadable ZIP file
             """
         )
     return demo
-# Launch the interface
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(share=True)

 import shutil
 import gradio as gr
+def extract_detail_page_links(url, headers):
+    """
+    Extract all detail page links from the main listing page.
+    Fetches ``url`` with a 30-second timeout, parses the response body as
+    HTML and collects every ``<a href=...>`` whose href contains either
+    ``Details.aspx`` or ``PUB_ID=``. Each match is resolved against ``url``
+    and returned once, in first-seen order.
+    Args:
+        url: Main page URL
+        headers: Request headers
+    Returns:
+        list of detail page URLs
+    Raises:
+        Nothing is caught here: any exception raised by ``requests.get`` or
+        ``raise_for_status()`` propagates to the caller.
+    """
+    response = requests.get(url, headers=headers, timeout=30)
+    # Turn an HTTP error status into an exception instead of parsing an error page.
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, 'html.parser')
+    detail_links = []
+    # href=True restricts the search to anchors that actually carry an href.
+    for link in soup.find_all('a', href=True):
+        href = link['href']
+        # Look for detail page patterns (adjust pattern as needed)
+        # Plain substring match on the raw href; this test is case-sensitive.
+        if 'Details.aspx' in href or 'PUB_ID=' in href:
+            # Resolve relative hrefs against the listing page URL.
+            full_url = urljoin(url, href)
+            # Membership test on the list de-duplicates while keeping order.
+            if full_url not in detail_links:
+                detail_links.append(full_url)
+    return detail_links
+def extract_pdf_links_from_page(url, headers):
+    """
+    Extract PDF links from a single page.
+    Fetches ``url`` with a 30-second timeout, parses the response body as
+    HTML and collects every ``<a href=...>`` whose lower-cased href contains
+    ``.pdf`` anywhere in it. Each match is resolved against ``url`` and
+    returned once, in first-seen order.
+    This is best-effort: any exception raised while fetching or parsing is
+    caught, reported with ``print`` and turned into an empty list, so a
+    failure is indistinguishable from a page with no PDF links.
+    Args:
+        url: Page URL to scrape
+        headers: Request headers
+    Returns:
+        list of PDF URLs (empty if none were found or an error occurred)
+    """
+    try:
+        response = requests.get(url, headers=headers, timeout=30)
+        # Turn an HTTP error status into an exception handled by the except below.
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        pdf_links = []
+        # href=True restricts the search to anchors that actually carry an href.
+        for link in soup.find_all('a', href=True):
+            href = link['href']
+            # Case-insensitive substring test, so '.pdf' followed by a query
+            # string (or appearing anywhere else in the href) also matches.
+            if '.pdf' in href.lower():
+                # Resolve relative hrefs against the page URL.
+                full_url = urljoin(url, href)
+                # Membership test on the list de-duplicates while keeping order.
+                if full_url not in pdf_links:
+                    pdf_links.append(full_url)
+        return pdf_links
+    except Exception as e:
+        # Swallow the error so the caller can carry on with other pages.
+        print(f"Error extracting PDFs from {url}: {str(e)}")
+        return []
 def download_pdfs_from_page(url, progress=gr.Progress()):
     """
+    Download all PDFs from a webpage by navigating through detail pages.
     Args:
+        url: The main webpage URL to scrape
         progress: Gradio progress tracker
     Returns:
         tuple of (zip_file_path, summary_message)
     """
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
     }
     try:
+        # Step 1: Extract detail page links from main page
+        progress(0, desc="Fetching main page and extracting detail links...")
+        detail_page_links = extract_detail_page_links(url, headers)
+        if len(detail_page_links) == 0:
+            return None, "❌ No detail page links found on the main page."
+        progress(0.1, desc=f"Found {len(detail_page_links)} detail pages to process")
+        # Step 2: Visit each detail page and collect PDF links
+        all_pdf_links = []
+        for idx, detail_url in enumerate(detail_page_links, 1):
+            progress(0.1 + (0.3 * idx / len(detail_page_links)),
+                    desc=f"[{idx}/{len(detail_page_links)}] Scanning detail page...")
+            pdf_links = extract_pdf_links_from_page(detail_url, headers)
+            all_pdf_links.extend(pdf_links)
+            # Be polite - small delay between page requests
+            time.sleep(0.5)
+        # Remove duplicates
+        all_pdf_links = list(set(all_pdf_links))
+        if len(all_pdf_links) == 0:
+            return None, f"❌ No PDF links found across {len(detail_page_links)} detail pages."
+        progress(0.4, desc=f"Found {len(all_pdf_links)} unique PDFs to download")
+        # Step 3: Create temporary directory for downloads
         temp_dir = tempfile.mkdtemp()
+        # Step 4: Download each PDF
         successful = 0
         failed = 0
         failed_urls = []
+        for idx, pdf_url in enumerate(all_pdf_links, 1):
             try:
                 parsed_url = urlparse(pdf_url)
                 path_without_query = parsed_url.path
                 filename = os.path.basename(path_without_query)
+                # Handle empty filenames
+                if not filename or filename == '':
+                    filename = f"document_{idx}.pdf"
                 filepath = os.path.join(temp_dir, filename)
                 # Skip if file already exists
                 if os.path.exists(filepath):
+                    progress(0.4 + (0.5 * idx / len(all_pdf_links)),
+                            desc=f"[{idx}/{len(all_pdf_links)}] Skipping: {filename}")
                     successful += 1
                     continue
+                progress(0.4 + (0.5 * idx / len(all_pdf_links)),
+                        desc=f"[{idx}/{len(all_pdf_links)}] Downloading: {filename}")
                 # Download PDF
                 pdf_response = requests.get(pdf_url, headers=headers, timeout=60)
                 pdf_response.raise_for_status()
+                # Verify it's actually a PDF
+                if pdf_response.headers.get('content-type', '').lower() not in ['application/pdf', 'application/octet-stream']:
+                    failed += 1
+                    failed_urls.append(f"{filename}: Not a valid PDF file")
+                    continue
                 # Save PDF
                 with open(filepath, 'wb') as f:
                     f.write(pdf_response.content)
                 successful += 1
+                time.sleep(1)  # Be polite
             except Exception as e:
                 failed += 1
                 failed_urls.append(f"{filename}: {str(e)}")
                 continue
+        # Step 5: Generate summary
         summary = f"""
 ✅ **Download Complete!**
 📊 **Summary:**
+- Detail pages scanned: {len(detail_page_links)}
+- Total PDFs found: {len(all_pdf_links)}
 - Successfully downloaded: {successful}
 - Failed: {failed}
 """
         if failed > 0:
             summary += f"\n\n⚠️ **Failed Downloads:**\n"
+            for fail in failed_urls[:10]:
                 summary += f"- {fail}\n"
             if len(failed_urls) > 10:
                 summary += f"- ... and {len(failed_urls) - 10} more\n"
+        # Step 6: Create zip file
         progress(0.9, desc="Creating zip file...")
         zip_path = os.path.join(tempfile.gettempdir(), f"pdfs_{int(time.time())}.zip")
                     file_path = os.path.join(root, file)
                     zipf.write(file_path, arcname=file)
+        # Clean up
         shutil.rmtree(temp_dir)
         progress(1.0, desc="Complete!")
     with gr.Blocks(title="PDF Downloader", theme=gr.themes.Soft()) as demo:
         gr.Markdown(
             """
+            # 📥 Two-Level PDF Downloader
+            Download all PDFs from webpages with intermediate detail pages!
             **Instructions:**
+            1. Enter the URL of the main listing page
             2. Click "Download PDFs"
+            3. The tool will navigate through all detail pages
+            4. Download your ZIP file with all PDFs
             """
         )
         with gr.Row():
             with gr.Column():
                 url_input = gr.Textbox(
+                    label="Main Page URL",
+                    placeholder="https://armypubs.army.mil/ProductMaps/PubForm/AR.aspx",
                     lines=1
                 )
                 output_file = gr.File(label="Download ZIP")
                 summary_output = gr.Markdown(label="Summary")
         download_btn.click(
             fn=download_pdfs_from_page,
             inputs=[url_input],
         gr.Markdown(
             """
             ---
+            ### 💡 Features:
+            - **Two-level navigation**: Scans main page → visits detail pages → downloads PDFs
+            - **Duplicate removal**: Ensures each PDF is downloaded only once
+            - **Polite scraping**: Includes delays between requests
+            - **Error handling**: Continues even if some downloads fail
+            - **Progress tracking**: Real-time updates on scanning and downloading
             """
         )
     return demo
 if __name__ == "__main__":
     demo = create_interface()
     demo.launch(share=True)