Spaces:

Mishal23
/

web-scrapper

Paused

App Files Community

Mishal23 committed on Feb 19, 2025

Commit

dcfc8fd

verified ·

1 Parent(s): 765a30b

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -46

app.py CHANGED Viewed

@@ -1,117 +1,105 @@
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
-import pdfkit
 import os
 import math
 # Function to extract all links from a website
 def extract_links(url):
     try:
         response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
         if response.status_code != 200:
             return f"Error: Unable to fetch page (Status Code {response.status_code})", []
         soup = BeautifulSoup(response.text, "html.parser")
         base_url = "/".join(url.split("/")[:3])  # Extract base domain
         links = []
         for a_tag in soup.find_all("a", href=True):
             href = a_tag["href"]
             if not href.startswith("http"):  # Convert relative links to absolute
                 href = base_url + href if href.startswith("/") else base_url + "/" + href
             links.append(href)
         links = list(set(links))  # Remove duplicates
         if not links:
             return "No links found on the website.", []
-        return f"✅ {len(links)} links found! Select which ones to convert into PDFs:", links
     except Exception as e:
         return f"Error: {str(e)}", []
 # Function to clean unwanted content (like headers, footers, etc.)
 def clean_content(soup):
-    # Remove common unwanted elements
     for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
         tag.decompose()  # Remove the tag completely
-    # You can also remove specific classes or IDs if necessary, for example:
-    # for tag in soup.find_all(attrs={"class": "footer"}):
-    #     tag.decompose()
-    # Get the cleaned text from the remaining content
     return soup.get_text(separator="\n", strip=True)
-# Function to scrape selected links and generate PDFs
-def scrape_and_generate_pdfs(selected_links):
     try:
         if not selected_links:
             return "No links selected.", None
-        pdf_files = []
-        batch_size = 4  # Each PDF contains up to 4 links
-        # Process selected links in batches of 4
         for i in range(0, len(selected_links), batch_size):
             batch_links = selected_links[i:i + batch_size]
-            all_text = ""
-            # Scrape text content from each selected link
             for link in batch_links:
                 try:
                     response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                     if response.status_code == 200:
                         soup = BeautifulSoup(response.text, "html.parser")
                         page_text = clean_content(soup)
-                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
-                except:
-                    all_text += f"Failed to fetch content from {link}\n\n"
-            if all_text:
-                pdf_filename = f"output_{(i//batch_size) + 1}.pdf"
-                # Save as temporary HTML file
-                html_path = f"temp_{i}.html"
-                with open(html_path, "w", encoding="utf-8") as f:
-                    f.write(f"<html><body><pre>{all_text}</pre></body></html>")
-                # Convert HTML to PDF
-                pdfkit.from_file(html_path, pdf_filename)
-                os.remove(html_path)
-                pdf_files.append(pdf_filename)
-        return pdf_files  # Return list of generated PDFs
     except Exception as e:
         return f"Error: {str(e)}", None
 # Gradio UI with link selection
-def show_links_and_generate_pdfs(url):
     message, links = extract_links(url)
     if not links:
         return message, gr.update(choices=[], value=[])
     return message, gr.update(choices=links, value=[])
 iface = gr.Blocks()
 with iface:
-    gr.Markdown("### 🌐 Web Scraper & PDF Generator")
-    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")
     url_input = gr.Textbox(label="Enter Website URL")
     extract_btn = gr.Button("Extract Links")
     message_output = gr.Markdown("")
     link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
-    generate_btn = gr.Button("Generate PDFs")
-    pdf_output = gr.File(label="Download Generated PDFs")
-    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
-    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)
 iface.launch()

 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 import os
 import math
+from docx import Document  # Import for Word file generation
 # Function to extract all links from a website
 def extract_links(url):
     try:
         response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
         if response.status_code != 200:
             return f"Error: Unable to fetch page (Status Code {response.status_code})", []
         soup = BeautifulSoup(response.text, "html.parser")
         base_url = "/".join(url.split("/")[:3])  # Extract base domain
         links = []
         for a_tag in soup.find_all("a", href=True):
             href = a_tag["href"]
             if not href.startswith("http"):  # Convert relative links to absolute
                 href = base_url + href if href.startswith("/") else base_url + "/" + href
             links.append(href)
         links = list(set(links))  # Remove duplicates
         if not links:
             return "No links found on the website.", []
+        return f"✅ {len(links)} links found! Select which ones to convert into Word files:", links
     except Exception as e:
         return f"Error: {str(e)}", []
 # Function to clean unwanted content (like headers, footers, etc.)
 def clean_content(soup):
     for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
         tag.decompose()  # Remove the tag completely
     return soup.get_text(separator="\n", strip=True)
+# Function to scrape selected links and generate Word files
+def scrape_and_generate_word(selected_links):
     try:
         if not selected_links:
             return "No links selected.", None
+        word_files = []
+        batch_size = 4  # Each Word file contains up to 4 links
         for i in range(0, len(selected_links), batch_size):
             batch_links = selected_links[i:i + batch_size]
+            doc = Document()
             for link in batch_links:
                 try:
                     response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                     if response.status_code == 200:
                         soup = BeautifulSoup(response.text, "html.parser")
                         page_text = clean_content(soup)
+                        # Add title for each link
+                        doc.add_heading(f"Content from: {link}", level=1)
+                        doc.add_paragraph(page_text)
+                        doc.add_page_break()  # Ensure proper formatting
+                except:
+                    doc.add_paragraph(f"Failed to fetch content from {link}\n\n")
+            # Save the Word file
+            word_filename = f"output_{(i//batch_size) + 1}.docx"
+            doc.save(word_filename)
+            word_files.append(word_filename)
+        return word_files  # Return list of generated Word files
     except Exception as e:
         return f"Error: {str(e)}", None
 # Gradio UI with link selection
+def show_links_and_generate_word(url):
     message, links = extract_links(url)
     if not links:
         return message, gr.update(choices=[], value=[])
     return message, gr.update(choices=links, value=[])
 iface = gr.Blocks()
 with iface:
+    gr.Markdown("### 🌐 Web Scraper & Word Document Generator")
+    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into Word files (4 links per file).")
     url_input = gr.Textbox(label="Enter Website URL")
     extract_btn = gr.Button("Extract Links")
     message_output = gr.Markdown("")
     link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
+    generate_btn = gr.Button("Generate Word Files")
+    word_output = gr.File(label="Download Generated Word Files")
+    extract_btn.click(show_links_and_generate_word, inputs=url_input, outputs=[message_output, link_selector])
+    generate_btn.click(scrape_and_generate_word, inputs=link_selector, outputs=word_output)
 iface.launch()