Create app.py
app.py (added):
import os
import html  # Used to escape scraped text before embedding it in HTML

import gradio as gr
import requests
import pdfkit
from bs4 import BeautifulSoup


# Extract all links from a website.
def extract_links(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")
        base_url = "/".join(url.split("/")[:3])  # Extract scheme + domain

        links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if not href.startswith("http"):  # Convert relative links to absolute
                href = base_url + href if href.startswith("/") else base_url + "/" + href
            links.append(href)

        links = sorted(set(links))  # Remove duplicates, keep a stable order
        if not links:
            return "No links found on the website.", []

        return f"✅ {len(links)} links found! Select which ones to convert into PDFs:", links

    except Exception as e:
        return f"Error: {e}", []


# Strip boilerplate elements (headers, footers, navigation, scripts, styles)
# and return the readable text of the page.
def clean_content(soup):
    for tag in soup.find_all(["header", "footer", "nav", "aside", "script", "style"]):
        tag.decompose()  # Remove the tag and its contents completely

    # Specific classes or IDs can be removed the same way if necessary, e.g.:
    # for tag in soup.find_all(attrs={"class": "footer"}):
    #     tag.decompose()

    return soup.get_text(separator="\n", strip=True)


# Scrape the selected links and generate PDFs, up to 4 links per file.
def scrape_and_generate_pdfs(selected_links):
    if not selected_links:
        raise gr.Error("No links selected.")

    try:
        pdf_files = []
        batch_size = 4  # Each PDF contains up to 4 links

        # Process the selected links in batches of 4.
        for i in range(0, len(selected_links), batch_size):
            batch_links = selected_links[i:i + batch_size]
            all_text = ""

            # Scrape the text content of each link in the batch.
            for link in batch_links:
                try:
                    response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        page_text = clean_content(soup)
                        all_text += f"--- Content from: {link} ---\n\n" + page_text + "\n\n"
                except requests.RequestException:
                    all_text += f"Failed to fetch content from {link}\n\n"

            if all_text:
                pdf_filename = f"output_{(i // batch_size) + 1}.pdf"

                # Write a temporary HTML file; escape the scraped text so stray
                # angle brackets can't break the markup.
                html_path = f"temp_{i}.html"
                with open(html_path, "w", encoding="utf-8") as f:
                    f.write(f"<html><body><pre>{html.escape(all_text)}</pre></body></html>")

                # Convert the HTML to PDF (pdfkit shells out to wkhtmltopdf).
                pdfkit.from_file(html_path, pdf_filename)
                os.remove(html_path)

                pdf_files.append(pdf_filename)

        return pdf_files  # List of generated PDF paths

    except Exception as e:
        raise gr.Error(f"Error: {e}")


# Gradio callback: extract the links and populate the checkbox group.
def show_links_and_generate_pdfs(url):
    message, links = extract_links(url)
    return message, gr.update(choices=links, value=[])


iface = gr.Blocks()

with iface:
    gr.Markdown("### 🌐 Web Scraper & PDF Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into PDFs (4 links per PDF).")

    url_input = gr.Textbox(label="Enter Website URL")
    extract_btn = gr.Button("Extract Links")

    message_output = gr.Markdown("")
    link_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_btn = gr.Button("Generate PDFs")

    # file_count="multiple" lets the callback return a list of PDF paths.
    pdf_output = gr.File(label="Download Generated PDFs", file_count="multiple")

    extract_btn.click(show_links_and_generate_pdfs, inputs=url_input, outputs=[message_output, link_selector])
    generate_btn.click(scrape_and_generate_pdfs, inputs=link_selector, outputs=pdf_output)

iface.launch()
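
A minimal sketch of exercising the two functions without the UI, assuming the app's dependencies (gradio, requests, beautifulsoup4, pdfkit) plus the wkhtmltopdf binary that pdfkit wraps are installed, and that the hypothetical target page actually yields links:

    _, links = extract_links("https://example.com")  # hypothetical URL
    pdf_paths = scrape_and_generate_pdfs(links[:4])  # one batch -> one PDF
    print(pdf_paths)                                 # e.g. ['output_1.pdf']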