Spaces:

maaz21
/

aiwebscrapper

Sleeping

App Files Files Community

maaz21 committed on Oct 16, 2025

Commit

4c8f5ab

verified ·

1 Parent(s): 2fc9d03

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +66 -70

src/streamlit_app.py CHANGED Viewed

@@ -1,88 +1,84 @@
 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
 from fpdf import FPDF
-from PyPDF2 import PdfMerger
-import tempfile, os
-st.set_page_config(page_title="Website Scraper → PDF (RAG Ready)", layout="wide")
-st.title("🌐 Website Scraper → 📄 PDF Generator for RAG Chatbots")
-st.write("Scrape all pages from a website and generate a single combined PDF containing the content — ready for RAG ingestion.")
-url = st.text_input("Enter website URL (e.g. https://njmarketings.com):")
-def get_all_links(base_url):
-    """Find all internal links from a website"""
-    visited, to_visit = set(), [base_url]
-    domain = urlparse(base_url).netloc
-    while to_visit:
-        current = to_visit.pop(0)
-        if current in visited:
-            continue
-        visited.add(current)
-        try:
-            res = requests.get(current, timeout=10)
-            soup = BeautifulSoup(res.text, "html.parser")
-            for a in soup.find_all("a", href=True):
-                link = urljoin(base_url, a["href"])
-                if domain in urlparse(link).netloc and link not in visited and link.startswith(base_url):
-                    to_visit.append(link)
-        except Exception:
-            pass
     return list(visited)
-def extract_text(url):
-    """Extract visible text from a webpage"""
     try:
-        r = requests.get(url, timeout=10)
-        soup = BeautifulSoup(r.text, "html.parser")
-        for tag in soup(["script", "style", "noscript"]):
-            tag.extract()
-        text = " ".join(soup.stripped_strings)
-        return text
-    except Exception:
-        return ""
-def save_text_to_pdf(text, filename):
-    """Save plain text to a simple PDF using fpdf"""
     pdf = FPDF()
     pdf.add_page()
-    pdf.set_font("Arial", size=12)
     for line in text.split("\n"):
-        pdf.multi_cell(0, 10, line)
-    pdf.output(filename)
-if st.button("Scrape & Generate PDF"):
-    if not url:
-        st.warning("Please enter a valid URL first.")
-    else:
-        with st.spinner("Scraping pages..."):
-            links = get_all_links(url)
-            st.write(f"✅ Found {len(links)} pages")
-            temp_dir = tempfile.mkdtemp()
-            merger = PdfMerger()
-            for link in links:
-                text = extract_text(link)
-                filename = link.strip("/").split("/")[-1] or "index"
-                pdf_path = os.path.join(temp_dir, f"{filename}.pdf")
-                save_text_to_pdf(f"URL: {link}\n\n{text}", pdf_path)
-                merger.append(pdf_path)
-                st.success(f"Processed: {link}")
-            combined_pdf = os.path.join(temp_dir, "combined_website.pdf")
-            merger.write(combined_pdf)
-            merger.close()
-            with open(combined_pdf, "rb") as f:
-                st.download_button(
-                    "📥 Download Combined PDF",
-                    data=f,
-                    file_name="website_data.pdf",
-                    mime="application/pdf"
-                )
-        st.success("🎉 All pages scraped and combined into one PDF!")

 import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 from fpdf import FPDF
+import os
+from urllib.parse import urljoin, urlparse
# --- Page setup and input widgets -------------------------------------------
# Streamlit renders widgets in the order these statements execute, so the
# order below is the order the user sees on the page.
st.set_page_config(page_title="Web Scraper to PDF", layout="wide")

# Generated PDFs are written to an "outputs" folder under the current working
# directory; create it up front so later writes cannot fail on a missing path.
OUT_DIR = os.path.join(os.getcwd(), "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

st.title("🌐 Website Scraper → PDF for RAG Chatbots")
st.write("Enter a website URL to scrape all readable text and convert it to a PDF file for your RAG model training.")

url = st.text_input("Enter website URL (e.g., https://njmarketings.com)")

# This value is handed to fetch_links() as a crawl depth (number of link hops
# from the start page), not as a page count, so the label says so explicitly.
depth = st.number_input(
    "Link depth to crawl (0 for main page only)",
    min_value=0,
    max_value=5,
    value=0,
)
start_btn = st.button("Scrape & Generate PDF")
def fetch_links(base_url, depth=1):
    """Collect internal links reachable from ``base_url`` within ``depth`` hops.

    The crawl is breadth-first: level 1 is every internal link found on
    ``base_url``, level 2 is every new internal link found on those pages,
    and so on. Each page is downloaded at most once.

    Args:
        base_url: Page to start from. A link counts as internal when it is on
            the same host as ``base_url`` and its URL starts with ``base_url``.
        depth: Number of link levels to follow. ``0`` (or a negative value)
            performs no requests and returns an empty list.

    Returns:
        Discovered URLs in discovery order, without duplicates and with any
        ``#fragment`` removed. ``base_url`` itself is never included, so a
        caller can prepend it without fetching the start page twice.
    """
    base_host = urlparse(base_url).netloc
    seen = {base_url}  # the start page counts as already known
    found = []
    frontier = [base_url]

    for _ in range(depth):
        next_frontier = []
        for page_url in frontier:
            try:
                r = requests.get(page_url, timeout=10)
                soup = BeautifulSoup(r.text, 'html.parser')
            except (requests.RequestException, ValueError):
                # Best-effort crawl: an unreachable or invalid page is skipped.
                continue

            for a in soup.find_all('a', href=True):
                try:
                    # Relative hrefs are relative to the page they appear on.
                    target = urlparse(urljoin(page_url, a['href']))
                except ValueError:
                    continue  # malformed href
                # "#section" variants point at the same document.
                abs_url = target._replace(fragment="").geturl()

                # Compare hosts instead of searching for base_url as a
                # substring, which would also accept foreign URLs that merely
                # mention it (e.g. in a query string).
                if target.netloc != base_host or not abs_url.startswith(base_url):
                    continue
                if abs_url in seen:
                    continue
                seen.add(abs_url)
                found.append(abs_url)
                next_frontier.append(abs_url)

        if not next_frontier:
            break  # nothing new was discovered; deeper levels would be empty
        # Only pages found on this level are fetched on the next one.
        frontier = next_frontier

    return found
def extract_text_from_url(url):
    """Download ``url`` and return its visible text, one non-empty line per row.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<noscript>``
        content removed, surrounding whitespace stripped from every line and
        blank lines dropped. If anything goes wrong (network error, HTTP error
        status, parse failure) the string ``"Failed to fetch <url>: <error>"``
        is returned instead of raising, so one bad page does not abort a
        multi-page scrape.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()

        # When the server declares no charset, requests falls back to
        # ISO-8859-1 for text responses, which garbles UTF-8 pages. In that
        # case hand BeautifulSoup the raw bytes so it can detect the encoding
        # from the document itself.
        content_type = response.headers.get("Content-Type", "").lower()
        markup = response.text if "charset" in content_type else response.content
        soup = BeautifulSoup(markup, "html.parser")

        # Remove scripts and styles
        for tag in soup(["script", "style", "noscript"]):
            tag.extract()

        text = soup.get_text(separator="\n")
        stripped = (line.strip() for line in text.splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        # Deliberately broad: callers embed the message in the output and
        # continue with the next page.
        return f"Failed to fetch {url}: {e}"
def create_pdf(text, output_path):
    """Write ``text`` to a PDF file at ``output_path`` using fpdf.

    Each ``"\\n"``-separated line of ``text`` becomes one wrapped paragraph.
    Characters that cannot be encoded as Latin-1 are replaced with ``?``.

    Args:
        text: Plain text to render.
        output_path: Destination path of the PDF file.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=11)
    for line in text.split("\n"):
        # The built-in core fonts (Arial here) only cover Latin-1. Scraped
        # pages routinely contain curly quotes, emoji or non-Latin scripts,
        # which would otherwise abort the whole export with an encoding error.
        printable = line.encode("latin-1", "replace").decode("latin-1")
        # Start every paragraph at the left margin. Newer fpdf2 releases leave
        # the cursor at the right edge after multi_cell(), which makes the
        # next full-width cell fail; on older releases this is a no-op.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 8, printable)
    pdf.output(output_path)
# --- Button handler: scrape the site, build the PDF, offer it for download ---
if start_btn and not url.strip():
    # Give feedback instead of silently ignoring the click.
    st.warning("Please enter a website URL first.")
elif start_btn:
    start_url = url.strip()
    with st.spinner("Scraping website..."):
        # dict.fromkeys() removes duplicates while keeping order, so the start
        # page is not fetched twice if the crawl also reports it.
        all_links = list(dict.fromkeys([start_url, *fetch_links(start_url, depth)]))
        st.write(f"🔗 Found {len(all_links)} pages to fetch.")

        sections = []
        for link in all_links:
            st.write(f"📄 Fetching: {link}")
            page_text = extract_text_from_url(link)
            sections.append(f"\n\n--- PAGE: {link} ---\n\n{page_text}")
        all_text = "".join(sections)

        pdf_path = os.path.join(OUT_DIR, "scraped_data.pdf")
        try:
            create_pdf(all_text, pdf_path)
        except Exception as exc:
            # Top-level boundary: show a readable message, not a traceback.
            st.error(f"Could not create the PDF: {exc}")
        else:
            st.success("✅ PDF created successfully!")
            # Read the bytes right away: the path is shared by every session,
            # so the file may be overwritten by the next scrape.
            with open(pdf_path, "rb") as f:
                pdf_bytes = f.read()
            st.download_button("📥 Download PDF", pdf_bytes, file_name="website_data.pdf", mime="application/pdf")