Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +66 -70
src/streamlit_app.py
CHANGED
|
@@ -1,88 +1,84 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
-
from urllib.parse import urljoin, urlparse
|
| 5 |
from fpdf import FPDF
|
| 6 |
-
|
| 7 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
st.
|
| 10 |
-
st.title("π Website Scraper β π PDF Generator for RAG Chatbots")
|
| 11 |
-
st.write("Scrape all pages from a website and generate a single combined PDF containing the content β ready for RAG ingestion.")
|
| 12 |
|
| 13 |
-
url = st.text_input("Enter website URL (e.g. https://njmarketings.com)
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
def
|
| 16 |
-
"""
|
| 17 |
-
visited
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
return list(visited)
|
| 35 |
|
| 36 |
-
def
|
| 37 |
-
"""Extract visible text from a webpage"""
|
| 38 |
try:
|
| 39 |
-
|
| 40 |
-
soup = BeautifulSoup(
|
| 41 |
-
for tag in soup(["script", "style", "noscript"]):
|
| 42 |
-
tag.extract()
|
| 43 |
-
text = " ".join(soup.stripped_strings)
|
| 44 |
-
return text
|
| 45 |
-
except Exception:
|
| 46 |
-
return ""
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
pdf = FPDF()
|
| 51 |
pdf.add_page()
|
| 52 |
-
pdf.
|
|
|
|
| 53 |
for line in text.split("\n"):
|
| 54 |
-
pdf.multi_cell(0,
|
| 55 |
-
pdf.output(
|
| 56 |
-
|
| 57 |
-
if st.button("Scrape & Generate PDF"):
|
| 58 |
-
if not url:
|
| 59 |
-
st.warning("Please enter a valid URL first.")
|
| 60 |
-
else:
|
| 61 |
-
with st.spinner("Scraping pages..."):
|
| 62 |
-
links = get_all_links(url)
|
| 63 |
-
st.write(f"✅ Found {len(links)} pages")
|
| 64 |
-
|
| 65 |
-
temp_dir = tempfile.mkdtemp()
|
| 66 |
-
merger = PdfMerger()
|
| 67 |
-
|
| 68 |
-
for link in links:
|
| 69 |
-
text = extract_text(link)
|
| 70 |
-
filename = link.strip("/").split("/")[-1] or "index"
|
| 71 |
-
pdf_path = os.path.join(temp_dir, f"{filename}.pdf")
|
| 72 |
-
save_text_to_pdf(f"URL: {link}\n\n{text}", pdf_path)
|
| 73 |
-
merger.append(pdf_path)
|
| 74 |
-
st.success(f"Processed: {link}")
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
"📥 Download Combined PDF",
|
| 83 |
-
data=f,
|
| 84 |
-
file_name="website_data.pdf",
|
| 85 |
-
mime="application/pdf"
|
| 86 |
-
)
|
| 87 |
|
| 88 |
-
st.success("
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import requests
|
| 3 |
from bs4 import BeautifulSoup
|
|
|
|
| 4 |
from fpdf import FPDF
|
| 5 |
+
import os
|
| 6 |
+
from urllib.parse import urljoin, urlparse
|
| 7 |
+
|
| 8 |
+
# Page-level Streamlit configuration.
st.set_page_config(page_title="Web Scraper to PDF", layout="wide")

# Generated PDFs are written to an "outputs" folder under the current working
# directory; the folder is created if it does not exist yet.
OUT_DIR = os.path.join(os.getcwd(), "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

st.title("π Website Scraper β PDF for RAG Chatbots")
st.write("Enter a website URL to scrape all readable text and convert it to a PDF file for your RAG model training.")

url = st.text_input("Enter website URL (e.g., https://njmarketings.com)")
# This value is handed to fetch_links() as its `depth` argument (levels of
# links to follow), not as a page count, so the label describes it as a depth.
depth = st.number_input(
    "Link depth to crawl (0 for main page only)",
    min_value=0,
    max_value=5,
    value=0,
)
start_btn = st.button("Scrape & Generate PDF")
|
| 21 |
|
| 22 |
+
def _page_key(parts):
    """Return a key that is equal for URLs that point at the same page.

    Fragments and a trailing slash on the path are ignored.
    """
    return (parts.netloc, parts.path.rstrip("/"), parts.params, parts.query)


def fetch_links(base_url, depth=1):
    """Collect internal links reachable from ``base_url``.

    The crawl is breadth-first: level 1 is the set of links found on
    ``base_url`` itself, level 2 the links found on those pages, and so on.
    A link counts as internal when it is http(s), has the same host as
    ``base_url`` and its path starts with the path of ``base_url``.

    Args:
        base_url: Absolute URL of the page where the crawl starts.
        depth: Number of link levels to follow. ``0`` or less fetches nothing.

    Returns:
        A list of absolute URLs without fragments, in discovery order and
        without duplicates. ``base_url`` itself is not included.
    """
    root = urlparse(base_url)
    root_path = root.path.rstrip("/")
    seen = {_page_key(root)}
    found = []
    frontier = [base_url]

    for _ in range(depth):
        discovered = []
        for page_url in frontier:
            try:
                response = requests.get(page_url, timeout=10)
            except requests.RequestException:
                # Best-effort crawl: an unreachable page is skipped, not fatal.
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            for anchor in soup.find_all("a", href=True):
                try:
                    # Relative hrefs are relative to the page they appear on.
                    target = urlparse(urljoin(page_url, anchor["href"]))
                except ValueError:
                    # Malformed href; ignore this link only.
                    continue
                if target.scheme not in ("http", "https"):
                    continue
                if target.netloc != root.netloc or not target.path.startswith(root_path):
                    continue
                key = _page_key(target)
                if key in seen:
                    continue
                seen.add(key)
                link = target._replace(fragment="").geturl()
                found.append(link)
                discovered.append(link)
        if not discovered:
            break
        # Only pages found at this level are fetched at the next one, so no
        # page is downloaded twice.
        frontier = discovered
    return found
|
| 42 |
|
| 43 |
+
def extract_text_from_url(url):
    """Return the visible text of the page at ``url``.

    The page is downloaded, ``<script>``, ``<style>`` and ``<noscript>``
    elements are removed, and the remaining text is returned with each line
    stripped and blank lines dropped.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The page text, or a message of the form
        ``"Failed to fetch <url>: <error>"`` if the download or parsing fails.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat HTTP errors (404, 500, ...) as failures rather than scraping
        # the server's error page as if it were content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop elements whose text is never shown to a reader.
        for tag in soup(["script", "style", "noscript"]):
            tag.extract()

        stripped = (line.strip() for line in soup.get_text(separator="\n").splitlines())
        return "\n".join(line for line in stripped if line)
    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole scrape;
        # the failure is reported in place of the page text.
        return f"Failed to fetch {url}: {e}"
|
| 58 |
+
|
| 59 |
+
def create_pdf(text, output_path):
    """Write ``text`` to a PDF file at ``output_path`` using fpdf.

    Args:
        text: Text to render. It is split on newlines and each piece is
            written as its own wrapped cell.
        output_path: Path of the PDF file to write.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=11)
    for line in text.split("\n"):
        # The built-in Arial font only covers Latin-1. Scraped pages often
        # contain curly quotes, emoji or non-Latin scripts, which would make
        # fpdf raise an encoding error, so such characters become "?".
        printable = line.encode("latin-1", "replace").decode("latin-1")
        # Some fpdf releases leave the cursor at the right edge of the cell
        # after multi_cell(); start every line from the left margin.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 8, printable)
    pdf.output(output_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
# Main flow: runs on the rerun triggered by pressing the button.
if start_btn and not url.strip():
    st.warning("Please enter a valid URL first.")
elif start_btn:
    start_url = url.strip()
    with st.spinner("Scraping website..."):
        # The start page always comes first. dict.fromkeys drops repeats
        # (the crawler may report the start URL again) while keeping order.
        all_links = list(dict.fromkeys([start_url, *fetch_links(start_url, depth)]))
        # NOTE(review): the leading glyph in the next two messages looks
        # mis-encoded; the intended emoji cannot be recovered from this text,
        # so it is kept as-is.
        st.write(f"π Found {len(all_links)} pages to fetch.")

        sections = []
        for link in all_links:
            st.write(f"π Fetching: {link}")
            page_text = extract_text_from_url(link)
            sections.append(f"\n\n--- PAGE: {link} ---\n\n{page_text}")

        pdf_path = os.path.join(OUT_DIR, "scraped_data.pdf")
        create_pdf("".join(sections), pdf_path)

    st.success("✅ PDF created successfully!")
    with open(pdf_path, "rb") as f:
        st.download_button("📥 Download PDF", f, file_name="website_data.pdf", mime="application/pdf")
|