# Spaces: Sleeping | File size: 1,951 Bytes
# (Header residue from the hosted file view; commit-hash and line-number gutters removed.)
import streamlit as st
import os
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import tempfile
# Directory where generated files are written: the temporary directory
# reported by tempfile.gettempdir() (original note: works on Hugging Face).
OUT_DIR = tempfile.gettempdir()
def scrape_website(url):
    """Scrape the readable text content from a webpage.

    The raw response bytes (not ``response.text``) are handed to
    BeautifulSoup so that it can detect the page encoding itself, including
    ``<meta charset>`` declarations. The charset from the HTTP header is only
    forwarded when the server explicitly declared one.

    Args:
        url: Address of the page to fetch.

    Returns:
        Up to 20,000 characters of extracted text, one text node per line.
        On any failure the function does not raise; it returns a string of
        the form ``"Error scraping <url>: <reason>"`` instead.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # requests assumes ISO-8859-1 for text/* responses whose Content-Type
        # carries no charset, which garbles UTF-8 pages. Trust the header
        # encoding only when it was actually declared; otherwise let the
        # parser sniff the bytes.
        content_type = response.headers.get("content-type", "")
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # Extract readable text
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000]  # Limit content size
    except Exception as e:
        # Deliberate best-effort boundary: the caller embeds this message in
        # the generated document rather than aborting the whole run.
        return f"Error scraping {url}: {str(e)}"
def make_pdf(urls):
    """Create a PDF file from a list of website URLs.

    Each URL is scraped with ``scrape_website`` and appended to the document
    as a "Website: <url>" header, the scraped text, and a ``---`` separator.
    Progress is reported through ``st.write``.

    Args:
        urls: Iterable of URL strings to scrape.

    Returns:
        Path of the written PDF (``website_data.pdf`` inside ``OUT_DIR``).
    """
    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # The DejaVu TrueType font is needed for full Unicode output, but it is
    # not installed on every host. Fall back to a built-in core font instead
    # of crashing when the file is missing.
    has_unicode_font = os.path.isfile(font_path)
    if has_unicode_font:
        pdf.add_font("DejaVu", "", fname=font_path, uni=True)
        pdf.set_font("DejaVu", size=12)
    else:
        st.warning(
            "DejaVu font not found; falling back to Helvetica. "
            "Characters outside Latin-1 will be replaced with '?'."
        )
        pdf.set_font("Helvetica", size=12)

    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        entry = f"Website: {url}\n\n{content}\n\n---\n\n"
        if not has_unicode_font:
            # Core fonts only cover Latin-1; replace anything else so the
            # PDF library does not raise an encoding error.
            entry = entry.encode("latin-1", "replace").decode("latin-1")
        # Start every entry at the left margin. Depending on the FPDF
        # version, multi_cell may leave the cursor at the right edge, which
        # would give the next zero-width ("rest of line") cell no room.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 8, entry)

    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path
# Streamlit UI: collect URLs, build the PDF on demand, and offer it for download.
st.set_page_config(page_title="Website → PDF for RAG", layout="wide")
st.title("🌐 Website Scraper → PDF Generator for RAG Chatbots")

urls_input = st.text_area(
    "Enter website URLs (one per line):",
    "https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events",
)

if st.button("Generate PDF"):
    # Trim every line and keep only the ones that still contain text.
    stripped_lines = (line.strip() for line in urls_input.splitlines())
    urls = [line for line in stripped_lines if line]

    if not urls:
        st.warning("Please enter at least one URL.")
    else:
        pdf_path = make_pdf(urls)
        st.success("✅ PDF generated successfully!")
        # Hand the open binary file to the download widget.
        with open(pdf_path, "rb") as pdf_file:
            st.download_button(
                "⬇️ Download PDF",
                pdf_file,
                file_name="website_data.pdf",
                mime="application/pdf",
            )
|