"""Streamlit app: scrape websites and bundle their text into one PDF.

Intended as a quick data-collection tool for RAG chatbot pipelines:
paste URLs, scrape their readable text, download the combined PDF.
"""

import os
import tempfile

import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF

# Safe, writable temporary directory (works on Hugging Face Spaces).
OUT_DIR = tempfile.gettempdir()

# Many sites reject the default python-requests User-Agent with 403.
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; WebsiteToPDF/1.0)"}

# Unicode-capable TTF font shipped on Debian-based images; may be absent
# elsewhere, so its presence is checked before use.
DEJAVU_FONT = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"


def scrape_website(url, max_chars=20000):
    """Return up to *max_chars* of readable text scraped from *url*.

    On any network or parsing failure an error string is returned instead
    of raising, so one bad URL does not abort the whole PDF build.
    """
    try:
        response = requests.get(url, timeout=10, headers=REQUEST_HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content tags so JS/CSS source doesn't pollute the text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text[:max_chars]  # Limit content size per site.
    except Exception as e:
        return f"Error scraping {url}: {str(e)}"


def _configure_font(pdf):
    """Register a Unicode font if available, else fall back to Helvetica.

    Returns True when the Unicode TTF was loaded (full character support),
    False when the latin-1-only core font is in use.
    """
    if os.path.exists(DEJAVU_FONT):
        pdf.add_font("DejaVu", "", fname=DEJAVU_FONT, uni=True)
        pdf.set_font("DejaVu", size=12)
        return True
    # Core font: always available, but latin-1 only.
    pdf.set_font("Helvetica", size=12)
    return False


def make_pdf(urls):
    """Scrape each URL in *urls* and write the combined text to one PDF.

    Returns the path of the generated file inside OUT_DIR.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    unicode_ok = _configure_font(pdf)

    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        if not unicode_ok:
            # Core fonts cannot encode non-latin-1 characters; replace them
            # rather than crashing inside pdf.output().
            content = content.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 8, f"Website: {url}\n\n{content}\n\n---\n\n")

    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Website → PDF for RAG", layout="wide")
st.title("🌐 Website Scraper → PDF Generator for RAG Chatbots")

urls_input = st.text_area(
    "Enter website URLs (one per line):",
    "https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events",
)

if st.button("Generate PDF"):
    urls = [u.strip() for u in urls_input.splitlines() if u.strip()]
    if urls:
        pdf_path = make_pdf(urls)
        st.success("✅ PDF generated successfully!")
        with open(pdf_path, "rb") as f:
            st.download_button(
                "⬇️ Download PDF",
                f,
                file_name="website_data.pdf",
                mime="application/pdf",
            )
    else:
        st.warning("Please enter at least one URL.")