Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from fpdf import FPDF | |
| import tempfile | |
# Directory that receives the generated PDF. The system temporary directory
# is used because it is a safe, writable location (the original note says
# this works on Hugging Face).
OUT_DIR = tempfile.gettempdir()
def scrape_website(url, *, max_chars=20000, timeout=10):
    """Download *url* and return its text content, or an error message.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of text to return.
            Defaults to 20000; ``None`` disables the limit.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, one fragment per line, truncated to ``max_chars``
        characters. On any failure (invalid URL, network problem, HTTP error
        status, parsing problem) the string
        ``"Error scraping <url>: <reason>"`` is returned instead of raising,
        so a single bad URL does not abort a multi-URL run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the raw bytes instead of ``response.text``. When the server
        # sends no charset, Requests decodes text/* bodies as ISO-8859-1,
        # which garbles UTF-8 pages. Given bytes, BeautifulSoup can detect
        # the encoding itself (BOM, <meta charset>, heuristics). A charset
        # that the server did declare is still honoured via from_encoding.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # One text fragment per line, with surrounding whitespace stripped.
        text = soup.get_text(separator="\n", strip=True)
        return text[:max_chars]
    except Exception as e:
        # Deliberately broad: callers expect an error string, never an
        # exception, whatever goes wrong while fetching or parsing.
        return f"Error scraping {url}: {e}"
def make_pdf(urls):
    """Scrape every URL in *urls* and write the collected text to one PDF.

    Progress is reported in the Streamlit page with one "Scraping: <url>"
    line per URL. A URL that cannot be scraped contributes the error message
    returned by ``scrape_website`` instead of page text.

    Args:
        urls: Iterable of website URLs to include, in output order.

    Returns:
        Path of the generated file, ``website_data.pdf`` inside ``OUT_DIR``.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # Register a Unicode TrueType font, presumably so scraped text outside
    # Latin-1 can be rendered.
    # NOTE(review): assumes DejaVu Sans is installed at this path on the
    # deployment image - confirm.
    pdf.add_font("DejaVu", "", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
    pdf.set_font("DejaVu", size=12)

    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        # Start every section at the left margin. With fpdf2, multi_cell()
        # leaves the cursor at the right edge by default, so a following
        # multi_cell(0, ...) would have no width left and raise
        # "Not enough horizontal space to render a single character".
        # On legacy PyFPDF the cursor is already at the margin, so this is
        # a no-op there.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 8, f"Website: {url}\n\n{content}\n\n---\n\n")

    # NOTE(review): the file name is fixed, so concurrent sessions would
    # overwrite each other's output - confirm whether that matters here.
    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path
# ---------------------------------------------------------------------------
# Streamlit page: collect URLs, build the PDF on demand, offer it for download.
# ---------------------------------------------------------------------------
# NOTE(review): the non-ASCII symbols in the labels below look mis-encoded
# (probably an arrow, a check mark and a download arrow originally). They are
# reproduced unchanged here - confirm against the file saved as UTF-8.
st.set_page_config(layout="wide", page_title="Website β PDF for RAG")
st.title("π Website Scraper β PDF Generator for RAG Chatbots")

# Multi-line input, pre-filled with three example pages.
urls_input = st.text_area(
    label="Enter website URLs (one per line):",
    value="https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events",
)

generate_clicked = st.button("Generate PDF")
if generate_clicked:
    # Trim every line and drop the ones that end up empty.
    urls = [line for line in map(str.strip, urls_input.splitlines()) if line]

    if not urls:
        st.warning("Please enter at least one URL.")
    else:
        pdf_path = make_pdf(urls)
        st.success("β PDF generated successfully!")
        # Hand the finished file to the browser as a download.
        with open(pdf_path, "rb") as pdf_file:
            st.download_button(
                "β¬οΈ Download PDF",
                pdf_file,
                file_name="website_data.pdf",
                mime="application/pdf",
            )