# aiwebscrapper / src/streamlit_app.py
# Author: maaz21 — "Update src/streamlit_app.py" (commit 9d51b92, verified)
import streamlit as st
import os
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import tempfile
# Use a safe, writable temporary directory for generated PDFs
# (the app's working directory is read-only on Hugging Face Spaces).
OUT_DIR = tempfile.gettempdir()
def scrape_website(url):
    """Fetch *url* and return its readable text content.

    Parameters
    ----------
    url : str
        Fully-qualified URL to fetch.

    Returns
    -------
    str
        Up to 20,000 characters of visible page text, or an
        ``"Error scraping ..."`` message string when the request fails.
        Errors are returned rather than raised so one bad URL does not
        abort the whole PDF run.
    """
    try:
        # Some sites reject the default python-requests User-Agent.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; WebScraperBot/1.0)"}
        response = requests.get(url, timeout=10, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop script/style/noscript nodes: get_text() would otherwise
        # include raw JS/CSS source, polluting the extracted RAG text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # Extract readable text
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000]  # Limit content size
    except Exception as e:
        # Best-effort by design: report the failure inline in the PDF.
        return f"Error scraping {url}: {str(e)}"
def make_pdf(urls):
    """Scrape each URL and write the combined text into one PDF.

    Parameters
    ----------
    urls : list[str]
        Website URLs to scrape; each becomes one section in the PDF.

    Returns
    -------
    str
        Filesystem path of the generated PDF inside OUT_DIR.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    try:
        # DejaVu supports full Unicode; this path exists on the Debian-based
        # Hugging Face image but not on every host — fall back if missing.
        pdf.add_font("DejaVu", "", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
        pdf.set_font("DejaVu", size=12)
        unicode_ok = True
    except Exception:
        # Core Helvetica is always available but only encodes latin-1.
        pdf.set_font("Helvetica", size=12)
        unicode_ok = False
    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        block = f"Website: {url}\n\n{content}\n\n---\n\n"
        if not unicode_ok:
            # Replace characters the core font cannot encode instead of crashing.
            block = block.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 8, block)
    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path
# Streamlit UI — collects URLs, builds the PDF, and offers it for download.
# Note: the original strings were mojibake (UTF-8 mis-decoded, e.g. "β†’"
# for "→"); repaired here to the intended characters.
st.set_page_config(page_title="Website → PDF for RAG", layout="wide")
st.title("🌐 Website Scraper → PDF Generator for RAG Chatbots")

urls_input = st.text_area(
    "Enter website URLs (one per line):",
    "https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events"
)

if st.button("Generate PDF"):
    # Keep non-blank lines only, with surrounding whitespace trimmed.
    urls = [line.strip() for line in urls_input.splitlines() if line.strip()]
    if not urls:
        st.warning("Please enter at least one URL.")
    else:
        pdf_path = make_pdf(urls)
        st.success("✅ PDF generated successfully!")
        with open(pdf_path, "rb") as f:
            st.download_button("⬇️ Download PDF", f, file_name="website_data.pdf", mime="application/pdf")