# aiwebscrapper / src/streamlit_app.py
# Author: maaz21 — "Update src/streamlit_app.py" (commit 9d51b92, verified)
import streamlit as st
import os
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import tempfile
# Use a safe, writable temporary directory for generated PDFs
# (the app's working directory is read-only on Hugging Face Spaces).
OUT_DIR = tempfile.gettempdir()
def scrape_website(url):
    """Fetch *url* and return its readable text content.

    Parameters
    ----------
    url : str
        Fully-qualified URL to fetch.

    Returns
    -------
    str
        Up to 20,000 characters of visible page text, or an
        ``"Error scraping ..."`` message string when the request fails.
        Errors are returned rather than raised so one bad URL does not
        abort the whole PDF run.
    """
    try:
        # Some sites reject the default python-requests User-Agent.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; WebScraperBot/1.0)"}
        response = requests.get(url, timeout=10, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop script/style/noscript nodes: get_text() would otherwise
        # include raw JS/CSS source, polluting the extracted RAG text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        # Extract readable text
        text = soup.get_text(separator="\n", strip=True)
        return text[:20000]  # Limit content size
    except Exception as e:
        # Best-effort by design: report the failure inline in the PDF.
        return f"Error scraping {url}: {str(e)}"
def make_pdf(urls):
    """Scrape each URL and write the combined text into one PDF.

    Parameters
    ----------
    urls : list[str]
        Website URLs to scrape; each becomes one section in the PDF.

    Returns
    -------
    str
        Filesystem path of the generated PDF inside OUT_DIR.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    try:
        # DejaVu supports full Unicode; this path exists on the Debian-based
        # Hugging Face image but not on every host — fall back if missing.
        pdf.add_font("DejaVu", "", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
        pdf.set_font("DejaVu", size=12)
        unicode_ok = True
    except Exception:
        # Core Helvetica is always available but only encodes latin-1.
        pdf.set_font("Helvetica", size=12)
        unicode_ok = False
    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        block = f"Website: {url}\n\n{content}\n\n---\n\n"
        if not unicode_ok:
            # Replace characters the core font cannot encode instead of crashing.
            block = block.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 8, block)
    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path
# Streamlit UI — collects URLs, builds the PDF, and offers it for download.
# Note: the original strings were mojibake (UTF-8 mis-decoded, e.g. "β†’"
# for "→"); repaired here to the intended characters.
st.set_page_config(page_title="Website → PDF for RAG", layout="wide")
st.title("🌐 Website Scraper → PDF Generator for RAG Chatbots")

urls_input = st.text_area(
    "Enter website URLs (one per line):",
    "https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events"
)

if st.button("Generate PDF"):
    # Keep non-blank lines only, with surrounding whitespace trimmed.
    urls = [line.strip() for line in urls_input.splitlines() if line.strip()]
    if not urls:
        st.warning("Please enter at least one URL.")
    else:
        pdf_path = make_pdf(urls)
        st.success("✅ PDF generated successfully!")
        with open(pdf_path, "rb") as f:
            st.download_button("⬇️ Download PDF", f, file_name="website_data.pdf", mime="application/pdf")