"""Streamlit app: scrape websites and bundle their text into one PDF.

Intended as a quick data-collection tool for RAG chatbot pipelines:
paste URLs, scrape their readable text, download the combined PDF.
"""

import os
import tempfile

import requests
import streamlit as st
from bs4 import BeautifulSoup
from fpdf import FPDF

# Safe, writable temporary directory (works on Hugging Face Spaces).
OUT_DIR = tempfile.gettempdir()

# Many sites reject the default python-requests User-Agent with 403.
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; WebsiteToPDF/1.0)"}

# Unicode-capable TTF font shipped on Debian-based images; may be absent
# elsewhere, so its presence is checked before use.
DEJAVU_FONT = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"


def scrape_website(url, max_chars=20000):
    """Return up to *max_chars* of readable text scraped from *url*.

    On any network or parsing failure an error string is returned instead
    of raising, so one bad URL does not abort the whole PDF build.
    """
    try:
        response = requests.get(url, timeout=10, headers=REQUEST_HEADERS)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Drop non-content tags so JS/CSS source doesn't pollute the text.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        text = soup.get_text(separator="\n", strip=True)
        return text[:max_chars]  # Limit content size per site.
    except Exception as e:
        return f"Error scraping {url}: {str(e)}"


def _configure_font(pdf):
    """Register a Unicode font if available, else fall back to Helvetica.

    Returns True when the Unicode TTF was loaded (full character support),
    False when the latin-1-only core font is in use.
    """
    if os.path.exists(DEJAVU_FONT):
        pdf.add_font("DejaVu", "", fname=DEJAVU_FONT, uni=True)
        pdf.set_font("DejaVu", size=12)
        return True
    # Core font: always available, but latin-1 only.
    pdf.set_font("Helvetica", size=12)
    return False


def make_pdf(urls):
    """Scrape each URL in *urls* and write the combined text to one PDF.

    Returns the path of the generated file inside OUT_DIR.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    unicode_ok = _configure_font(pdf)

    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        if not unicode_ok:
            # Core fonts cannot encode non-latin-1 characters; replace them
            # rather than crashing inside pdf.output().
            content = content.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 8, f"Website: {url}\n\n{content}\n\n---\n\n")

    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path


# ---------------------------------------------------------------------------
# Streamlit UI
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Website → PDF for RAG", layout="wide")
st.title("🌐 Website Scraper → PDF Generator for RAG Chatbots")

urls_input = st.text_area(
    "Enter website URLs (one per line):",
    "https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events",
)

if st.button("Generate PDF"):
    urls = [u.strip() for u in urls_input.splitlines() if u.strip()]
    if urls:
        pdf_path = make_pdf(urls)
        st.success("✅ PDF generated successfully!")
        with open(pdf_path, "rb") as f:
            st.download_button(
                "⬇️ Download PDF",
                f,
                file_name="website_data.pdf",
                mime="application/pdf",
            )
    else:
        st.warning("Please enter at least one URL.")