File size: 1,951 Bytes
bad4426
0d79f8e
bad4426
 
12a3818
5a0a5cc
4c8f5ab
9d51b92
5a0a5cc
4c8f5ab
0d79f8e
 
bad4426
4c8f5ab
0d79f8e
4c8f5ab
0d79f8e
 
 
9d51b92
4c8f5ab
0d79f8e
4c8f5ab
0d79f8e
9d51b92
12a3818
4c8f5ab
9d51b92
 
 
bad4426
0d79f8e
 
 
9d51b92
0d79f8e
 
 
 
 
 
 
 
bad4426
0d79f8e
 
 
 
712cc1f
0d79f8e
 
 
 
 
4c8f5ab
0d79f8e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import streamlit as st
import os
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import tempfile

# Use safe temporary directory (works on Hugging Face)
OUT_DIR = tempfile.gettempdir()

def scrape_website(url):
    """Fetch a webpage and return its visible text content.

    Parameters
    ----------
    url : str
        Fully-qualified URL to fetch.

    Returns
    -------
    str
        Up to 20,000 characters of visible page text, or an
        "Error scraping ..." message on any failure. Errors are
        reported inline (best-effort) because the caller embeds
        the string in the PDF either way.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop non-visible elements first: get_text() would otherwise
        # include raw JavaScript/CSS source in the "readable" output.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        text = soup.get_text(separator="\n", strip=True)
        return text[:20000]  # Cap content size to keep the PDF manageable
    except Exception as e:
        # Deliberate best-effort: report the failure for this URL rather
        # than aborting the whole batch.
        return f"Error scraping {url}: {str(e)}"

def make_pdf(urls):
    """Scrape each URL and write the combined text into one PDF.

    Parameters
    ----------
    urls : list[str]
        Website URLs to scrape; each becomes one section in the PDF.

    Returns
    -------
    str
        Path of the generated PDF inside the temp directory (OUT_DIR).
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Prefer a Unicode-capable font: scraped pages routinely contain
    # characters outside latin-1, which FPDF's core fonts cannot encode.
    font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
    if os.path.exists(font_path):
        pdf.add_font("DejaVu", "", fname=font_path, uni=True)
        pdf.set_font("DejaVu", size=12)
        unicode_ok = True
    else:
        # Fallback for hosts without DejaVu at that path: use a core font
        # and replace unencodable characters instead of crashing the app.
        pdf.set_font("Helvetica", size=12)
        unicode_ok = False

    for url in urls:
        st.write(f"Scraping: {url}")
        content = scrape_website(url)
        section = f"Website: {url}\n\n{content}\n\n---\n\n"
        if not unicode_ok:
            # Core fonts are latin-1 only; degrade gracefully.
            section = section.encode("latin-1", "replace").decode("latin-1")
        pdf.multi_cell(0, 8, section)

    pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
    pdf.output(pdf_path)
    return pdf_path

# --- Streamlit user interface ---
st.set_page_config(page_title="Website → PDF for RAG", layout="wide")
st.title("🌐 Website Scraper → PDF Generator for RAG Chatbots")

# Pre-filled example URLs shown in the input box.
DEFAULT_URLS = "https://njmarketings.com\nhttps://njmarketings.com/seo\nhttps://njmarketings.com/events"

urls_input = st.text_area(
    "Enter website URLs (one per line):",
    DEFAULT_URLS,
)

if st.button("Generate PDF"):
    # Keep only the non-blank lines as candidate URLs.
    urls = [line.strip() for line in urls_input.splitlines() if line.strip()]
    if not urls:
        st.warning("Please enter at least one URL.")
    else:
        pdf_path = make_pdf(urls)
        st.success("✅ PDF generated successfully!")
        with open(pdf_path, "rb") as f:
            st.download_button("⬇️ Download PDF", f, file_name="website_data.pdf", mime="application/pdf")