maaz21 committed on
Commit
4c8f5ab
·
verified ·
1 Parent(s): 2fc9d03

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +66 -70
src/streamlit_app.py CHANGED
@@ -1,88 +1,84 @@
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
- from urllib.parse import urljoin, urlparse
5
  from fpdf import FPDF
6
- from PyPDF2 import PdfMerger
7
- import tempfile, os
 
 
 
 
 
 
 
 
8
 
9
- st.set_page_config(page_title="Website Scraper β†’ PDF (RAG Ready)", layout="wide")
10
- st.title("🌐 Website Scraper β†’ πŸ“„ PDF Generator for RAG Chatbots")
11
- st.write("Scrape all pages from a website and generate a single combined PDF containing the content β€” ready for RAG ingestion.")
12
 
13
- url = st.text_input("Enter website URL (e.g. https://njmarketings.com):")
 
 
14
 
15
- def get_all_links(base_url):
16
- """Find all internal links from a website"""
17
- visited, to_visit = set(), [base_url]
18
- domain = urlparse(base_url).netloc
19
 
20
- while to_visit:
21
- current = to_visit.pop(0)
22
- if current in visited:
23
- continue
24
- visited.add(current)
25
- try:
26
- res = requests.get(current, timeout=10)
27
- soup = BeautifulSoup(res.text, "html.parser")
28
- for a in soup.find_all("a", href=True):
29
- link = urljoin(base_url, a["href"])
30
- if domain in urlparse(link).netloc and link not in visited and link.startswith(base_url):
31
- to_visit.append(link)
32
- except Exception:
33
- pass
34
  return list(visited)
35
 
36
- def extract_text(url):
37
- """Extract visible text from a webpage"""
38
  try:
39
- r = requests.get(url, timeout=10)
40
- soup = BeautifulSoup(r.text, "html.parser")
41
- for tag in soup(["script", "style", "noscript"]):
42
- tag.extract()
43
- text = " ".join(soup.stripped_strings)
44
- return text
45
- except Exception:
46
- return ""
47
 
48
- def save_text_to_pdf(text, filename):
49
- """Save plain text to a simple PDF using fpdf"""
 
 
 
 
 
 
 
 
 
 
50
  pdf = FPDF()
51
  pdf.add_page()
52
- pdf.set_font("Arial", size=12)
 
53
  for line in text.split("\n"):
54
- pdf.multi_cell(0, 10, line)
55
- pdf.output(filename)
56
-
57
- if st.button("Scrape & Generate PDF"):
58
- if not url:
59
- st.warning("Please enter a valid URL first.")
60
- else:
61
- with st.spinner("Scraping pages..."):
62
- links = get_all_links(url)
63
- st.write(f"βœ… Found {len(links)} pages")
64
-
65
- temp_dir = tempfile.mkdtemp()
66
- merger = PdfMerger()
67
-
68
- for link in links:
69
- text = extract_text(link)
70
- filename = link.strip("/").split("/")[-1] or "index"
71
- pdf_path = os.path.join(temp_dir, f"{filename}.pdf")
72
- save_text_to_pdf(f"URL: {link}\n\n{text}", pdf_path)
73
- merger.append(pdf_path)
74
- st.success(f"Processed: {link}")
75
 
76
- combined_pdf = os.path.join(temp_dir, "combined_website.pdf")
77
- merger.write(combined_pdf)
78
- merger.close()
 
 
 
 
 
 
79
 
80
- with open(combined_pdf, "rb") as f:
81
- st.download_button(
82
- "πŸ“₯ Download Combined PDF",
83
- data=f,
84
- file_name="website_data.pdf",
85
- mime="application/pdf"
86
- )
87
 
88
- st.success("πŸŽ‰ All pages scraped and combined into one PDF!")
 
 
 
1
  import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
 
4
  from fpdf import FPDF
5
+ import os
6
+ from urllib.parse import urljoin, urlparse
7
+
8
# --- Page setup and user inputs ------------------------------------------
# Streamlit re-runs this script from the top on every interaction, so these
# statements execute on each rerun.
st.set_page_config(page_title="Web Scraper to PDF", layout="wide")

# Folder (under the current working directory) where the generated PDF is
# written before it is offered for download.
OUT_DIR = os.path.join(os.getcwd(), "outputs")
os.makedirs(OUT_DIR, exist_ok=True)

st.title("🌐 Website Scraper → PDF for RAG Chatbots")

st.write("Enter a website URL to scrape all readable text and convert it to a PDF file for your RAG model training.")

# Strip stray whitespace so a pasted URL with a trailing space/newline still
# works when handed to requests.
url = st.text_input("Enter website URL (e.g., https://njmarketings.com)").strip()

# The value is forwarded to fetch_links() as a crawl *depth* (levels of links
# to follow), not a page count, so the label says so.
depth = int(st.number_input("Link depth to crawl (0 for main page only)", 0, 5, 0))
start_btn = st.button("Scrape & Generate PDF")
21
 
22
def fetch_links(base_url, depth=1):
    """Collect internal links reachable from ``base_url`` within ``depth`` levels.

    The crawl is breadth-first: level 1 fetches ``base_url`` and records the
    internal links on it, level 2 fetches those links, and so on.

    Args:
        base_url: Page to start from. Only http(s) links on the same host and
            under the same path prefix as this URL count as internal.
        depth: Number of link levels to follow. ``0`` (or a negative value)
            performs no requests and returns an empty list.

    Returns:
        A list of absolute URLs (fragments removed) in discovery order. The
        list may contain ``base_url`` itself if a crawled page links to it.
    """
    site = urlparse(base_url)
    site_host = site.netloc.lower()
    root_path = site.path.rstrip("/")

    found = {}          # dict used as an insertion-ordered set of discovered URLs
    crawled = set()     # pages already requested, so no page is fetched twice
    frontier = [base_url]

    for _ in range(depth):
        next_frontier = []
        for page in frontier:
            if page in crawled:
                continue
            crawled.add(page)

            try:
                response = requests.get(page, timeout=10)
            except requests.RequestException:
                # Best-effort crawl: an unreachable page just contributes no links.
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            for anchor in soup.find_all("a", href=True):
                try:
                    # Resolve relative hrefs against the page they appear on,
                    # not against the crawl's starting URL.
                    target = urlparse(urljoin(page, anchor["href"]))
                except ValueError:
                    continue  # malformed href

                # "#section" variants point at the same document.
                target = target._replace(fragment="")

                if target.scheme not in ("http", "https"):
                    continue
                if target.netloc.lower() != site_host:
                    continue
                if root_path and not (
                    target.path == root_path
                    or target.path.startswith(root_path + "/")
                ):
                    continue

                link = target.geturl()
                if link not in found:
                    found[link] = None
                    next_frontier.append(link)

        if not next_frontier:
            break
        # Only the newly discovered pages are fetched on the next level.
        frontier = next_frontier

    return list(found)
42
 
43
def extract_text_from_url(url):
    """Extract the visible text content of a webpage.

    Args:
        url: Address of the page to download.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<noscript>``
        content removed, one non-empty stripped line per row. If the request
        fails (network error, timeout, or an HTTP 4xx/5xx status) the string
        ``"Failed to fetch <url>: <error>"`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=10)
        # Without this, a 404/500 error page would be scraped as if it were
        # real content.
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Failed to fetch {url}: {e}"

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove elements whose text is never shown to a reader.
    for hidden in soup(["script", "style", "noscript"]):
        hidden.extract()

    text = soup.get_text(separator="\n")
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return "\n".join(lines)
58
+
59
def create_pdf(text, output_path):
    """Write ``text`` to a PDF file using fpdf's built-in Arial font.

    Args:
        text: Content to render; each ``"\\n"``-separated line becomes its
            own wrapped paragraph.
        output_path: Filesystem path of the PDF to create.

    Note:
        The built-in (core) fonts only cover Latin-1. Characters outside that
        range (curly quotes, dashes, emoji, non-Latin scripts) are replaced
        with ``?`` so that scraped web text cannot abort PDF generation with
        an encoding error.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=11)
    for line in text.split("\n"):
        printable = line.encode("latin-1", "replace").decode("latin-1")
        # Start every paragraph at the left margin. Harmless on classic
        # PyFPDF; presumably needed on fpdf2, where multi_cell may leave the
        # cursor at the right edge (TODO confirm against the installed version).
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 8, printable)
    pdf.output(output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
# --- Main flow: crawl, extract text, build the PDF, offer the download ----
if start_btn and not url:
    # Tell the user why nothing happened instead of silently ignoring the click.
    st.warning("Please enter a valid URL first.")
elif start_btn:
    with st.spinner("Scraping website..."):
        # The crawl can rediscover the start page; dict.fromkeys drops
        # duplicates while keeping the start URL first.
        all_links = list(dict.fromkeys([url] + fetch_links(url, depth)))
        st.write(f"🔗 Found {len(all_links)} pages to fetch.")

        sections = []
        for link in all_links:
            st.write(f"📄 Fetching: {link}")
            page_text = extract_text_from_url(link)
            sections.append(f"\n\n--- PAGE: {link} ---\n\n{page_text}")
        all_text = "".join(sections)

        pdf_path = os.path.join(OUT_DIR, "scraped_data.pdf")
        create_pdf(all_text, pdf_path)

    st.success("✅ PDF created successfully!")
    # Read the bytes up front so the widget does not depend on a file handle
    # that is closed as soon as the with-block ends.
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()
    st.download_button("📥 Download PDF", pdf_bytes, file_name="website_data.pdf", mime="application/pdf")