maaz21 committed on
Commit
9d51b92
·
verified ·
1 Parent(s): 5a0a5cc

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +7 -7
src/streamlit_app.py CHANGED
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
5
  from fpdf import FPDF
6
  import tempfile
7
 
8
- # Use a safe temp directory (writable on Hugging Face)
9
  OUT_DIR = tempfile.gettempdir()
10
 
11
  def scrape_website(url):
@@ -17,22 +17,22 @@ def scrape_website(url):
17
 
18
  # Extract readable text
19
  text = soup.get_text(separator="\n", strip=True)
20
- return text[:15000] # Limit to avoid too-large PDFs
21
  except Exception as e:
22
  return f"Error scraping {url}: {str(e)}"
23
 
24
  def make_pdf(urls):
25
- """Create a single PDF file with content from given URLs."""
26
  pdf = FPDF()
27
  pdf.set_auto_page_break(auto=True, margin=15)
 
 
 
28
 
29
  for url in urls:
30
  st.write(f"Scraping: {url}")
31
  content = scrape_website(url)
32
-
33
- pdf.add_page()
34
- pdf.set_font("Arial", size=12)
35
- pdf.multi_cell(0, 10, f"Website: {url}\n\n{content}\n")
36
 
37
  pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
38
  pdf.output(pdf_path)
 
5
  from fpdf import FPDF
6
  import tempfile
7
 
8
+ # Use safe temporary directory (works on Hugging Face)
9
  OUT_DIR = tempfile.gettempdir()
10
 
11
  def scrape_website(url):
 
17
 
18
  # Extract readable text
19
  text = soup.get_text(separator="\n", strip=True)
20
+ return text[:20000] # Limit content size
21
  except Exception as e:
22
  return f"Error scraping {url}: {str(e)}"
23
 
24
  def make_pdf(urls):
25
+ """Create a PDF file from list of website URLs."""
26
  pdf = FPDF()
27
  pdf.set_auto_page_break(auto=True, margin=15)
28
+ pdf.add_page()
29
+ pdf.add_font("DejaVu", "", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
30
+ pdf.set_font("DejaVu", size=12)
31
 
32
  for url in urls:
33
  st.write(f"Scraping: {url}")
34
  content = scrape_website(url)
35
+ pdf.multi_cell(0, 8, f"Website: {url}\n\n{content}\n\n---\n\n")
 
 
 
36
 
37
  pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
38
  pdf.output(pdf_path)