Spaces:

maaz21
/

aiwebscrapper

Sleeping

maaz21 committed on Oct 16, 2025

Commit

9d51b92

verified ·

1 Parent(s): 5a0a5cc

Update src/streamlit_app.py

Files changed (1) hide show

src/streamlit_app.py CHANGED Viewed

@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
 from fpdf import FPDF
 import tempfile
-# Use a safe temp directory (writable on Hugging Face)
 OUT_DIR = tempfile.gettempdir()
 def scrape_website(url):
@@ -17,22 +17,22 @@ def scrape_website(url):
         # Extract readable text
         text = soup.get_text(separator="\n", strip=True)
-        return text[:15000]  # Limit to avoid too-large PDFs
     except Exception as e:
         return f"Error scraping {url}: {str(e)}"
 def make_pdf(urls):
-    """Create a single PDF file with content from given URLs."""
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     for url in urls:
         st.write(f"Scraping: {url}")
         content = scrape_website(url)
-        pdf.add_page()
-        pdf.set_font("Arial", size=12)
-        pdf.multi_cell(0, 10, f"Website: {url}\n\n{content}\n")
     pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
     pdf.output(pdf_path)

 from fpdf import FPDF
 import tempfile
+# Use safe temporary directory (works on Hugging Face)
 OUT_DIR = tempfile.gettempdir()
 def scrape_website(url):
         # Extract readable text
         text = soup.get_text(separator="\n", strip=True)
+        return text[:20000]  # Limit content size
     except Exception as e:
         return f"Error scraping {url}: {str(e)}"
 def make_pdf(urls):
+    """Create a PDF file from list of website URLs."""
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
+    pdf.add_page()
+    pdf.add_font("DejaVu", "", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
+    pdf.set_font("DejaVu", size=12)
     for url in urls:
         st.write(f"Scraping: {url}")
         content = scrape_website(url)
+        pdf.multi_cell(0, 8, f"Website: {url}\n\n{content}\n\n---\n\n")
     pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
     pdf.output(pdf_path)