Spaces:
Sleeping
Update src/streamlit_app.py
Browse files
src/streamlit_app.py (+7 −7)
src/streamlit_app.py
CHANGED
|
@@ -5,7 +5,7 @@ from bs4 import BeautifulSoup
|
|
| 5 |
from fpdf import FPDF
|
| 6 |
import tempfile
|
| 7 |
|
| 8 |
-
# Use
|
| 9 |
OUT_DIR = tempfile.gettempdir()
|
| 10 |
|
| 11 |
def scrape_website(url):
|
|
@@ -17,22 +17,22 @@ def scrape_website(url):
|
|
| 17 |
|
| 18 |
# Extract readable text
|
| 19 |
text = soup.get_text(separator="\n", strip=True)
|
| 20 |
-
return text[:
|
| 21 |
except Exception as e:
|
| 22 |
return f"Error scraping {url}: {str(e)}"
|
| 23 |
|
| 24 |
def make_pdf(urls):
|
| 25 |
-
"""Create a
|
| 26 |
pdf = FPDF()
|
| 27 |
pdf.set_auto_page_break(auto=True, margin=15)
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
for url in urls:
|
| 30 |
st.write(f"Scraping: {url}")
|
| 31 |
content = scrape_website(url)
|
| 32 |
-
|
| 33 |
-
pdf.add_page()
|
| 34 |
-
pdf.set_font("Arial", size=12)
|
| 35 |
-
pdf.multi_cell(0, 10, f"Website: {url}\n\n{content}\n")
|
| 36 |
|
| 37 |
pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
|
| 38 |
pdf.output(pdf_path)
|
|
|
|
| 5 |
from fpdf import FPDF
|
| 6 |
import tempfile
|
| 7 |
|
| 8 |
+
# Use safe temporary directory (works on Hugging Face)
|
| 9 |
OUT_DIR = tempfile.gettempdir()
|
| 10 |
|
| 11 |
def scrape_website(url):
|
|
|
|
| 17 |
|
| 18 |
# Extract readable text
|
| 19 |
text = soup.get_text(separator="\n", strip=True)
|
| 20 |
+
return text[:20000] # Limit content size
|
| 21 |
except Exception as e:
|
| 22 |
return f"Error scraping {url}: {str(e)}"
|
| 23 |
|
| 24 |
def make_pdf(urls):
|
| 25 |
+
"""Create a PDF file from list of website URLs."""
|
| 26 |
pdf = FPDF()
|
| 27 |
pdf.set_auto_page_break(auto=True, margin=15)
|
| 28 |
+
pdf.add_page()
|
| 29 |
+
pdf.add_font("DejaVu", "", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
|
| 30 |
+
pdf.set_font("DejaVu", size=12)
|
| 31 |
|
| 32 |
for url in urls:
|
| 33 |
st.write(f"Scraping: {url}")
|
| 34 |
content = scrape_website(url)
|
| 35 |
+
pdf.multi_cell(0, 8, f"Website: {url}\n\n{content}\n\n---\n\n")
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
pdf_path = os.path.join(OUT_DIR, "website_data.pdf")
|
| 38 |
pdf.output(pdf_path)
|