Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urlparse, urljoin | |
| from markdownify import markdownify as md | |
| import tempfile | |
| import zipfile | |
| import re | |
| from typing import Tuple | |
| import os | |
| import gradio as gr | |
| from collections import deque | |
# ===========================================================
# 🌐 WEBSITE CRAWLER
# ===========================================================
def crawl_site_for_links(start_url: str, max_pages: int = 50, max_depth: int = 2):
    """
    Breadth-first crawl of a website, collecting same-site links.

    Starting from ``start_url``, pages are fetched with a 10 second timeout
    and every ``<a href>`` found in an HTML response is examined. Links that
    point to another host or use a non-HTTP(S) scheme are ignored.

    Args:
        start_url: Absolute URL the crawl starts from.
        max_pages: Upper bound on the number of pages *fetched*. It does not
            bound the number of links returned.
        max_depth: Maximum link distance from ``start_url`` of a page that is
            fetched. Links found on pages at this depth are still collected,
            but not followed.

    Returns:
        A ``(html_links, pdf_links)`` tuple of sets of absolute URLs with
        their fragments removed. ``pdf_links`` holds URLs whose path ends in
        ``.pdf``; ``html_links`` holds every other same-site link.
    """
    html_links = set()
    pdf_links = set()

    start_host = urlparse(start_url).netloc
    # Nothing can be crawled without a fetch budget or a host to stay on.
    if max_pages <= 0 or max_depth < 0 or not start_host:
        return html_links, pdf_links

    allowed_hosts = {start_host}
    visited = set()
    queue = deque([(start_url, 0)])

    with requests.Session() as session:
        session.headers.update({"User-Agent": "Mozilla/5.0"})

        while queue and len(visited) < max_pages:
            current_url, depth = queue.popleft()
            if current_url in visited:
                continue
            visited.add(current_url)

            try:
                response = session.get(current_url, timeout=10)
                # Error pages and non-HTML bodies carry no links worth following.
                if not response.ok:
                    continue
                if "text/html" not in response.headers.get("Content-Type", ""):
                    continue
                soup = BeautifulSoup(response.content, "html.parser")
            except Exception:
                # Best-effort crawl: one unreachable or unparsable page must
                # not abort the whole run.
                continue

            # Relative links must be resolved against the URL that was
            # actually served, which differs from current_url after a redirect.
            base_url = response.url
            if depth == 0:
                # If the start URL redirects (e.g. bare domain -> "www."),
                # the redirect target is still the site the caller asked for.
                allowed_hosts.add(urlparse(base_url).netloc)

            for anchor in soup.find_all("a", href=True):
                href = anchor["href"].strip()
                # Empty and fragment-only hrefs point back at the current page.
                if not href or href.startswith("#"):
                    continue
                try:
                    parsed = urlparse(urljoin(base_url, href))
                except ValueError:
                    # A malformed href should only cost us that one link.
                    continue
                # The scheme test drops javascript:, mailto:, tel:, etc.
                if parsed.scheme not in ("http", "https"):
                    continue
                if parsed.netloc not in allowed_hosts:
                    continue

                # Drop the fragment so "page#a" and "page#b" count as one URL.
                full_url = parsed._replace(fragment="").geturl()

                # Test the path only, so "file.pdf?download=1" is recognised.
                if parsed.path.lower().endswith(".pdf"):
                    pdf_links.add(full_url)
                    continue

                html_links.add(full_url)
                if depth < max_depth and full_url not in visited:
                    queue.append((full_url, depth + 1))

    return html_links, pdf_links
# ===========================================================
# 📦 EXTRACTION ENGINE
# ===========================================================
def extract_all_content_as_zip(url: str, max_links: int, max_depth: int) -> Tuple[str, "str | None"]:
    """
    Crawl a site and pack its pages (as Markdown) and PDFs into a ZIP file.

    Steps:
        1. Crawl the site with ``crawl_site_for_links``.
        2. Fetch every HTML link, strip script/style/nav/footer/header/aside
           elements and convert the ``<main>``, ``<article>`` or ``<body>``
           element to Markdown, saved as ``page_<n>.md``.
        3. Download every PDF link to ``pdfs/document_<n>.pdf``.

    Args:
        url: Site to crawl. ``https://`` is prepended when no scheme is given.
        max_links: Page budget forwarded to the crawler.
        max_depth: Crawl depth forwarded to the crawler.

    Returns:
        ``(message, zip_path)``. ``zip_path`` is the path of a temporary ZIP
        file that the caller is responsible for deleting, or ``None`` when
        nothing could be extracted; ``message`` is a human-readable status.
        Exceptions are never propagated: they are reported in ``message``.
    """
    zip_path = None
    try:
        url = (url or "").strip()
        if not url:
            return "❌ Error: please enter a website URL.", None
        # Compare case-insensitively so "HTTP://..." is not double-prefixed.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        html_links, pdf_links = crawl_site_for_links(url, max_links, max_depth)
        if not html_links and not pdf_links:
            return "❌ No internal pages or PDFs found.", None

        # delete=False: the file must outlive this function so it can be
        # handed to the caller for download.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_zip:
            zip_path = temp_zip.name

        html_ok = 0
        pdf_ok = 0
        skipped = 0

        with requests.Session() as session, zipfile.ZipFile(
            zip_path, "w", zipfile.ZIP_DEFLATED
        ) as zip_file:
            session.headers.update({"User-Agent": "Mozilla/5.0"})

            # ---- HTML → Markdown ----
            # Sorted so file numbering is reproducible between runs.
            for i, link_url in enumerate(sorted(html_links), 1):
                try:
                    resp = session.get(link_url, timeout=10)
                    resp.raise_for_status()
                    if "text/html" not in resp.headers.get("Content-Type", ""):
                        skipped += 1
                        continue

                    soup = BeautifulSoup(resp.content, "html.parser")
                    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
                        tag.decompose()

                    main_content = (
                        soup.find("main")
                        or soup.find("article")
                        or soup.find("body")
                    )
                    if main_content is None:
                        # Converting str(None) would archive the text "None".
                        skipped += 1
                        continue

                    markdown_text = md(str(main_content))
                    title = soup.find("title")
                    if title:
                        markdown_text = f"# {title.text.strip()}\n\n{markdown_text}"

                    zip_file.writestr(f"page_{i}.md", markdown_text)
                    html_ok += 1
                except Exception:
                    # Best-effort: one bad page must not abort the archive.
                    skipped += 1

            # ---- PDFs ----
            for j, pdf_url in enumerate(sorted(pdf_links), 1):
                try:
                    resp = session.get(pdf_url, timeout=20)
                    resp.raise_for_status()
                    # Reject error pages served under a ".pdf" URL.
                    if b"%PDF" not in resp.content[:1024]:
                        skipped += 1
                        continue
                    zip_file.writestr(f"pdfs/document_{j}.pdf", resp.content)
                    pdf_ok += 1
                except Exception:
                    skipped += 1

        summary = [
            "✅ Extraction completed!",
            f"• HTML pages saved as Markdown: {html_ok}",
            f"• PDFs downloaded: {pdf_ok}",
        ]
        if skipped:
            summary.append(f"• Links skipped or failed: {skipped}")
        summary.append("You can now download the ZIP file below.")
        return "\n".join(summary), zip_path
    except Exception as e:
        # Do not leave a half-written archive behind in the temp directory.
        if zip_path and os.path.exists(zip_path):
            try:
                os.remove(zip_path)
            except OSError:
                pass
        return f"❌ Error: {e}", None
# ===========================================================
# 🎨 GRADIO WEB APP (GRADIO 6 SAFE)
# ===========================================================
def run_extraction(url, max_links, depth):
    """Click handler: coerce the slider values to ints and run the extraction.

    Returns whatever ``extract_all_content_as_zip`` returns for the given
    URL, page budget and crawl depth.
    """
    return extract_all_content_as_zip(
        url,
        max_links=int(max_links),
        max_depth=int(depth),
    )
# Declarative layout of the web UI. Markdown text is built from explicit
# string pieces so that its rendering does not depend on source indentation.
with gr.Blocks(title="Website Content Extractor") as app:
    gr.Markdown(
        "# 🌐 Website Content & PDF Extractor\n"
        "Download the **text and PDFs from a website** and package everything into a ZIP file.\n"
    )
    gr.Markdown("---")

    # HOW TO USE SECTION (replaces Box)
    with gr.Group():
        gr.Markdown("## 🧭 How to use")
        # Blank lines between the steps keep Markdown from merging them
        # into a single paragraph.
        gr.Markdown(
            "1️⃣ Enter a website homepage\n\n"
            "2️⃣ Choose how deep to crawl\n\n"
            "3️⃣ Click **Start Extraction**\n\n"
            "4️⃣ Download your ZIP file\n\n"
            "⚠️ Large sites may take several minutes.\n"
        )
    gr.Markdown("---")

    url_input = gr.Textbox(
        label="Website URL",
        placeholder="https://example.com"
    )

    with gr.Row():
        max_links_input = gr.Slider(
            10, 200, value=50, step=10,
            label="Maximum pages to scan",
            info="Higher = more content but slower"
        )
        depth_input = gr.Slider(
            1, 3, value=2, step=1,
            label="Crawl depth",
            info="How many clicks away from homepage"
        )

    run_btn = gr.Button("🚀 Start Extraction", variant="primary")
    status_output = gr.Textbox(label="Status")
    file_output = gr.File(label="Download ZIP")

    # The handler returns (status message, zip path), matching the two outputs.
    run_btn.click(
        fn=run_extraction,
        inputs=[url_input, max_links_input, depth_input],
        outputs=[status_output, file_output]
    )
# ===========================================================
# 🚀 ENTRY POINT
# ===========================================================
if __name__ == "__main__":
    # Start the web server only when the file is run as a script.
    # "0.0.0.0" makes it listen on every network interface, on port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(),
    }
    app.launch(**launch_options)