import gradio as gr
import os
import warnings

from WebScraper import WebsiteScraper
from merge_md import merge_md_to_pdf_and_convert_to_url

warnings.filterwarnings("ignore")
# Quiet TensorFlow: disable oneDNN custom ops (and their startup notice)
# and suppress all but fatal log messages.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Set by a successful scrape; consumed later by the PDF merge handler.
global_output_dir = ""
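

# Thin wrapper that builds a WebsiteScraper from the UI inputs and runs it.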
def scrape_website(url, site_name, site_description="", site_category="General",
                   max_pages=20, max_depth=3, delay=2, scrape_external_links=False):
    scraper = WebsiteScraper(
        base_url=url,
        site_name=site_name,
        site_description=site_description,
        site_category=site_category,
        max_pages=max_pages,
        max_depth=max_depth,
        delay=delay,
        scrape_external_links=scrape_external_links
    )
    return scraper.start()
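

# Build the Gradio UI: scraping controls and results box on top, PDF tools
# and a demo video below.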
with gr.Blocks(title="General Website Scraper", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# General Website Scraper")
    gr.Markdown("Scrape content from any website, save it as Markdown files, and merge them into a PDF with an inline viewer and a downloadable link.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="e.g., https://example.com or https://blog.example.com",
            info="Enter the starting URL to scrape"
        )
        site_name_input = gr.Textbox(
            label="Site Name",
            placeholder="e.g., Example Blog",
            info="A descriptive name for the website"
        )

    with gr.Row():
        site_description_input = gr.Textbox(
            label="Site Description (Optional)",
            placeholder="e.g., A technology blog about AI and programming",
            info="Brief description of the website content"
        )
        site_category_input = gr.Dropdown(
            label="Site Category",
            choices=[
                "General", "Blog", "News", "E-commerce", "Portfolio",
                "Company", "Documentation", "Forum", "Social Media",
                "Education", "Technology", "Entertainment", "Health",
                "Finance", "Travel", "Food", "Sports", "Art", "Other"
            ],
            value="General",
            info="Select the most appropriate category"
        )

    with gr.Row():
        max_pages_input = gr.Number(
            label="Max Pages", value=20, precision=0, minimum=1, maximum=1000,
            info="Maximum number of pages to scrape"
        )
        max_depth_input = gr.Number(
            label="Max Depth", value=3, precision=0, minimum=1, maximum=10,
            info="How many clicks deep to follow links"
        )
        delay_input = gr.Number(
            label="Delay (seconds)", value=2, precision=1, minimum=0.5, maximum=10,
            info="Delay between requests to avoid overwhelming the server"
        )

    with gr.Row():
        external_links_input = gr.Checkbox(
            label="Include External Links", value=False,
            info="Scrape links that go outside the original domain (use with caution)"
        )

    scrape_btn = gr.Button("Start Scraping", variant="primary", size="lg")

    with gr.Row():
        output = gr.Textbox(
            label="Scraping Results",
            lines=10,
            max_lines=20,
            info="Real-time scraping progress and results will appear here"
        )

    gr.Markdown("## PDF Generation & Viewer")
    with gr.Row():
        merge_pdf_btn = gr.Button("Merge to PDF and Get Link", variant="secondary", size="lg")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_output = gr.Textbox(
                label="PDF Merge Results",
                lines=5,
                max_lines=10,
                info="Results of merging Markdown files to PDF"
            )
            pdf_download = gr.File(
                label="Download Merged PDF (Local File)",
                file_types=[".pdf"],
                visible=False
            )
            pdf_url_output = gr.HTML(
                label="PDF Download Link",
                visible=False
            )
        with gr.Column(scale=2):
            pdf_viewer = gr.File(
                label="PDF Viewer - View Merged Content",
                file_types=[".pdf"],
                visible=False,
                interactive=False
            )

    gr.Markdown("## Related Video Demo")
    youtube_embed = gr.HTML(
        value="""
        <div style='text-align: center;'>
            <iframe width='560' height='315' src='https://www.youtube.com/embed/Wf2CqjQgOcI'
                frameborder='0' allow='accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture'
                allowfullscreen></iframe>
        </div>
        """,
        label="Tutorial Video",
        visible=True
    )
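
    # ---- Event handlers ----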
    def process_scrape(url, site_name, site_description, site_category,
                       max_pages, max_depth, delay, external_links):
        """Validate the inputs, run the scraper, and report the outcome.

        Args:
            url: Starting URL to scrape; "https://" is prepended if no scheme is given.
            site_name: Descriptive name for the website (required).
            site_description: Optional description of the site's content.
            site_category: Category label for the site.
            max_pages: Maximum number of pages to scrape.
            max_depth: Maximum link depth to follow from the starting URL.
            delay: Delay in seconds between requests, to avoid overloading the server.
            external_links: Whether to also follow links outside the original domain.

        Returns:
            A 4-tuple matching the click handler's outputs: a status message for
            the results box (success details, a failure reason, or an exception
            message), plus None for the three PDF components, since scraping
            alone does not produce a PDF.
        """
        global global_output_dir
        if not url or not site_name:
            return "Please provide both URL and Site Name", None, None, None
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        try:
            result = scrape_website(
                url=url,
                site_name=site_name,
                site_description=site_description,
                site_category=site_category,
                max_pages=int(max_pages),
                max_depth=int(max_depth),
                delay=float(delay),
                scrape_external_links=external_links
            )
            if result["success"]:
                global_output_dir = result['output_dir']
                return (
                    f"Successfully scraped {result['pages_scraped']} pages!\n"
                    f"Duration: {result['duration']}\n"
                    f"Files saved to: {result['output_dir']}\n\n"
                    f"Files created:\n"
                    f"  • Individual page files (.md)\n"
                    f"  • scraping_summary.md\n"
                    f"  • scraping_log.txt\n\n"
                    f"Ready to merge into PDF - click 'Merge to PDF' button below."
                ), None, None, None
            else:
                return f"Scraping failed: {result['error']}", None, None, None
        except Exception as e:
            return f"Error: {str(e)}", None, None, None
    def process_merge_to_pdf():
        """Merge the scraped Markdown files into a single PDF.

        Returns:
            A 4-tuple matching the click handler's outputs: a status message,
            plus updates for the download file, the download-link HTML, and
            the PDF viewer (revealed on success, hidden on failure).
        """
        global global_output_dir
        if not global_output_dir:
            return ("No scraping output directory found. Please scrape a website first.",
                    gr.update(visible=False), gr.update(visible=False), gr.update(visible=False))
        try:
            result = merge_md_to_pdf_and_convert_to_url(
                output_dir=global_output_dir,
                site_name="Scraped Website",
                site_description="Scraped content from website",
                site_category="Technology",
                output_format="pdf"
            )
            if result["success"]:
                pdf_url = result["output_url"]
                local_pdf_path = result["converted_path"]
                message = (
                    f"{result['message']}\n\n"
                    f"PDF created successfully!\n"
                    f"Local file: {local_pdf_path}\n"
                    f"Download URL: {pdf_url}\n\n"
                    f"View the PDF in the viewer on the right."
                )
                download_html = f'''
                <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
                    <h4>Download Options:</h4>
                    <p><a href="{pdf_url}" target="_blank" style="color: #1f77b4; text-decoration: none; font-weight: bold;">
                        Click here to download PDF from web link
                    </a></p>
                    <p><small>The PDF is also available in the viewer on the right and as a downloadable file above.</small></p>
                </div>
                '''
                # Use gr.update(visible=True) so the initially hidden download and
                # link components actually appear alongside the viewer.
                return (
                    message,
                    gr.update(value=local_pdf_path, visible=True),
                    gr.update(value=download_html, visible=True),
                    gr.update(value=local_pdf_path, visible=True)
                )
            else:
                return (
                    f"PDF merge failed: {result['error']}",
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False)
                )
        except Exception as e:
            return (
                f"Error during PDF merge: {str(e)}",
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False)
            )
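
    # Wire the buttons to the handlers; each outputs list must match the
    # corresponding handler's return tuple, in order.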
    scrape_btn.click(
        process_scrape,
        inputs=[
            url_input, site_name_input, site_description_input, site_category_input,
            max_pages_input, max_depth_input, delay_input, external_links_input
        ],
        outputs=[output, pdf_download, pdf_url_output, pdf_viewer]
    )
    merge_pdf_btn.click(
        process_merge_to_pdf,
        inputs=[],
        outputs=[pdf_output, pdf_download, pdf_url_output, pdf_viewer]
    )
    with gr.Accordion("Example Usage & Tips", open=False):
        gr.Markdown("""
        ### Common Use Cases:
        - News Websites: `https://techcrunch.com` - scrape latest tech news articles
        - Blogs: `https://blog.openai.com` - scrape all blog posts and updates
        - Company Sites: `https://company.com/products` - scrape product pages and documentation
        - Personal Portfolios: `https://designer.com` - scrape project galleries and case studies
        - Forums/Communities: `https://stackoverflow.com/questions/tagged/python` - scrape Q&A content
        - E-commerce: `https://shop.com/category` - scrape product listings and descriptions

        ### Tips for Better Results:
        - Start with specific sections: Instead of `https://wikipedia.org`, try `https://en.wikipedia.org/wiki/Category:Artificial_intelligence`
        - Use reasonable limits: Start with 10-20 pages to test, then increase if needed
        - Respect rate limits: Use 2-3 second delays for most sites
        - External links: Only enable for trusted sites to avoid scraping the entire internet
        - Check robots.txt: Make sure you're allowed to scrape the site (`site.com/robots.txt`)

        ### Output Files Explained:
        - Individual .md files: Each scraped page saved as markdown
        - scraping_summary.md: Overview of all scraped content with links
        - scraping_log.txt: Detailed log of the scraping process
        - Merged PDF: Combined content of all Markdown files, viewable in the interface and downloadable

        ### PDF Features:
        - Inline Viewer: View the merged PDF directly in the interface
        - Download Options: Download via direct file or web link
        - Multiple Formats: Local file and web-hosted version available
        """)

    gr.Markdown("""
    ---
    **Important Notes:**
    - Always respect website terms of service and robots.txt
    - Use reasonable delays to avoid overwhelming servers
    - Some sites may block automated scraping
    - Consider the website's bandwidth and server load
    - The merged PDF is uploaded to a public link for easy sharing
    - The PDF viewer works best with modern browsers that support PDF display
    """)
if __name__ == "__main__":
    demo.launch(mcp_server=True, share=True, server_port=7860)
# Example MCP prompt: scrape https://google.github.io/adk-docs/get-started/installation/
# with max_pages=2 via process_scrape, then merge the result into a PDF with
# process_merge_to_pdf.
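
# A minimal sketch of that flow driven programmatically instead of through the UI
# (hypothetical argument values; both handlers live at module scope, so this works
# without launching the app — uncomment to try):
#
#   msg, *_ = process_scrape(
#       url="https://google.github.io/adk-docs/get-started/installation/",
#       site_name="ADK Docs", site_description="Agent Development Kit docs",
#       site_category="Documentation", max_pages=2, max_depth=3, delay=2,
#       external_links=False,
#   )
#   pdf_msg, *_ = process_merge_to_pdf()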