# Hugging Face Spaces status banner (page-scrape residue): "Spaces: Paused"
import math
import os
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from docx import Document  # Word (.docx) file generation
# Function to extract all links from a website
def extract_links(url):
    """Fetch *url* and collect every hyperlink on the page.

    Returns a ``(message, links)`` tuple: a human-readable status string
    for the UI and a list of absolute URLs (empty on any failure).
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,  # don't hang the UI on an unresponsive host
        )
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")

        links = set()  # set de-duplicates as we collect
        for a_tag in soup.find_all("a", href=True):
            # urljoin correctly resolves "/path", "../path", "page.html",
            # fragments, and leaves absolute URLs untouched -- unlike the
            # old string concatenation against the base domain.
            href = urljoin(url, a_tag["href"])
            # Keep only web pages; drop mailto:, javascript:, tel:, etc.
            if href.startswith(("http://", "https://")):
                links.add(href)

        links = sorted(links)  # deterministic order for the checkbox list
        if not links:
            return "No links found on the website.", []
        return f"β {len(links)} links found! Select which ones to convert into Word files:", links
    except Exception as e:
        # Surface network/parse failures to the UI instead of crashing.
        return f"Error: {str(e)}", []
# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    """Strip page chrome from *soup* in place and return the visible text.

    Removes navigation, headers/footers, sidebars, scripts and styles,
    then flattens what remains to newline-separated plain text.
    """
    for tag_name in ("header", "footer", "nav", "aside", "script", "style"):
        for element in soup.find_all(tag_name):
            element.decompose()  # delete the element and its entire subtree
    return soup.get_text(separator="\n", strip=True)
# Function to scrape selected links and generate Word files
def scrape_and_generate_word(selected_links):
    """Download each selected link and write the cleaned page text into
    Word documents, 4 links per file.

    Returns a list of generated ``.docx`` paths, or ``None`` when there is
    nothing to generate.  (``None`` is a valid ``gr.File`` value, whereas
    the previous ``("No links selected.", None)`` tuple was not.)
    """
    try:
        if not selected_links:
            return None

        word_files = []
        batch_size = 4  # each Word file contains up to 4 links
        for start in range(0, len(selected_links), batch_size):
            doc = Document()
            for link in selected_links[start:start + batch_size]:
                try:
                    response = requests.get(
                        link,
                        headers={"User-Agent": "Mozilla/5.0"},
                        timeout=15,  # keep one slow link from stalling the batch
                    )
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        # Add a title for each link, then its cleaned text.
                        doc.add_heading(f"Content from: {link}", level=1)
                        doc.add_paragraph(clean_content(soup))
                        doc.add_page_break()  # keep each page's text separate
                    else:
                        # Record the failure instead of silently dropping the
                        # link (the old code skipped non-200 pages entirely).
                        doc.add_paragraph(f"Failed to fetch content from {link}\n\n")
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are no longer swallowed; per-link failures
                    # stay best-effort and are noted in the document.
                    doc.add_paragraph(f"Failed to fetch content from {link}\n\n")

            word_filename = f"output_{(start // batch_size) + 1}.docx"
            doc.save(word_filename)
            word_files.append(word_filename)

        return word_files  # list of generated Word file paths
    except Exception:
        # e.g. doc.save() failing; nothing useful to hand to gr.File.
        return None
# Gradio UI with link selection
def show_links_and_generate_word(url):
    """Extract links from *url* and refresh the checkbox group.

    Returns the status message plus a ``gr.update`` that replaces the
    checkbox choices and clears any previous selection.
    """
    message, links = extract_links(url)
    # extract_links always yields a list ([] on failure), so one return
    # covers both the "found links" and "no links" cases.
    return message, gr.update(choices=links if links else [], value=[])
# --- Gradio application layout and wiring ---------------------------------
iface = gr.Blocks()
with iface:
    gr.Markdown("### π Web Scraper & Word Document Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into Word files (4 links per file).")

    # Input widgets.
    url_box = gr.Textbox(label="Enter Website URL")
    extract_button = gr.Button("Extract Links")

    # Output / selection widgets.
    status_text = gr.Markdown("")
    links_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_button = gr.Button("Generate Word Files")
    files_output = gr.File(label="Download Generated Word Files")

    # Wiring: "Extract Links" fills the checkbox list; "Generate Word
    # Files" scrapes the checked links and offers the .docx downloads.
    extract_button.click(show_links_and_generate_word, inputs=url_box, outputs=[status_text, links_selector])
    generate_button.click(scrape_and_generate_word, inputs=links_selector, outputs=files_output)

iface.launch()