# Hugging Face Spaces status banner (page-scrape residue): "Spaces: Paused"
import math
import os
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from docx import Document  # Word (.docx) file generation
# Function to extract all links from a website
def extract_links(url):
    """Fetch *url* and collect every hyperlink on the page.

    Returns a ``(message, links)`` tuple: a human-readable status string
    for the UI and a list of absolute URLs (empty on any failure).
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,  # don't hang the UI on an unresponsive host
        )
        if response.status_code != 200:
            return f"Error: Unable to fetch page (Status Code {response.status_code})", []

        soup = BeautifulSoup(response.text, "html.parser")

        links = set()  # set de-duplicates as we collect
        for a_tag in soup.find_all("a", href=True):
            # urljoin correctly resolves "/path", "../path", "page.html",
            # fragments, and leaves absolute URLs untouched -- unlike the
            # old string concatenation against the base domain.
            href = urljoin(url, a_tag["href"])
            # Keep only web pages; drop mailto:, javascript:, tel:, etc.
            if href.startswith(("http://", "https://")):
                links.add(href)

        links = sorted(links)  # deterministic order for the checkbox list
        if not links:
            return "No links found on the website.", []
        return f"β {len(links)} links found! Select which ones to convert into Word files:", links
    except Exception as e:
        # Surface network/parse failures to the UI instead of crashing.
        return f"Error: {str(e)}", []
# Function to clean unwanted content (like headers, footers, etc.)
def clean_content(soup):
    """Strip page chrome from *soup* in place and return the visible text.

    Removes navigation, headers/footers, sidebars, scripts and styles,
    then flattens what remains to newline-separated plain text.
    """
    for tag_name in ("header", "footer", "nav", "aside", "script", "style"):
        for element in soup.find_all(tag_name):
            element.decompose()  # delete the element and its entire subtree
    return soup.get_text(separator="\n", strip=True)
# Function to scrape selected links and generate Word files
def scrape_and_generate_word(selected_links):
    """Download each selected link and write the cleaned page text into
    Word documents, 4 links per file.

    Returns a list of generated ``.docx`` paths, or ``None`` when there is
    nothing to generate.  (``None`` is a valid ``gr.File`` value, whereas
    the previous ``("No links selected.", None)`` tuple was not.)
    """
    try:
        if not selected_links:
            return None

        word_files = []
        batch_size = 4  # each Word file contains up to 4 links
        for start in range(0, len(selected_links), batch_size):
            doc = Document()
            for link in selected_links[start:start + batch_size]:
                try:
                    response = requests.get(
                        link,
                        headers={"User-Agent": "Mozilla/5.0"},
                        timeout=15,  # keep one slow link from stalling the batch
                    )
                    if response.status_code == 200:
                        soup = BeautifulSoup(response.text, "html.parser")
                        # Add a title for each link, then its cleaned text.
                        doc.add_heading(f"Content from: {link}", level=1)
                        doc.add_paragraph(clean_content(soup))
                        doc.add_page_break()  # keep each page's text separate
                    else:
                        # Record the failure instead of silently dropping the
                        # link (the old code skipped non-200 pages entirely).
                        doc.add_paragraph(f"Failed to fetch content from {link}\n\n")
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are no longer swallowed; per-link failures
                    # stay best-effort and are noted in the document.
                    doc.add_paragraph(f"Failed to fetch content from {link}\n\n")

            word_filename = f"output_{(start // batch_size) + 1}.docx"
            doc.save(word_filename)
            word_files.append(word_filename)

        return word_files  # list of generated Word file paths
    except Exception:
        # e.g. doc.save() failing; nothing useful to hand to gr.File.
        return None
# Gradio UI with link selection
def show_links_and_generate_word(url):
    """Extract links from *url* and refresh the checkbox group.

    Returns the status message plus a ``gr.update`` that replaces the
    checkbox choices and clears any previous selection.
    """
    message, links = extract_links(url)
    # extract_links always yields a list ([] on failure), so one return
    # covers both the "found links" and "no links" cases.
    return message, gr.update(choices=links if links else [], value=[])
# --- Gradio application layout and wiring ---------------------------------
iface = gr.Blocks()
with iface:
    gr.Markdown("### π Web Scraper & Word Document Generator")
    gr.Markdown("Enter a website URL to extract internal links, then select which links to convert into Word files (4 links per file).")

    # Input widgets.
    url_box = gr.Textbox(label="Enter Website URL")
    extract_button = gr.Button("Extract Links")

    # Output / selection widgets.
    status_text = gr.Markdown("")
    links_selector = gr.CheckboxGroup([], label="Select Links to Convert", interactive=True)
    generate_button = gr.Button("Generate Word Files")
    files_output = gr.File(label="Download Generated Word Files")

    # Wiring: "Extract Links" fills the checkbox list; "Generate Word
    # Files" scrapes the checked links and offers the .docx downloads.
    extract_button.click(show_links_and_generate_word, inputs=url_box, outputs=[status_text, links_selector])
    generate_button.click(scrape_and_generate_word, inputs=links_selector, outputs=files_output)

iface.launch()