Spaces:
Sleeping
Sleeping
'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
| import os, re, requests, uuid, zipfile, hashlib, shutil | |
| import gradio as gr | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| from transformers import pipeline, AutoTokenizer | |
| import torch | |
def validator(url):
    """Return True when *url* parses with both a scheme and a network location."""
    parts = urlparse(url)
    return all((parts.scheme, parts.netloc))
def finder(url, soup, media_type):
    """Collect content of the requested *media_type* from a parsed page.

    Only 'text' is currently handled: the text of paragraph and heading
    tags is gathered, in tag order, as a list of strings.
    """
    collected = []
    if media_type == "text":
        for tag_name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            collected.extend(node.get_text() for node in soup.find_all(tag_name))
    return collected
def summarize_long_text(text, model_name="facebook/bart-large-cnn", max_chunk_tokens=500):
    """Summarize arbitrarily long text by chunking it to fit the model context.

    Args:
        text: Raw text to summarize.
        model_name: Hugging Face model id used for both tokenizer and pipeline.
        max_chunk_tokens: Number of tokens per chunk fed to the model.

    Returns:
        The chunk summaries joined with spaces, stripped of outer whitespace;
        an empty string when *text* is empty or whitespace-only.
    """
    # Guard: the summarization pipeline errors on empty input.
    if not text.strip():
        return ''
    summarizer = pipeline('summarization', model=model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokens = tokenizer.encode(text)
    # Fixed-size token windows over the whole document.
    chunks = [tokens[i:i + max_chunk_tokens] for i in range(0, len(tokens), max_chunk_tokens)]
    summaries = []
    for chunk in chunks:
        # skip_special_tokens: otherwise markers such as <s>/</s> added by
        # encode() are decoded back into the text handed to the summarizer.
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        summaries.append(
            summarizer(chunk_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
        )
    return ' '.join(summaries).strip()
def scrapper(url):
    """Fetch *url*, persist its visible text to text/content.txt, return a summary.

    Raises:
        requests.exceptions.HTTPError: propagated unchanged so callers can
            map specific status codes (checker() relies on this).
        Exception: for any other network or URL failure.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        # Re-raise as-is: wrapping it in a generic Exception (as before) made
        # checker()'s HTTPError/403 handling unreachable dead code.
        raise
    except (requests.exceptions.RequestException, ValueError) as e:
        raise Exception(f"Unable to access URL: {url}. Error: {str(e)}") from e

    soup = BeautifulSoup(response.content, 'html.parser')

    # Gather the page text and save a copy for inspection.
    text_content = finder(url, soup, 'text')
    os.makedirs('text', exist_ok=True)
    full_text = ' '.join(text_content)
    # Explicit encoding: scraped pages routinely contain non-ASCII characters.
    with open('text/content.txt', 'w', encoding='utf-8') as text_file:
        text_file.write(full_text)

    return summarize_long_text(full_text)
def checker(url):
    """Validate *url*, scrape it, and return the summarized page text.

    Every failure mode is surfaced as a plain Exception carrying a
    user-facing message (shown in the Gradio UI).
    """
    # Guard clauses: reject obviously unusable input before any network work.
    if not url:
        raise Exception("URL cannot be empty.")
    if not url.startswith("https://"):
        raise Exception("The URL must begin with https://")

    try:
        result = scrapper(url)
    except requests.exceptions.HTTPError as err:
        code = err.response.status_code
        message = (
            "HTTP Error: Forbidden. Access to the URL is forbidden."
            if code == 403
            else f"HTTP Error: {code}"
        )
        raise Exception(message)
    except TypeError as err:
        raise Exception(f"TypeError: {str(err)}")
    except (requests.exceptions.RequestException, ValueError) as err:
        raise Exception(f"Unable to access URL: {url}. Error: {str(err)}")

    if not result:
        raise Exception("Found no text.")
    print(f"Returning summarized text from {url} ...")
    return result
# --- Gradio UI: URL input on the left, summary output on the right ---------
with gr.Blocks(theme="dwancin/theme") as app:
    # Page heading and one-line usage blurb.
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get the summarized text from your desired webpages with just a few clicks.''')
    with gr.Row():
        # Left panel: URL entry and submit button.
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        # Right panel: the (non-editable) summary text.
        with gr.Column(scale=2):
            summary_output = gr.Textbox(
                label="Summary",
                elem_id="summary-text",
                # NOTE(review): `size` is not a documented gr.Textbox argument in
                # current gradio releases — confirm against the pinned version.
                size="lg",
                show_label=False,
                # NOTE(review): recent gradio spells this `interactive=False`;
                # `readonly` may raise TypeError — confirm the pinned version.
                readonly=True,
            )
    # Wire the button: checker(url) -> summary textbox (errors surface in UI).
    submit_button.click(
        checker,
        inputs=[url_name],
        outputs=[summary_output],
    )
app.launch()