Spaces:
Runtime error
Runtime error
| from transformers import pipeline | |
| from bs4 import BeautifulSoup | |
| import requests | |
def fetch_webpage_content(url):
    """Fetch the raw HTML of a webpage.

    Args:
        url: The URL to request.

    Returns:
        The response body as text, or None if the request failed.
    """
    try:
        # 10-second timeout guards against hanging on unresponsive hosts.
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise for 4xx/5xx status codes.
        return response.text
    except requests.exceptions.RequestException as e:
        # Best-effort: report the problem and signal failure with None.
        print(f"Error fetching the webpage: {e}")
        return None
def parse_and_segment_content(html_content, max_chunk=500):
    """Parse HTML and segment its visible text into word-limited chunks.

    Extracts the text of all <h1> and <p> tags, splits it into rough
    sentences on '.', '!' and '?', then greedily packs sentences into
    chunks of at most ``max_chunk`` words (a single sentence longer than
    the limit still becomes its own chunk).

    Args:
        html_content: Raw HTML string, or a falsy value.
        max_chunk: Maximum number of words per chunk (default 500).

    Returns:
        A list of chunk strings; an empty list when there is no content.
    """
    if not html_content:
        return []
    soup = BeautifulSoup(html_content, 'html.parser')
    results = soup.find_all(['h1', 'p'])
    text = ' '.join(result.text for result in results)
    # Crude sentence segmentation: tag sentence-final punctuation with a
    # marker, then split on it. (Abbreviations/decimals will over-split.)
    text = text.replace('.', '.<eos>').replace('!', '!<eos>').replace('?', '?<eos>')
    sentences = text.split('<eos>')
    print("Doing segmentation")
    chunks = []  # each chunk is a list of words while being built
    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue  # skip empty / whitespace-only fragments
        # Open a new chunk when none exists yet, or when adding this
        # sentence would exceed the word budget of the current one.
        if not chunks or len(chunks[-1]) + len(words) > max_chunk:
            chunks.append([])
        chunks[-1].extend(words)
    return [' '.join(chunk) for chunk in chunks]
def summarize_text(chunks, *, max_length=50, min_length=30):
    """Summarize each text chunk and join the summaries into one string.

    Args:
        chunks: List of text chunks to summarize.
        max_length: Upper bound (in tokens) for each chunk's summary.
        min_length: Lower bound (in tokens) for each chunk's summary.

    Returns:
        The concatenated chunk summaries, or a fallback message when
        there is nothing to summarize.
    """
    if not chunks:
        return "No content to summarize."
    # Build the summarization pipeline once, outside the per-chunk loop.
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    summaries = []
    print("Summarizing content")
    for chunk in chunks:
        try:
            summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Best-effort: record a placeholder so the output stays
            # aligned with the input chunks instead of aborting the run.
            print(f"Error in summarization: {e}")
            summaries.append("Error summarizing text.")
    return ' '.join(summaries)
| # Example usage | |
| # url = "https://example.com" | |
| # html_content = fetch_webpage_content(url) | |
| # if html_content: | |
| # chunks = parse_and_segment_content(html_content) | |
| # summary = summarize_text(chunks) | |
| # print(summary) | |
| # else: | |
| # print("Failed to fetch or parse webpage content.") | |