# NOTE: captured from a Hugging Face Spaces page; the hosted Space reported
# "Runtime error" at scrape time. The module source follows.
| # smart_web_analyzer.py | |
| """ | |
| Smart Web Analyzer Plus - Core Functionality | |
| Features: | |
| - Web content fetching with custom User-Agent (to avoid 403 errors) | |
| - Basic HTML cleaning (no removal of script/style) | |
| - Summarization using "facebook/bart-large-cnn" | |
| - Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment" | |
| - Topic detection via zero-shot classification ("facebook/bart-large-mnli") | |
| - Preview text for display | |
| """ | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from transformers import pipeline | |
def _load_pipeline(task, model_name, error_prefix):
    """Instantiate a transformers pipeline; on failure, print and return None."""
    try:
        return pipeline(task, model=model_name)
    except Exception as exc:
        # Degrade gracefully: downstream functions check for None and
        # report the model as unavailable instead of crashing at import.
        print(error_prefix, exc)
        return None


# 1) Summarization Pipeline
summarizer = _load_pipeline(
    "summarization", "facebook/bart-large-cnn",
    "Error loading summarization model:",
)
# 2) Sentiment Analysis Pipeline
sentiment_analyzer = _load_pipeline(
    "sentiment-analysis", "nlptown/bert-base-multilingual-uncased-sentiment",
    "Error loading sentiment analysis model:",
)
# 3) Zero-Shot Topic Detection Pipeline
zero_shot_classifier = _load_pipeline(
    "zero-shot-classification", "facebook/bart-large-mnli",
    "Error loading topic detection model:",
)
def fetch_web_content(url):
    """
    Fetch the HTML content of a given URL, using a spoofed User-Agent.

    Parameters:
        url (str): The URL to fetch; must use the http:// or https:// scheme.

    Returns:
        str: HTML content if successful.

    Raises:
        ValueError: if the URL is invalid.
        Exception: if the request fails (network error, 4xx/5xx, etc.).
    """
    # Validate the scheme up front so callers get a clear error for bad input.
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")

    # Spoof a common browser User-Agent to reduce 403 errors from sites
    # that block default library user agents.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
        return response.text
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause stays in tracebacks.
        raise Exception(f"Error fetching the URL: {e}") from e
def clean_text(html_content):
    """
    Extract raw text from HTML content (keeps <script> and <style> text).

    Parameters:
        html_content (str): The raw HTML content.

    Returns:
        str: Cleaned text extracted from the HTML.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    # Deliberately NOT decomposing <script>/<style> tags here — their text
    # is kept in the output by design.
    raw = parsed.get_text(separator=" ")
    # Collapse all runs of whitespace down to single spaces.
    return " ".join(raw.split())
def summarize_text(text, max_length=130, min_length=30):
    """
    Summarize text with the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Maximum length for the summary.
        min_length (int): Minimum length for the summary.

    Returns:
        str: The summarized text or an error message.
    """
    # Model may have failed to load at import time.
    if not summarizer:
        return "Summarization model is not available."
    try:
        outputs = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,  # deterministic output
        )
        return outputs[0]["summary_text"]
    except Exception as err:
        return f"Error during summarization: {err}"
def analyze_sentiment(text):
    """
    Analyze sentiment with nlptown/bert-base-multilingual-uncased-sentiment.

    Parameters:
        text (str): Text for sentiment analysis.

    Returns:
        str: A label describing sentiment (e.g., '4 stars') or an error message.
    """
    # Model may have failed to load at import time.
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."
    try:
        # The pipeline returns a list of {label, score} dicts; only the
        # first entry's label is surfaced to the caller.
        return sentiment_analyzer(text)[0]["label"]
    except Exception as err:
        return f"Error during sentiment analysis: {err}"
def detect_topic(text, candidate_labels=None):
    """
    Detect topics in text using zero-shot classification via facebook/bart-large-mnli.

    Parameters:
        text (str): The text to analyze.
        candidate_labels (list[str] | None): Labels to score the text against.
            Defaults to a general-purpose news-style topic set, preserving the
            original behavior for existing callers.

    Returns:
        dict or str: Dictionary of topics & confidence scores OR an error dict.
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}
    if candidate_labels is None:
        # Default topic set (previously hard-coded); now overridable per call.
        candidate_labels = [
            "Politics", "Technology", "Business", "Entertainment",
            "Science", "Health", "Sports", "Education",
        ]
    try:
        result = zero_shot_classifier(text, candidate_labels)
        # result['labels'] is sorted by descending confidence and is parallel
        # to result['scores']; zip them into a label -> score mapping.
        return dict(zip(result["labels"], result["scores"]))
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}
def preview_clean_text(text, max_chars=500):
    """
    Return a preview slice of the cleaned text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters in the preview.

    Returns:
        str: The text unchanged if it fits, otherwise the first max_chars
        characters followed by an ellipsis.
    """
    return text if len(text) <= max_chars else text[:max_chars] + "..."
| # End of smart_web_analyzer.py | |