Spaces:

MHamdan
/

SmartWebAnalyzerPlus

Sleeping

App Files Files Community

SmartWebAnalyzerPlus / smart_web_analyzer.py

MHamdan

Initial commit with full functionality

37ff7dd 11 months ago

raw

history blame

2.27 kB

	# smart_web_analyzer.py
	import requests
	from bs4 import BeautifulSoup
	from transformers import pipeline
	import torch

	class WebAnalyzer:
	    """Fetch a web page and run optional NLP analyses on its text.

	    Three Hugging Face pipelines (summarization, sentiment classification
	    and zero-shot topic classification) are created eagerly in ``__init__``
	    and stored in ``self.models`` under the keys ``'summarize'``,
	    ``'sentiment'`` and ``'topics'``.
	    """

	    # Candidate labels offered to the zero-shot topic classifier.
	    TOPIC_LABELS = ("Technology", "AI", "Business", "Science", "Politics")

	    # Character budget for the text handed to the two classifiers.
	    MAX_CLASSIFIER_CHARS = 512

	    # Upper bound on the generated summary length (passed as ``max_length``).
	    SUMMARY_MAX_LENGTH = 150

	    # Mode names recognised by ``analyze``.
	    ANALYSIS_MODES = ('summarize', 'sentiment', 'topics')

	    def __init__(self):
	        """Create the three pipelines on the GPU when CUDA is available."""
	        # transformers convention: 0 selects the first CUDA device, -1 the CPU.
	        self.device = 0 if torch.cuda.is_available() else -1
	        # The device must be passed explicitly; otherwise the value computed
	        # above is never used when the pipelines are built.
	        self.models = {
	            'summarize': pipeline(
	                "summarization",
	                model="facebook/bart-large-cnn",
	                device=self.device,
	            ),
	            'sentiment': pipeline(
	                "text-classification",
	                model="nlptown/bert-base-multilingual-uncased-sentiment",
	                device=self.device,
	            ),
	            'topics': pipeline(
	                "zero-shot-classification",
	                model="facebook/bart-large-mnli",
	                device=self.device,
	            ),
	        }

	    def fetch_content(self, url: str) -> str:
	        """Download ``url`` and return the response body as text.

	        A browser-like User-Agent header is sent and the request times out
	        after 15 seconds. ``raise_for_status()`` is called on the response,
	        so HTTP error statuses raise instead of returning an error page.
	        """
	        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
	        response = requests.get(url, headers=headers, timeout=15)
	        response.raise_for_status()
	        return response.text

	    def clean_html(self, html: str) -> str:
	        """Return ``html`` re-serialised by BeautifulSoup's ``prettify()``.

	        All tags are preserved; the markup is only parsed and pretty-printed.
	        """
	        soup = BeautifulSoup(html, 'html.parser')
	        return soup.prettify()

	    def _extract_text(self, html: str) -> str:
	        """Return the text content of ``html`` with whitespace collapsed."""
	        soup = BeautifulSoup(html, 'html.parser')
	        # Script/style bodies are code, not prose; drop them before
	        # collecting text so they do not reach the language models.
	        for tag in soup.find_all(['script', 'style', 'noscript', 'template']):
	            tag.decompose()
	        return ' '.join(soup.get_text(separator=' ').split())

	    def analyze(self, url: str, modes: list) -> dict:
	        """Fetch ``url`` and run the analyses named in ``modes``.

	        Args:
	            url: Address of the page to analyse.
	            modes: Any combination of ``'summarize'``, ``'sentiment'`` and
	                ``'topics'``; other entries are ignored.

	        Returns:
	            A dict that may contain:
	            ``'clean_text'`` - the prettified HTML from ``clean_html``;
	            ``'summary'`` - the generated summary text;
	            ``'sentiment'`` - ``"<label> (<score>)"`` with a two-decimal score;
	            ``'topics'`` - a mapping of topic label to score;
	            ``'error'`` - the message of any exception raised on the way.
	            Exceptions are never propagated; values computed before a
	            failure are kept alongside ``'error'``.
	        """
	        results = {}
	        try:
	            html = self.fetch_content(url)
	            results['clean_text'] = self.clean_html(html)

	            # Skip the second parse entirely when no analysis was requested.
	            if not any(mode in modes for mode in self.ANALYSIS_MODES):
	                return results

	            # The models are given the page text, not the HTML source: a
	            # slice of raw markup is mostly doctype/head boilerplate.
	            text = self._extract_text(html)
	            if not text:
	                raise ValueError("No readable text found on the page")
	            snippet = text[:self.MAX_CLASSIFIER_CHARS]

	            if 'summarize' in modes:
	                # truncation=True clips over-long pages to the model's
	                # input limit instead of failing on them.
	                summary = self.models['summarize'](
	                    text,
	                    max_length=self.SUMMARY_MAX_LENGTH,
	                    truncation=True,
	                )
	                results['summary'] = summary[0]['summary_text']

	            if 'sentiment' in modes:
	                # A character slice does not bound the token count, so the
	                # tokenizer is asked to truncate as well.
	                sentiment = self.models['sentiment'](snippet, truncation=True)[0]
	                results['sentiment'] = f"{sentiment['label']} ({sentiment['score']:.2f})"

	            if 'topics' in modes:
	                topics = self.models['topics'](
	                    snippet,
	                    candidate_labels=list(self.TOPIC_LABELS),
	                )
	                results['topics'] = dict(zip(topics['labels'], topics['scores']))

	        except Exception as e:
	            # Deliberate best-effort boundary: report the failure to the
	            # caller through the result dict rather than raising.
	            results['error'] = str(e)

	        return results