Spaces:
Sleeping
Sleeping
| # smart_web_analyzer.py | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from transformers import pipeline | |
| import torch | |
| class WebAnalyzer: | |
| def __init__(self): | |
| self.device = 0 if torch.cuda.is_available() else -1 | |
| self.models = { | |
| 'summarize': pipeline("summarization", model="facebook/bart-large-cnn"), | |
| 'sentiment': pipeline("text-classification", | |
| model="nlptown/bert-base-multilingual-uncased-sentiment"), | |
| 'topics': pipeline("zero-shot-classification", | |
| model="facebook/bart-large-mnli") | |
| } | |
| def fetch_content(self, url: str) -> str: | |
| """Fetch webpage content with custom headers""" | |
| headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} | |
| response = requests.get(url, headers=headers, timeout=15) | |
| response.raise_for_status() | |
| return response.text | |
| def clean_html(self, html: str) -> str: | |
| """Basic HTML cleaning preserving all tags""" | |
| soup = BeautifulSoup(html, 'html.parser') | |
| return soup.prettify() | |
| def analyze(self, url: str, modes: list) -> dict: | |
| """Core analysis pipeline""" | |
| results = {} | |
| try: | |
| html = self.fetch_content(url) | |
| results['clean_text'] = self.clean_html(html) | |
| if 'summarize' in modes: | |
| results['summary'] = self.models['summarize'](html, max_length=150)[0]['summary_text'] | |
| if 'sentiment' in modes: | |
| sentiment = self.models['sentiment'](html[:512])[0] | |
| results['sentiment'] = f"{sentiment['label']} ({sentiment['score']:.2f})" | |
| if 'topics' in modes: | |
| topics = self.models['topics'](html[:512], | |
| candidate_labels=["Technology", "AI", "Business", | |
| "Science", "Politics"]) | |
| results['topics'] = {topic: score for topic, score | |
| in zip(topics['labels'], topics['scores'])} | |
| except Exception as e: | |
| results['error'] = str(e) | |
| return results |