# NOTE: captured from a Hugging Face Spaces page; the hosted Space reported
# "Runtime error" at scrape time. The module source follows.
| # smart_web_analyzer.py | |
| """ | |
| Smart Web Analyzer Plus - Core Functionality | |
| Features: | |
| - Web content fetching with custom User-Agent (to avoid 403 errors) | |
| - Basic HTML cleaning (no removal of script/style) | |
| - Summarization using "facebook/bart-large-cnn" | |
| - Sentiment analysis using "nlptown/bert-base-multilingual-uncased-sentiment" | |
| - Topic detection via zero-shot classification ("facebook/bart-large-mnli") | |
| - Preview text for display | |
| """ | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from transformers import pipeline | |
def _load_pipeline(task, model_name, error_prefix):
    """Instantiate a transformers pipeline; on failure, print and return None."""
    try:
        return pipeline(task, model=model_name)
    except Exception as exc:
        # Degrade gracefully: downstream functions check for None and
        # report the model as unavailable instead of crashing at import.
        print(error_prefix, exc)
        return None


# 1) Summarization Pipeline
summarizer = _load_pipeline(
    "summarization", "facebook/bart-large-cnn",
    "Error loading summarization model:",
)
# 2) Sentiment Analysis Pipeline
sentiment_analyzer = _load_pipeline(
    "sentiment-analysis", "nlptown/bert-base-multilingual-uncased-sentiment",
    "Error loading sentiment analysis model:",
)
# 3) Zero-Shot Topic Detection Pipeline
zero_shot_classifier = _load_pipeline(
    "zero-shot-classification", "facebook/bart-large-mnli",
    "Error loading topic detection model:",
)
def fetch_web_content(url):
    """
    Fetch the HTML content of a given URL, using a spoofed User-Agent.

    Parameters:
        url (str): The URL to fetch; must use the http:// or https:// scheme.

    Returns:
        str: HTML content if successful.

    Raises:
        ValueError: if the URL is invalid.
        Exception: if the request fails (network error, 4xx/5xx, etc.).
    """
    # Validate the scheme up front so callers get a clear error for bad input.
    if not url.startswith(("http://", "https://")):
        raise ValueError("Invalid URL. URL must start with http:// or https://")

    # Spoof a common browser User-Agent to reduce 403 errors from sites
    # that block default library user agents.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises HTTPError for 4XX or 5XX
        return response.text
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause stays in tracebacks.
        raise Exception(f"Error fetching the URL: {e}") from e
def clean_text(html_content):
    """
    Extract raw text from HTML content (keeps <script> and <style> text).

    Parameters:
        html_content (str): The raw HTML content.

    Returns:
        str: Cleaned text extracted from the HTML.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    # Deliberately NOT decomposing <script>/<style> tags here — their text
    # is kept in the output by design.
    raw = parsed.get_text(separator=" ")
    # Collapse all runs of whitespace down to single spaces.
    return " ".join(raw.split())
def summarize_text(text, max_length=130, min_length=30):
    """
    Summarize text with the facebook/bart-large-cnn model.

    Parameters:
        text (str): The text to summarize.
        max_length (int): Maximum length for the summary.
        min_length (int): Minimum length for the summary.

    Returns:
        str: The summarized text or an error message.
    """
    # Model may have failed to load at import time.
    if not summarizer:
        return "Summarization model is not available."
    try:
        outputs = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,  # deterministic output
        )
        return outputs[0]["summary_text"]
    except Exception as err:
        return f"Error during summarization: {err}"
def analyze_sentiment(text):
    """
    Analyze sentiment with nlptown/bert-base-multilingual-uncased-sentiment.

    Parameters:
        text (str): Text for sentiment analysis.

    Returns:
        str: A label describing sentiment (e.g., '4 stars') or an error message.
    """
    # Model may have failed to load at import time.
    if not sentiment_analyzer:
        return "Sentiment analysis model is not available."
    try:
        # The pipeline returns a list of {label, score} dicts; only the
        # first entry's label is surfaced to the caller.
        return sentiment_analyzer(text)[0]["label"]
    except Exception as err:
        return f"Error during sentiment analysis: {err}"
def detect_topic(text, candidate_labels=None):
    """
    Detect topics in text using zero-shot classification via facebook/bart-large-mnli.

    Parameters:
        text (str): The text to analyze.
        candidate_labels (list[str] | None): Labels to score the text against.
            Defaults to a general-purpose news-style topic set, preserving the
            original behavior for existing callers.

    Returns:
        dict or str: Dictionary of topics & confidence scores OR an error dict.
    """
    if not zero_shot_classifier:
        return {"error": "Topic detection model is not available."}
    if candidate_labels is None:
        # Default topic set (previously hard-coded); now overridable per call.
        candidate_labels = [
            "Politics", "Technology", "Business", "Entertainment",
            "Science", "Health", "Sports", "Education",
        ]
    try:
        result = zero_shot_classifier(text, candidate_labels)
        # result['labels'] is sorted by descending confidence and is parallel
        # to result['scores']; zip them into a label -> score mapping.
        return dict(zip(result["labels"], result["scores"]))
    except Exception as e:
        return {"error": f"Error during topic detection: {e}"}
def preview_clean_text(text, max_chars=500):
    """
    Return a preview slice of the cleaned text for display.

    Parameters:
        text (str): The text to preview.
        max_chars (int): Maximum number of characters in the preview.

    Returns:
        str: The text unchanged if it fits, otherwise the first max_chars
        characters followed by an ellipsis.
    """
    return text if len(text) <= max_chars else text[:max_chars] + "..."
| # End of smart_web_analyzer.py | |