import logging

import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


def _clean_paragraphs(container):
    """Return the cleaned text of every non-empty <p> found under *container*."""
    # Extract each paragraph's text once, then drop paragraphs that are empty
    # after whitespace stripping (the emptiness check is on the raw text).
    raw_texts = (p.get_text(strip=True) for p in container.find_all('p'))
    return [clean_text(raw) for raw in raw_texts if raw]


def get_article_text(html_text):
    """Extract the cleaned paragraph texts of an article from an HTML document.

    Three strategies are tried in order, and the first one that yields at
    least one paragraph wins:

    1. the ``<p>`` tags inside the first ``<article>`` element;
    2. the ``<p>`` tags inside the first matching well-known container
       (site-specific fallbacks);
    3. every ``<p>`` in the document whose ``class`` attribute contains none
       of a small list of junk classes.

    Args:
        html_text: HTML markup handed to ``BeautifulSoup`` with the
            ``html.parser`` backend.

    Returns:
        A list with one ``clean_text`` result per paragraph. The list is empty
        when nothing was found or when any exception occurred (the exception
        is logged, not propagated).
    """
    try:
        soup = BeautifulSoup(html_text, 'html.parser')

        # Step 1: Try <article> tag
        article_tag = soup.find('article')
        if article_tag:
            article_text = _clean_paragraphs(article_tag)
            if article_text:
                return article_text

        # Step 2: Try common container patterns (site-specific fallbacks)
        candidates = [
            {"name": "div", "attrs": {"class": "article-body"}},
            {"name": "section", "attrs": {"name": "articleBody"}},
            {"name": "div", "attrs": {"property": "articleBody"}},
            {"name": "div", "attrs": {"class": "Article__content"}},
        ]
        for cand in candidates:
            container = soup.find(cand["name"], cand["attrs"])
            if container:
                article_text = _clean_paragraphs(container)
                if article_text:
                    return article_text

        # Step 3: Fallback -> all <p> tags, but filter out junk
        bad_classes = ['caption', 'credit', 'advertisement', 'footer']
        article_text = []
        for p in soup.find_all('p'):
            # A missing class attribute yields None; treat it as "no classes".
            classes = p.get('class') or []
            if any(cls in classes for cls in bad_classes):
                continue
            text = p.get_text(strip=True)
            if text:
                article_text.append(clean_text(text))

        return article_text  # Always return a list (may be empty)

    except Exception as e:
        # Best-effort scraper: log the failure and hand back an empty result
        # instead of propagating parser errors to the caller.
        logger.exception(f'Error: Could not retrieve article text: {e}')
        return []


def split_paragraphs(text: str):
    """Split *text* on line boundaries and clean every resulting line.

    Args:
        text: The text to split (``str.splitlines`` semantics).

    Returns:
        A list with one ``clean_text`` result per line, in the original order.
        Blank lines are kept and come back as empty strings.
    """
    # The cleaned value has to be collected into the result: rebinding the
    # loop variable (as this function used to do) leaves the list untouched.
    return [clean_text(paragraph) for paragraph in text.splitlines()]


def clean_text(text):
    """Normalise *text* by lowercasing, trimming a prefix and dropping symbols.

    Steps, in order:

    1. lowercase the text;
    2. if it contains a hyphen, keep only what follows the first hyphen,
       stripped of surrounding whitespace;
    3. remove every character that is neither alphanumeric nor whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, or ``None`` when cleaning failed (for example
        because *text* is not a string); the failure is logged.
    """
    try:
        text = text.lower()
        if '-' in text:
            # NOTE(review): presumably meant to drop a dateline/source prefix
            # such as "city (agency) - ..."; it also discards everything
            # before the hyphen of an ordinary hyphenated word. Confirm intent.
            text = text.split('-', 1)[1].strip()
        text = ''.join(char for char in text if char.isalnum() or char.isspace())
        return text
    except AttributeError:
        logger.exception(f'Input for clean_text has to be a string. Incorrect value: {text} Type: {type(text)}')
    except Exception as e:
        logger.exception(f"Error cleaning text: {e} | Input: {text}")
    # Both failure paths end up here.
    return None


# Use model on scraped text
def evaluate_text(text, model, tokenizer, max_len=1000):
    """Run *model* on a single text and return its prediction as a float.

    Args:
        text: The text to evaluate.
        model: Object exposing ``predict`` (presumably a Keras model with a
            single scalar output -- confirm against the caller).
        tokenizer: Object exposing ``texts_to_sequences`` (presumably the
            tokenizer the model was trained with -- confirm).
        max_len: Length the token sequence is padded/truncated to.

    Returns:
        The single predicted value as a Python ``float``.

    Raises:
        ValueError: If the prediction does not contain exactly one element.
    """
    sequence = tokenizer.texts_to_sequences([text])  # Convert text to sequence
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        sequence, maxlen=max_len
    )  # Pad to max_len
    prediction = model.predict(padded_sequence)
    # Calling float() directly on an array with ndim > 0 is deprecated in
    # NumPy (>= 1.25); .item() extracts the single element explicitly.
    return float(np.asarray(prediction).item())