Spaces:
Sleeping
Sleeping
| import tensorflow as tf | |
| import logging | |
| from bs4 import BeautifulSoup | |
| logger = logging.getLogger(__name__) | |
| def get_article_text(html_text): | |
| try: | |
| soup = BeautifulSoup(html_text, 'html.parser') | |
| article_text = [] | |
| # Step 1: Try <article> tag | |
| article_tag = soup.find('article') | |
| if article_tag: | |
| paragraphs = article_tag.find_all('p') | |
| article_text = [clean_text(p.get_text(strip=True)) for p in paragraphs if p.get_text(strip=True)] | |
| if article_text: | |
| return article_text | |
| # Step 2: Try common container patterns (site-specific fallbacks) | |
| candidates = [ | |
| {"name": "div", "attrs": {"class": "article-body"}}, | |
| {"name": "section", "attrs": {"name": "articleBody"}}, | |
| {"name": "div", "attrs": {"property": "articleBody"}}, | |
| {"name": "div", "attrs": {"class": "Article__content"}}, | |
| ] | |
| for cand in candidates: | |
| container = soup.find(cand["name"], cand["attrs"]) | |
| if container: | |
| paragraphs = container.find_all('p') | |
| article_text = [clean_text(p.get_text(strip=True)) for p in paragraphs if p.get_text(strip=True)] | |
| if article_text: | |
| return article_text | |
| # Step 3: Fallback → all <p> tags, but filter out junk | |
| bad_classes = ['caption', 'credit', 'advertisement', 'footer'] | |
| for p in soup.find_all('p'): | |
| if not any(cls in (p.get('class') or []) for cls in bad_classes): | |
| text = p.get_text(strip=True) | |
| if text: | |
| article_text.append(clean_text(text)) | |
| return article_text # Always return a list (may be empty) | |
| except Exception as e: | |
| logger.exception(f'Error: Could not retrieve article text: {e}') | |
| return [] | |
| def split_paragraphs(text: str): | |
| paragraphs = text.splitlines() | |
| for paragraph in paragraphs: | |
| paragraph = clean_text(paragraph) | |
| return paragraphs | |
| def clean_text(text): | |
| try: | |
| text = text.lower() | |
| if '-' in text: | |
| text = text.split('-', 1)[1].strip() | |
| text = ''.join(char for char in text if char.isalnum() or char.isspace()) | |
| return text | |
| except AttributeError: | |
| logger.exception(f'Input for clean_text has to be a string. Incorrect value: {text} Type: {type(text)}') | |
| except Exception as e: | |
| logger.exception(f"Error cleaning text: {e} | Input: {text}") | |
| # Use model on scraped text | |
| def evaluate_text(text, model, tokenizer, max_len=1000): | |
| sequence = tokenizer.texts_to_sequences([text]) # Convert text to sequence | |
| padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_len) # Pad to max_len | |
| result = float(model.predict(padded_sequence)) | |
| return result | |