fake_news_detection_api / functions.py
holcombzv's picture
Updated scraping behavior.
609d9fc
import tensorflow as tf
import logging
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
def _cleaned_paragraphs(paragraphs):
    """Return the cleaned, non-empty text of every ``<p>`` tag in *paragraphs*."""
    cleaned = []
    for paragraph in paragraphs:
        raw = paragraph.get_text(strip=True)
        if not raw:
            continue
        text = clean_text(raw)
        # clean_text can reduce a paragraph to "" (e.g. punctuation only) or
        # return None after logging a failure; neither belongs in the result.
        if text:
            cleaned.append(text)
    return cleaned


def get_article_text(html_text):
    """Extract the cleaned paragraph texts of an article from HTML markup.

    The lookup proceeds from the most specific container to the least:

    1. ``<p>`` tags inside the first ``<article>`` element.
    2. ``<p>`` tags inside the first matching well-known article container
       (site-specific fallbacks).
    3. Every ``<p>`` tag in the document, skipping those whose ``class``
       marks them as captions, credits, advertisements or footers.

    A step is only accepted when it yields at least one non-empty cleaned
    paragraph; otherwise the next step is tried.

    Args:
        html_text: The HTML markup to parse with ``html.parser``.

    Returns:
        A list of paragraph strings as returned by ``clean_text``. The list
        is empty when nothing usable is found or when parsing fails (the
        failure is logged, not raised).
    """
    try:
        soup = BeautifulSoup(html_text, 'html.parser')

        # Step 1: Try <article> tag
        article_tag = soup.find('article')
        if article_tag:
            article_text = _cleaned_paragraphs(article_tag.find_all('p'))
            if article_text:
                return article_text

        # Step 2: Try common container patterns (site-specific fallbacks)
        candidates = (
            ("div", {"class": "article-body"}),
            ("section", {"name": "articleBody"}),
            ("div", {"property": "articleBody"}),
            ("div", {"class": "Article__content"}),
        )
        for tag_name, attrs in candidates:
            container = soup.find(tag_name, attrs)
            if container:
                article_text = _cleaned_paragraphs(container.find_all('p'))
                if article_text:
                    return article_text

        # Step 3: Fallback -> all <p> tags, but filter out junk
        bad_classes = ('caption', 'credit', 'advertisement', 'footer')
        body_paragraphs = [
            p for p in soup.find_all('p')
            if not any(cls in (p.get('class') or []) for cls in bad_classes)
        ]
        return _cleaned_paragraphs(body_paragraphs)  # Always a list (may be empty)
    except Exception as e:
        # Deliberate best-effort boundary: callers always get a list back.
        logger.exception('Error: Could not retrieve article text: %s', e)
        return []
def split_paragraphs(text: str):
    """Split *text* into lines and return each line passed through ``clean_text``.

    Args:
        text: The text to split; lines are delimited as by ``str.splitlines``.

    Returns:
        A list with one cleaned entry per input line, in order. Blank lines
        are kept (as whatever ``clean_text`` returns for them) so the result
        has the same length as ``text.splitlines()``.
    """
    # Collect the cleaned values explicitly: assigning to the loop variable
    # inside a ``for`` loop does not modify the list being iterated, which
    # previously caused the uncleaned lines to be returned.
    return [clean_text(paragraph) for paragraph in text.splitlines()]
def clean_text(text):
    """Normalize a piece of text into lowercase alphanumerics and whitespace.

    Processing order:

    1. Lowercase the text.
    2. If it contains a hyphen, keep only what follows the FIRST hyphen and
       strip surrounding whitespace from that remainder (presumably to drop
       a leading dateline/source prefix -- confirm against the training
       preprocessing).
    3. Remove every character that is neither alphanumeric nor whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, or ``None`` when cleaning failed (the failure is
        logged, not raised).
    """
    try:
        # ``text`` is rebound in place so the log messages below report the
        # value as it was at the moment of failure.
        text = text.lower()
        if '-' in text:
            text = text.partition('-')[2].strip()
        text = ''.join(filter(lambda ch: ch.isalnum() or ch.isspace(), text))
    except AttributeError:
        logger.exception(f'Input for clean_text has to be a string. Incorrect value: {text} Type: {type(text)}')
    except Exception as e:
        logger.exception(f"Error cleaning text: {e} | Input: {text}")
    else:
        return text
    return None
# Use model on scraped text
def evaluate_text(text, model, tokenizer, max_len=1000):
    """Run *model* on a single text and return its prediction as a float.

    Args:
        text: The text to score.
        model: Object whose ``predict`` is called with the padded sequence
            batch; assumed to yield exactly one value for one input text --
            TODO confirm against the model's output layer.
        tokenizer: Object whose ``texts_to_sequences`` converts a list of
            texts into integer sequences.
        max_len: Length every sequence is padded/truncated to.

    Returns:
        The single predicted value as a Python ``float``.

    Raises:
        ValueError: If the prediction does not contain exactly one element.
    """
    import numpy as np  # local import: the module does not import numpy at file level

    sequence = tokenizer.texts_to_sequences([text])  # Convert text to sequence
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=max_len)  # Pad to max_len
    prediction = model.predict(padded_sequence)
    # Calling float() directly on an array with ndim > 0 is deprecated since
    # NumPy 1.25; extract the lone element explicitly instead.
    return float(np.asarray(prediction).item())