# Web Scraping
#%%
import re

import urllib3
from bs4 import BeautifulSoup


def clean_text(text):
    """Normalize a piece of scraped text.

    The text is lower-cased. If it contains a hyphen, only the part after
    the FIRST hyphen is kept (and stripped of surrounding whitespace).
    Finally every character that is neither alphanumeric nor whitespace is
    removed, so punctuation (including any remaining hyphens) is dropped.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    if '-' in text:
        # Keep only what follows the first hyphen.
        text = text.split('-', 1)[1].strip()
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text


def get_title(url):
    """Fetch a page and return the text of its ``<title>`` element.

    Unlike ``get_article_text``, this function neither validates the URL
    nor checks the HTTP status, and it does not catch urllib3 errors: any
    exception raised by the request propagates to the caller.

    Args:
        url: Address of the page to fetch.

    Returns:
        The title text, or ``"No title found"`` when the parsed document
        has no ``<title>`` element.
    """
    http = urllib3.PoolManager()
    resp = http.request('GET', url)
    soup = BeautifulSoup(resp.data, 'html.parser')
    title = soup.find("title")
    return title.text if title else "No title found"


def get_article_text(url):
    """Download a page and return its cleaned article text.

    Text is taken from every ``<article>`` and ``<story>`` element. When
    none of those yields anything, all ``<p>`` elements are used instead.
    Each element's text is passed through ``clean_text`` and the results
    are joined with newlines.

    This function does not raise for request or parsing failures; every
    failure is reported through the returned string.

    Args:
        url: Address of the page; must be a string starting with
            ``http://`` or ``https://``.

    Returns:
        The extracted text, ``"No article text found."`` when nothing could
        be extracted, or a message starting with ``"Error:"`` on failure.
    """
    # Validate URL format
    if not isinstance(url, str) or not re.match(r'^https?://', url):
        return "Error: Invalid URL. Please enter a valid website link."

    http = urllib3.PoolManager()

    try:
        # Attempt to make a request
        resp = http.request('GET', url)

        # Check if response is successful
        if resp.status != 200:
            return f"Error: Failed to retrieve article. HTTP Status Code: {resp.status}"

        soup = BeautifulSoup(resp.data, 'html.parser')

        article_text = []
        for tag in ["article", "story"]:
            article_text.extend(
                clean_text(element.get_text(strip=True))
                for element in soup.find_all(tag)
            )

        # Plan B: Extract all <p> tags if no article/story tag is found
        if not article_text:
            article_text.extend(
                clean_text(paragraph.get_text(strip=True))
                for paragraph in soup.find_all('p')
            )

        return "\n".join(article_text) if article_text else "No article text found."

    # MaxRetryError is a subclass of HTTPError, so it must be handled first.
    except urllib3.exceptions.MaxRetryError:
        return "Error: Unable to reach the website. Please check the URL and try again."
    except urllib3.exceptions.HTTPError as e:
        return f"Error: HTTP error occurred - {e}"
    except Exception as e:
        # Deliberate catch-all: callers receive a message instead of an exception.
        return f"Error: An unexpected error occurred - {e}"


# %%
# load fact checker and tokenization
from tensorflow.keras.models import load_model
import pickle

model = load_model('fact_checker_trained.keras')

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load tokenizer files that come from a trusted source.
with open('fact_checker_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)


# %%
# Use model on scraped text
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def evaluate_text(text, model, tokenizer, max_len=1000):
    """Run the model on a single text and return its prediction as a float.

    Args:
        text: The text to evaluate.
        model: Object exposing ``predict``; called with the padded sequence.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_len: Length the token sequence is padded to.
            NOTE(review): presumably this must match the sequence length
            used when the model was trained - confirm.

    Returns:
        The model's single output value as a Python float.

    Raises:
        ValueError: If the prediction does not contain exactly one element.
    """
    sequence = tokenizer.texts_to_sequences([text])  # Convert text to sequence
    padded_sequence = pad_sequences(sequence, maxlen=max_len)  # Pad to max_len
    prediction = model.predict(padded_sequence)
    # Calling float() directly on an array with ndim > 0 is deprecated since
    # NumPy 1.25; .item() extracts the single element explicitly.
    result = float(np.asarray(prediction).item())
    return result

# %%