fact_checker_streamlit / functions.py
holcombzv's picture
Upload 7 files
27c9a60 verified
raw
history blame
2.84 kB
# Web Scraping
#%%
import urllib3
from bs4 import BeautifulSoup
def clean_text(text):
    """Normalise *text* into lowercase letters, digits and whitespace only.

    If the lowercased text contains a hyphen, everything up to and including
    the first hyphen is discarded and the remainder is stripped of surrounding
    whitespace. Afterwards every character that is neither alphanumeric nor
    whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # partition() splits on the first hyphen only; `sep` is empty when the
    # text has no hyphen, in which case the text is kept whole.
    _, sep, tail = lowered.partition('-')
    if sep:
        lowered = tail.strip()
    kept = [ch for ch in lowered if ch.isalnum() or ch.isspace()]
    return ''.join(kept)
def get_title(url, timeout=10.0):
    """Fetch *url* and return the text of its ``<title>`` element.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds handed to urllib3 for the request, so an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        The title text, or ``"No title found"`` when the parsed page has no
        ``<title>`` element.

    Note:
        Errors are not caught here; any exception raised while requesting or
        parsing the page propagates to the caller.
    """
    http = urllib3.PoolManager()
    try:
        resp = http.request('GET', url, timeout=timeout)
        soup = BeautifulSoup(resp.data, 'html.parser')
        title = soup.find("title")
        return title.text if title else "No title found"
    finally:
        # Release the pooled connections instead of leaving them to the
        # garbage collector; a fresh PoolManager is created on every call.
        http.clear()
import urllib3
from bs4 import BeautifulSoup
import re
def get_article_text(url, timeout=10.0):
    """Download *url* and return its cleaned article text or an error message.

    The page is searched for ``<article>`` and ``<story>`` elements first; if
    none are found, every ``<p>`` element is used instead. The text of each
    matched element is passed through ``clean_text`` and the pieces are joined
    with newlines.

    Args:
        url: Page address; must be a string starting with ``http://`` or
            ``https://``.
        timeout: Timeout in seconds handed to urllib3 for the request, so an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        The extracted text, ``"No article text found."`` when nothing matched,
        or a string beginning with ``"Error:"`` describing what went wrong.
        Failures are reported through the return value rather than raised.
    """
    # Validate URL format
    if not isinstance(url, str) or not re.match(r'^https?://', url):
        return "Error: Invalid URL. Please enter a valid website link."

    http = urllib3.PoolManager()
    try:
        # Attempt to make a request, bounded by `timeout`
        resp = http.request('GET', url, timeout=timeout)

        # Check if response is successful
        if resp.status != 200:
            return f"Error: Failed to retrieve article. HTTP Status Code: {resp.status}"

        soup = BeautifulSoup(resp.data, 'html.parser')

        article_text = []
        for tag in ["article", "story"]:
            # find_all() yields nothing when the tag is absent, so no
            # separate emptiness check is needed before extending.
            article_text.extend(
                clean_text(node.get_text(strip=True)) for node in soup.find_all(tag)
            )

        # Plan B: Extract all <p> tags if no article/story tag is found
        if not article_text:
            article_text.extend(
                clean_text(p.get_text(strip=True)) for p in soup.find_all('p')
            )

        return "\n".join(article_text) if article_text else "No article text found."
    except urllib3.exceptions.MaxRetryError:
        return "Error: Unable to reach the website. Please check the URL and try again."
    except urllib3.exceptions.HTTPError as e:
        return f"Error: HTTP error occurred - {e}"
    except Exception as e:
        # Deliberate catch-all: callers receive a message instead of a crash.
        return f"Error: An unexpected error occurred - {e}"
    finally:
        # Release the pooled connections; a fresh PoolManager is created on
        # every call, so nothing is gained by keeping them open.
        http.clear()
# %%
# Load the trained fact-checker model and its tokenizer
from tensorflow.keras.models import load_model
import pickle
# Restore the Keras model at import time; the path is relative, so it is
# resolved against the current working directory.
model = load_model('fact_checker_trained.keras')
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so this tokenizer pickle must only ever come from a trusted source.
with open('fact_checker_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# %%
# Use model on scraped text
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
def evaluate_text(text, model, tokenizer, max_len=1000):
    """Run *model* on a single piece of text and return its score as a float.

    Args:
        text: The text to evaluate.
        model: Object exposing ``predict``; called with the padded sequence.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_len: Length the token sequence is padded (or truncated) to.

    Returns:
        The model's prediction converted to a Python ``float``.

    Raises:
        ValueError: If the prediction does not contain exactly one element.
    """
    sequence = tokenizer.texts_to_sequences([text])  # Convert text to sequence
    padded_sequence = pad_sequences(sequence, maxlen=max_len)  # Pad to max_len
    prediction = model.predict(padded_sequence)
    # Calling float() directly on an array with ndim > 0 is deprecated since
    # NumPy 1.25; .item() extracts the single element explicitly.
    return float(np.asarray(prediction).item())
# %%