# Web Scraping
#%%
import urllib3
from bs4 import BeautifulSoup

def clean_text(text):
    """Lowercase, drop any 'site - ' style prefix, and strip punctuation."""
    text = text.lower()
    if '-' in text:
        text = text.split('-', 1)[1].strip()
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text

def get_title(url):
    """Fetch a page and return the text of its <title> tag."""
    http = urllib3.PoolManager()
    resp = http.request('GET', url)
    soup = BeautifulSoup(resp.data, 'html.parser')
    title = soup.find("title")
    return title.text if title else "No title found"
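# %%
# Quick sanity check of the helpers above. The inputs are illustrative only
# (not from the original notebook); any reachable page would do for get_title.
print(clean_text("Breaking - NASA Confirms Water on the Moon!"))
# -> "nasa confirms water on the moon"
print(get_title("https://example.com"))
# -> "Example Domain" (if the request succeeds)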
# %%
import re

def get_article_text(url):
    # Validate URL format
    if not isinstance(url, str) or not re.match(r'^https?://', url):
        return "Error: Invalid URL. Please enter a valid website link."
    http = urllib3.PoolManager()
    try:
        # Attempt to make a request
        resp = http.request('GET', url)
        # Check if the response is successful
        if resp.status != 200:
            return f"Error: Failed to retrieve article. HTTP Status Code: {resp.status}"
        soup = BeautifulSoup(resp.data, 'html.parser')
        article_text = []
        # Prefer dedicated article containers when the page provides them
        for tag in ["article", "story"]:
            article_body = soup.find_all(tag)
            if article_body:
                article_text.extend([clean_text(p.get_text(strip=True)) for p in article_body])
        # Plan B: extract all <p> tags if no article/story tag is found
        if not article_text:
            article_body = soup.find_all('p')
            if article_body:
                article_text.extend([clean_text(p.get_text(strip=True)) for p in article_body])
        return "\n".join(article_text) if article_text else "No article text found."
    except urllib3.exceptions.MaxRetryError:
        return "Error: Unable to reach the website. Please check the URL and try again."
    except urllib3.exceptions.HTTPError as e:
        return f"Error: HTTP error occurred - {e}"
    except Exception as e:
        return f"Error: An unexpected error occurred - {e}"
# %%
# Load the trained fact-checker model and its tokenizer
from tensorflow.keras.models import load_model
import pickle

model = load_model('fact_checker_trained.keras')
with open('fact_checker_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# %%
# Use the model on scraped text
from tensorflow.keras.preprocessing.sequence import pad_sequences

def evaluate_text(text, model, tokenizer, max_len=1000):
    sequence = tokenizer.texts_to_sequences([text])            # Convert text to a sequence of token IDs
    padded_sequence = pad_sequences(sequence, maxlen=max_len)  # Pad (or truncate) to max_len
    result = float(model.predict(padded_sequence)[0][0])       # Single sample -> scalar score
    return result
# %%
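# End-to-end sketch: scrape an article and score it with the fact checker.
# The URL and the 0.5 decision threshold are assumptions for illustration;
# what a high score means depends on how the model was trained and labeled.
url = "https://example.com/some-article"
article = get_article_text(url)
if article.startswith("Error") or article == "No article text found.":
    print(article)
else:
    score = evaluate_text(article, model, tokenizer)
    print(f"Fact-checker score: {score:.3f}")
    print("Likely factual" if score >= 0.5 else "Likely not factual")  # assumed threshold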