# Web Scraping
#%%
import urllib3
from bs4 import BeautifulSoup

def clean_text(text):
    """Lowercase, drop any 'site - ' style prefix, and strip punctuation."""
    text = text.lower()
    if '-' in text:
        text = text.split('-', 1)[1].strip()
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text

def get_title(url):
    """Fetch a page and return the text of its <title> tag."""
    http = urllib3.PoolManager()
    resp = http.request('GET', url)
    soup = BeautifulSoup(resp.data, 'html.parser')
    title = soup.find("title")
    return title.text if title else "No title found"
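# %%
# Quick sanity check of the helpers above. The inputs are illustrative only
# (not from the original notebook); any reachable page would do for get_title.
print(clean_text("Breaking - NASA Confirms Water on the Moon!"))
# -> "nasa confirms water on the moon"
print(get_title("https://example.com"))
# -> "Example Domain" (if the request succeeds)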
# %%
import re

def get_article_text(url):
    # Validate URL format
    if not isinstance(url, str) or not re.match(r'^https?://', url):
        return "Error: Invalid URL. Please enter a valid website link."
    http = urllib3.PoolManager()
    try:
        # Attempt to make a request
        resp = http.request('GET', url)
        # Check if the response is successful
        if resp.status != 200:
            return f"Error: Failed to retrieve article. HTTP Status Code: {resp.status}"
        soup = BeautifulSoup(resp.data, 'html.parser')
        article_text = []
        # Prefer dedicated article containers when the page provides them
        for tag in ["article", "story"]:
            article_body = soup.find_all(tag)
            if article_body:
                article_text.extend([clean_text(p.get_text(strip=True)) for p in article_body])
        # Plan B: extract all <p> tags if no article/story tag is found
        if not article_text:
            article_body = soup.find_all('p')
            if article_body:
                article_text.extend([clean_text(p.get_text(strip=True)) for p in article_body])
        return "\n".join(article_text) if article_text else "No article text found."
    except urllib3.exceptions.MaxRetryError:
        return "Error: Unable to reach the website. Please check the URL and try again."
    except urllib3.exceptions.HTTPError as e:
        return f"Error: HTTP error occurred - {e}"
    except Exception as e:
        return f"Error: An unexpected error occurred - {e}"
# %%
# Load the trained fact-checker model and its tokenizer
from tensorflow.keras.models import load_model
import pickle

model = load_model('fact_checker_trained.keras')
with open('fact_checker_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# %%
# Use the model on scraped text
from tensorflow.keras.preprocessing.sequence import pad_sequences

def evaluate_text(text, model, tokenizer, max_len=1000):
    sequence = tokenizer.texts_to_sequences([text])            # Convert text to a sequence of token IDs
    padded_sequence = pad_sequences(sequence, maxlen=max_len)  # Pad (or truncate) to max_len
    result = float(model.predict(padded_sequence)[0][0])       # Single sample -> scalar score
    return result
# %%
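# End-to-end sketch: scrape an article and score it with the fact checker.
# The URL and the 0.5 decision threshold are assumptions for illustration;
# what a high score means depends on how the model was trained and labeled.
url = "https://example.com/some-article"
article = get_article_text(url)
if article.startswith("Error") or article == "No article text found.":
    print(article)
else:
    score = evaluate_text(article, model, tokenizer)
    print(f"Fact-checker score: {score:.3f}")
    print("Likely factual" if score >= 0.5 else "Likely not factual")  # assumed threshold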