import time

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request made by extract_from_url.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
)
_MAX_RETRIES = 3  # total number of GET attempts per URL
_REQUEST_TIMEOUT_SECONDS = 10
_RETRY_DELAY_SECONDS = 1  # pause between failed attempts


def _read_stripped(file_path, encoding):
    """Return the stripped text of *file_path*; raise if nothing is left."""
    with open(file_path, 'r', encoding=encoding) as f:
        content = f.read().strip()
    if not content:
        raise Exception("File is empty")
    return content


def read_file(file_path):
    """
    Read text content from a file.

    The file is decoded as UTF-8 first (a leading byte-order mark, if any,
    is dropped); if that fails with a decode error, it is re-read as
    latin-1. Leading and trailing whitespace is stripped.

    Args:
        file_path (str): Path to the text file

    Returns:
        str: File content

    Raises:
        Exception: If the file cannot be opened or read, or if it contains
            no text after stripping. The original error is attached as
            ``__cause__``.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 as well, but also consumes a BOM so
        # it neither leaks into the content nor defeats the emptiness check.
        return _read_stripped(file_path, 'utf-8-sig')
    except UnicodeDecodeError:
        # Try with a different encoding if utf-8 fails
        try:
            return _read_stripped(file_path, 'latin-1')
        except Exception as e:
            raise Exception(
                f"Failed to read file with alternative encoding: {str(e)}"
            ) from e
    except Exception as e:
        raise Exception(f"File reading failed: {str(e)}") from e


def _fetch_with_retries(url):
    """GET *url*, retrying on any requests error; re-raise the last failure."""
    headers = {'User-Agent': _USER_AGENT}
    for attempt in range(1, _MAX_RETRIES + 1):
        try:
            response = requests.get(
                url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
            )
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == _MAX_RETRIES:
                raise
            time.sleep(_RETRY_DELAY_SECONDS)


def _join_paragraphs(paragraphs):
    """Join the non-empty, stripped text of each paragraph with spaces."""
    texts = (p.text.strip() for p in paragraphs)
    return " ".join(text for text in texts if text)


def extract_from_url(url):
    """
    Extract text content from a URL.

    Paragraphs inside <article>/<main> elements are preferred; if they
    yield no text, every <p> element on the page is used instead.

    Args:
        url (str): URL to extract text from

    Returns:
        str: Extracted text content, paragraphs separated by single spaces

    Raises:
        Exception: If the page cannot be fetched after all retries, or if
            no paragraph text is found. The original error is attached as
            ``__cause__``.
    """
    try:
        response = _fetch_with_retries(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Try to get text from articles first. find_all also returns nested
        # containers (e.g. an <article> inside <main>), so remember which
        # paragraph nodes were already collected to avoid emitting them twice.
        seen_ids = set()
        article_paragraphs = []
        for container in soup.find_all(['article', 'main']):
            for paragraph in container.find_all("p"):
                if id(paragraph) not in seen_ids:
                    seen_ids.add(id(paragraph))
                    article_paragraphs.append(paragraph)
        article_text = _join_paragraphs(article_paragraphs)

        # If no article text found, fall back to all paragraphs
        if not article_text:
            article_text = _join_paragraphs(soup.find_all("p"))

        if not article_text:
            raise Exception("No text content found on the page")

        return article_text

    except requests.RequestException as e:
        raise Exception(f"Failed to fetch URL: {str(e)}") from e
    except Exception as e:
        raise Exception(f"URL extraction failed: {str(e)}") from e