Spaces:
Sleeping
Sleeping
File size: 2,825 Bytes
20e57f9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import requests
from bs4 import BeautifulSoup
import time
def read_file(file_path):
    """
    Read text content from a file, falling back to latin-1 when the
    bytes are not valid UTF-8.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: File content with surrounding whitespace stripped.

    Raises:
        Exception: If the file is empty or cannot be read.
    """
    decode_error = None
    # Try UTF-8 first; latin-1 is a total fallback (every byte sequence
    # decodes under it), so a second UnicodeDecodeError cannot occur.
    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read().strip()
            if not content:
                raise Exception("File is empty")
            return content
        except UnicodeDecodeError as e:
            decode_error = e  # retry with the next encoding
        except Exception as e:
            raise Exception(f"File reading failed: {str(e)}")
    # Both encodings failed to decode — surface the last decode error.
    raise Exception(f"Failed to read file with alternative encoding: {str(decode_error)}")
def extract_from_url(url):
    """
    Extract readable text content from a web page.

    Prefers paragraphs inside <article>/<main> containers; falls back
    to every <p> tag on the page when none yield text.

    Args:
        url (str): URL to extract text from.

    Returns:
        str: Extracted paragraph text, space-separated.

    Raises:
        Exception: If the page cannot be fetched after retries or
            contains no paragraph text.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Retry transient network failures before giving up.
        max_retries = 3
        response = None
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise
                time.sleep(1)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Collect paragraph texts into one list and join once.
        # (The previous string `+=` concatenation glued the last word of
        # one <article> directly onto the first word of the next.)
        texts = []
        for container in soup.find_all(['article', 'main']):
            texts.extend(
                p.text.strip() for p in container.find_all("p") if p.text.strip()
            )
        if not texts:
            # No article/main paragraphs — fall back to all paragraphs.
            texts = [p.text.strip() for p in soup.find_all("p") if p.text.strip()]
        if not texts:
            raise Exception("No text content found on the page")
        return " ".join(texts)
    except requests.RequestException as e:
        raise Exception(f"Failed to fetch URL: {str(e)}")
    except Exception as e:
        raise Exception(f"URL extraction failed: {str(e)}")