Spaces:
Sleeping
Sleeping
import requests
from bs4 import BeautifulSoup
import time
def read_file(file_path):
    """
    Read text content from a file.

    Tries UTF-8 first and falls back to Latin-1 when the bytes are not
    valid UTF-8.

    Args:
        file_path (str): Path to the text file.

    Returns:
        str: File content, stripped of leading/trailing whitespace.

    Raises:
        Exception: If the file cannot be read or is empty.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        # UTF-8 failed; Latin-1 maps every byte value, so this fallback
        # read cannot raise a decode error itself.
        try:
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read().strip()
        except Exception as e:
            raise Exception(f"Failed to read file with alternative encoding: {str(e)}")
    except Exception as e:
        raise Exception(f"File reading failed: {str(e)}")
    # Emptiness is checked outside the try blocks so the message is raised
    # cleanly instead of being re-wrapped by the handlers above
    # (previously: "File reading failed: File is empty").
    if not content:
        raise Exception("File is empty")
    return content
def extract_from_url(url):
    """
    Extract readable text content from a URL.

    Fetches the page (with up to 3 attempts), then collects paragraph
    text, preferring <article>/<main> containers over the whole document.

    Args:
        url (str): URL to extract text from.

    Returns:
        str: Extracted text content.

    Raises:
        Exception: If the page cannot be fetched, parsing fails, or no
            paragraph text is found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Retry transient network failures; re-raise on the final attempt.
    max_retries = 3
    try:
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise
                time.sleep(1)  # brief pause before retrying
    except requests.RequestException as e:
        raise Exception(f"Failed to fetch URL: {str(e)}")

    try:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Prefer semantic containers that usually hold the main content.
        # Collect each container's text separately and join with spaces so
        # consecutive articles are not fused together (the old += version
        # concatenated them with no separator).
        parts = []
        for container in soup.find_all(['article', 'main']):
            paragraphs = container.find_all("p")
            text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())
            if text:
                parts.append(text)
        article_text = " ".join(parts)
        # Fall back to every paragraph on the page.
        if not article_text:
            paragraphs = soup.find_all("p")
            article_text = " ".join(p.text.strip() for p in paragraphs if p.text.strip())
    except Exception as e:
        raise Exception(f"URL extraction failed: {str(e)}")

    # Raised outside the try above so the message is not re-wrapped as
    # "URL extraction failed: No text content found on the page".
    if not article_text:
        raise Exception("No text content found on the page")
    return article_text