Spaces:

chelleboyer
/

cert-challenge

Sleeping

App Files Files Community

cert-challenge / utils /data.py

chelleboyer

Initial commit

0389a81 9 months ago

raw

history blame contribute delete

2.84 kB

	import os
	from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader, PDFLoader
	from bs4 import BeautifulSoup

	def extract_text_from_html(html_content):
	    """Extract the readable text from an HTML document.

	    ``<script>`` and ``<style>`` elements are removed, each line of the
	    remaining text is stripped, lines are split wherever two consecutive
	    spaces occur, and empty fragments are dropped. The surviving fragments
	    are joined with newlines.

	    Args:
	        html_content: HTML markup to parse.

	    Returns:
	        The extracted text. If anything raises while parsing, the error is
	        printed and ``html_content`` is returned unchanged (best-effort
	        fallback).
	    """
	    try:
	        soup = BeautifulSoup(html_content, 'html.parser')

	        # Script and style bodies are code, not readable content.
	        for tag in soup(["script", "style"]):
	            tag.extract()

	        # Strip leading and trailing whitespace on every line.
	        stripped_lines = (line.strip() for line in soup.get_text().splitlines())

	        # Split on a DOUBLE space: that separates headlines laid out side by
	        # side while keeping ordinary sentences intact. Splitting on a single
	        # space would put every word on its own line.
	        phrases = (
	            phrase.strip()
	            for line in stripped_lines
	            for phrase in line.split("  ")
	        )

	        # Drop blank fragments.
	        return '\n'.join(phrase for phrase in phrases if phrase)
	    except Exception as e:
	        print(f"Error parsing HTML: {e}")
	        return html_content  # Return original content in case of error

	def _looks_like_html(filename, content):
	    """Return True if the file name or the start of the content indicates HTML."""
	    if filename.lower().endswith(('.html', '.htm')):
	        return True
	    # Only the beginning matters; lower-case it so "<!doctype html>" and
	    # "<HTML>" are recognised as well as the canonical spellings.
	    head = content.lstrip()[:32].lower()
	    return head.startswith(('<!doctype html', '<html'))


	def load_documents(data_folder="data"):
	    """Load the text of every regular file directly inside a folder.

	    Files are read as UTF-8 in sorted name order. Content that looks like
	    HTML (by file extension or by its leading markup) is passed through
	    ``extract_text_from_html`` first. Files that cannot be read or decoded
	    are reported and skipped. Progress and a short content sample are
	    printed for each file.

	    Args:
	        data_folder: Directory to scan (not recursive). Defaults to
	            ``"data"``, relative to the current working directory.

	    Returns:
	        A list with one string per successfully loaded file; an empty list
	        if the folder is missing, is not a directory, or holds no files.
	    """
	    # isdir rather than exists: a plain file with that name must not reach
	    # os.listdir, which would raise NotADirectoryError.
	    if not os.path.isdir(data_folder):
	        return []

	    # os.listdir returns names in arbitrary order; sort for reproducibility.
	    filenames = sorted(
	        name
	        for name in os.listdir(data_folder)
	        if os.path.isfile(os.path.join(data_folder, name))
	    )

	    documents = []
	    for filename in filenames:
	        file_path = os.path.join(data_folder, filename)
	        print(f"Processing file: {file_path}")

	        try:
	            with open(file_path, 'r', encoding='utf-8') as file:
	                content = file.read()
	        except (OSError, UnicodeError) as e:
	            # Unreadable or non-UTF-8 (e.g. binary) files are skipped so one
	            # bad file does not abort the whole load.
	            print(f"Error processing file {filename}: {str(e)}. Skipping.")
	            continue

	        if _looks_like_html(filename, content):
	            print(f"Detected HTML content in {filename}, extracting text...")
	            content = extract_text_from_html(content)

	        # Print sample document content for debugging
	        print(f"File: {filename}, Content length: {len(content)}")
	        print(f"Sample content (first 200 chars): {content[:200]}")

	        documents.append(content)

	    return documents

	def split_documents(documents):
	    """Split the given documents into chunks with a default CharacterTextSplitter.

	    Args:
	        documents: The texts handed to ``CharacterTextSplitter.split_texts``.

	    Returns:
	        Whatever ``split_texts`` returns, or an empty list if it raises
	        (the error is printed).
	    """
	    # Built outside the try block, so a constructor failure still propagates.
	    splitter = CharacterTextSplitter()
	    try:
	        chunks = splitter.split_texts(documents)
	    except Exception as e:
	        print(f"Error split_documents: {str(e)}.")
	        chunks = []
	    return chunks