import os from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader, PDFLoader from bs4 import BeautifulSoup def extract_text_from_html(html_content): """Extract text content from HTML.""" try: soup = BeautifulSoup(html_content, 'html.parser') # Remove script and style elements for script in soup(["script", "style"]): script.extract() # Get text text = soup.get_text() # Break into lines and remove leading and trailing space on each lines = (line.strip() for line in text.splitlines()) # Break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # Drop blank lines text = '\n'.join(chunk for chunk in chunks if chunk) return text except Exception as e: print(f"Error parsing HTML: {e}") return html_content # Return original content in case of error def load_documents(): """Process all files in the data folder""" data_folder = "data" # Check if folder exists if not os.path.exists(data_folder): return [] # Get list of files in the folder files = [f for f in os.listdir(data_folder) if os.path.isfile(os.path.join(data_folder, f))] if not files: return [] # Process each file documents = [] # Process each file for filename in files: file_path = os.path.join(data_folder, filename) print(f"Processing file: {file_path}") try: # Read the file content directly with open(file_path, 'r', encoding='utf-8') as file: content = file.read() # Check if it's HTML and extract text if so if filename.lower().endswith('.html') or content.strip().startswith('') or content.strip().startswith('