File size: 2,835 Bytes
0389a81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader, PDFLoader
from bs4 import BeautifulSoup

def extract_text_from_html(html_content):
    """Return the readable text of an HTML string, one fragment per line.

    ``<script>`` and ``<style>`` elements are detached before the text is
    pulled out. Every line of the extracted text is stripped, split on
    double spaces, and the non-empty fragments are joined with newlines.

    If any step raises, the error is printed and ``html_content`` is
    returned unchanged.
    """
    try:
        parsed = BeautifulSoup(html_content, 'html.parser')

        # Scripts and stylesheets hold no readable text; detach them first.
        for tag in parsed.find_all(["script", "style"]):
            tag.extract()

        fragments = []
        for raw_line in parsed.get_text().splitlines():
            # A double space marks separate phrases that shared one line.
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:  # blank fragments are dropped
                    fragments.append(piece)

        return '\n'.join(fragments)
    except Exception as err:
        print(f"Error parsing HTML: {err}")
        return html_content  # Fall back to the untouched input on failure

def _looks_like_html(filename, content):
    """Return True when the file name or the leading markup indicates HTML."""
    if filename.lower().endswith(('.html', '.htm')):
        return True
    # Compare case-insensitively so "<!doctype html>" and "<HTML>" are
    # recognised as well; only the first few characters are needed.
    head = content.lstrip()[:32].lower()
    return head.startswith(('<!doctype html', '<html'))


def load_documents(data_folder="data"):
    """Read every regular file in ``data_folder`` and return their text.

    Files are decoded as UTF-8 and visited in sorted name order so the
    result is reproducible across runs. Content that looks like HTML (by
    extension or leading markup) is passed through
    ``extract_text_from_html`` before being collected. Files that cannot
    be read or decoded are reported on stdout and skipped.

    Args:
        data_folder: Directory to scan (sub-directories are not entered).
            Defaults to ``"data"`` relative to the working directory.

    Returns:
        A list with one string per successfully read file. The list is
        empty when the folder is missing, is not a directory, or holds
        no regular files.
    """
    # isdir (rather than exists) also rejects a plain file named like the
    # folder, which would otherwise make os.listdir raise.
    if not os.path.isdir(data_folder):
        return []

    # os.listdir returns entries in arbitrary order; sort for determinism.
    files = sorted(
        name for name in os.listdir(data_folder)
        if os.path.isfile(os.path.join(data_folder, name))
    )

    documents = []
    for filename in files:
        file_path = os.path.join(data_folder, filename)
        print(f"Processing file: {file_path}")

        # Only the read itself is guarded: an unreadable or non-UTF-8 file
        # is skipped so that one bad file does not abort the whole load.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            print(f"Error processing file {filename}: {str(e)}. Skipping.")
            continue

        if _looks_like_html(filename, content):
            print(f"Detected HTML content in {filename}, extracting text...")
            content = extract_text_from_html(content)

        # Print sample document content for debugging
        print(f"File: {filename}, Content length: {len(content)}")
        print(f"Sample content (first 200 chars): {content[:200]}")

        documents.append(content)

    return documents

def split_documents(documents):
    """Run ``documents`` through a default ``CharacterTextSplitter``.

    Returns whatever ``split_texts`` produces. If ``split_texts`` raises,
    the error is printed and an empty list is returned instead.
    """
    # The splitter is built outside the try: only split_texts is guarded.
    splitter = CharacterTextSplitter()
    try:
        return splitter.split_texts(documents)
    except Exception as err:
        print(f"Error split_documents: {str(err)}.")
        return []