File size: 4,830 Bytes
e820a8a
 
 
 
 
 
 
 
 
fb7084d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d12e375
 
 
fb7084d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d12e375
 
fb7084d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# document_processor.py
import os
import glob
from tqdm import tqdm
import pandas as pd
from utils import clean_text, setup_logger

logger = setup_logger('document_processor')

def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split
        chunk_size: Number of characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of non-empty, stripped chunk strings (empty list for empty text).
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence boundary for better context,
        # but never for the final chunk (it already ends the text).
        if end < text_length:
            # Look for sentence endings
            last_period = chunk.rfind('.')
            last_question = chunk.rfind('؟')  # Arabic question mark
            last_exclamation = chunk.rfind('!')
            last_newline = chunk.rfind('\n')

            # Find the best (right-most) break point; rfind returns -1
            # when a character is absent, which also fails the test below.
            break_point = max(last_period, last_question, last_exclamation, last_newline)

            # Only break if we're past halfway through the chunk
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)

        # Move start with overlap. BUG FIX: the original `start = end - overlap`
        # loops forever when overlap >= chunk_size (the window never advances
        # and eventually goes negative). Guarantee forward progress of at
        # least one character; valid parameters behave exactly as before.
        next_start = end - overlap
        start = next_start if next_start > start else max(end, start + 1)

    return chunks

def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt document in a folder and split each into chunks.

    Args:
        folder_path: Path to folder containing .txt files
        chunk_size: Size of each chunk in characters (default: 400)
        overlap: Overlap between chunks in characters (default: 75)

    Returns:
        pandas DataFrame with one row per chunk; empty DataFrame when
        no .txt files exist in the folder.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))

    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    rows = []
    file_count = 0

    for file_path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                content = clean_text(handle.read())

                if not content:
                    logger.warning(f"Empty content in {file_path}")
                    continue

                # Split into chunks and emit one row per chunk.
                chunks = split_into_chunks(content, chunk_size, overlap)
                rows.extend(
                    {
                        'path': file_path,
                        'chunk_id': idx,
                        'total_chunks': len(chunks),
                        'content': piece,
                        'content_length': len(piece),
                    }
                    for idx, piece in enumerate(chunks)
                )

                file_count += 1
                logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")

        except Exception as e:
            # Unreadable/undecodable files are logged and skipped, not fatal.
            logger.error(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(rows)

    if not df.empty:
        logger.info(f"Total: {file_count} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")

    return df

def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load one .txt document and split it into overlapping chunks.

    Args:
        file_path: Path to the .txt file
        chunk_size: Size of each chunk in characters
        overlap: Overlap between chunks in characters

    Returns:
        pandas DataFrame with one row per chunk; empty DataFrame when the
        file is unreadable or its cleaned content is empty.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = clean_text(handle.read())

            if not content:
                logger.warning(f"Empty content in {file_path}")
                return pd.DataFrame()

            # Split into chunks, then build one record per chunk.
            chunks = split_into_chunks(content, chunk_size, overlap)
            records = [
                {
                    'path': file_path,
                    'chunk_id': idx,
                    'total_chunks': len(chunks),
                    'content': piece,
                    'content_length': len(piece),
                }
                for idx, piece in enumerate(chunks)
            ]

            logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")
            return pd.DataFrame(records)

    except Exception as e:
        # Mirror load_documents: log the failure and return an empty frame.
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()