File size: 3,212 Bytes
e820a8a
 
 
 
 
 
c19463a
 
fb7084d
 
 
 
 
c19463a
 
fb7084d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c19463a
fb7084d
 
 
 
 
 
 
 
c19463a
 
 
 
 
 
 
fb7084d
 
c19463a
fb7084d
 
 
 
 
c19463a
 
fb7084d
d12e375
 
 
fb7084d
 
 
 
 
c19463a
fb7084d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d12e375
 
fb7084d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import pandas as pd
from utils import clean_text, setup_logger

# Module-level logger, configured by the project's utils helper.
logger = setup_logger('document_processor')

# Defaults were tuned here to suit long documents (larger chunks, more overlap).
def split_into_chunks(text, chunk_size=1000, overlap=200):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk (increased to 1000).
        overlap: Number of characters shared between consecutive chunks
            (increased to 200).

    Returns:
        List of non-empty, stripped chunk strings.

    Raises:
        ValueError: If chunk_size is not positive (would otherwise loop forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence boundary for better context.
        if end < text_length:
            # Candidate break points: sentence enders (Latin '.', Arabic '؟',
            # '!') and newlines; rfind returns -1 when absent.
            break_point = max(
                chunk.rfind('.'),
                chunk.rfind('؟'),  # Arabic question mark
                chunk.rfind('!'),
                chunk.rfind('\n'),
            )

            # Only break if we're past halfway through the chunk —
            # this ensures we don't create very small chunks.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)

        # Bug fix: if this chunk already covered the rest of the text, stop.
        # The original overlap step could re-enter the loop and append a
        # trailing chunk that was fully contained in the previous one.
        if end >= text_length:
            break

        # Advance with overlap; if overlapping would prevent forward progress
        # (end - start <= overlap), jump to end to avoid an infinite loop.
        if start >= end - overlap:
            start = end
        else:
            start = end - overlap

    return chunks

def load_single_document(file_path, chunk_size=1000, overlap=200):
    """
    Read a single UTF-8 text file, clean it, and return its chunks as rows.

    Args:
        file_path: Path to the .txt file.
        chunk_size: Size of each chunk in characters (Default: 1000).
        overlap: Overlap between chunks in characters (Default: 200).

    Returns:
        A pandas DataFrame with one row per chunk (columns: path, chunk_id,
        total_chunks, content, content_length). An empty DataFrame is
        returned when the cleaned file has no content or reading fails.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as fh:
            content = clean_text(fh.read())

            if not content:
                logger.warning(f"Empty content in {file_path}")
                return pd.DataFrame()

            # Chunk the cleaned text using the configured sizes.
            chunks = split_into_chunks(content, chunk_size, overlap)

            # One row per chunk; total_chunks is repeated so each row is
            # self-describing when rows are later concatenated across files.
            rows = [
                {
                    'path': file_path,
                    'chunk_id': idx,
                    'total_chunks': len(chunks),
                    'content': piece,
                    'content_length': len(piece),
                }
                for idx, piece in enumerate(chunks)
            ]

            logger.info(f"Loaded {os.path.basename(file_path)}: {len(chunks)} chunks")
            return pd.DataFrame(rows)

    except Exception as e:
        # Best-effort: log and return an empty frame so batch loads continue.
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()