Ahmed-Alghamdi committed on
Commit
fb7084d
·
verified ·
1 Parent(s): a7f3645

Update document_processor.py

Browse files
Files changed (1) hide show
  1. document_processor.py +127 -3
document_processor.py CHANGED
@@ -7,11 +7,135 @@ from utils import clean_text, setup_logger
7
 
8
  logger = setup_logger('document_processor')
9
 
10
- def load_single_document(file_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  try:
12
  with open(file_path, 'r', encoding='utf-8') as file:
13
  content = clean_text(file.read())
14
- return pd.DataFrame([{'path': file_path, 'content': content}])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  except Exception as e:
16
  logger.error(f"Error reading {file_path}: {e}")
17
- return pd.DataFrame()
 
7
 
8
  logger = setup_logger('document_processor')
9
 
10
def split_into_chunks(text, chunk_size=400, overlap=75):
    """
    Split text into overlapping chunks, preferring sentence boundaries.

    Args:
        text: The text to split.
        chunk_size: Target number of characters per chunk (default: 400).
        overlap: Number of characters to overlap between consecutive
            chunks (default: 75).

    Returns:
        List of non-empty, stripped chunk strings.
    """
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence boundary for better context.
        if end < text_length:
            # Look for sentence endings ('؟' is the Arabic question mark).
            break_point = max(
                chunk.rfind('.'),
                chunk.rfind('؟'),
                chunk.rfind('!'),
                chunk.rfind('\n'),
            )

            # Only break if we're past halfway through the chunk, so chunks
            # never shrink below chunk_size / 2.
            if break_point > chunk_size * 0.5:
                chunk = chunk[:break_point + 1]
                end = start + break_point + 1

        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)

        # Move start with overlap. Bug fix: guarantee forward progress —
        # with a large overlap relative to chunk_size, `end - overlap`
        # could move `start` backwards (even negative, producing wrong
        # negative-index slices) and loop forever.
        start = max(end - overlap, start + 1)

    return chunks
50
+
51
def load_documents(folder_path, chunk_size=400, overlap=75):
    """
    Load every .txt document under a folder and return one row per chunk.

    Args:
        folder_path: Path to folder containing .txt files.
        chunk_size: Size of each chunk in characters (default: 400).
        overlap: Overlap between chunks in characters (default: 75).

    Returns:
        pandas.DataFrame with columns path, chunk_id, total_chunks,
        content, content_length; empty DataFrame when no files are found.
    """
    txt_files = glob.glob(os.path.join(folder_path, '*.txt'))
    if not txt_files:
        logger.warning(f"No .txt files found in {folder_path}")
        return pd.DataFrame()

    rows = []
    file_count = 0

    for file_path in tqdm(txt_files, desc="Loading and chunking documents"):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                content = clean_text(handle.read())

            if not content:
                logger.warning(f"Empty content in {file_path}")
                continue

            pieces = split_into_chunks(content, chunk_size, overlap)
            total = len(pieces)

            # One record per chunk so downstream retrieval stays chunk-level.
            rows.extend(
                {
                    'path': file_path,
                    'chunk_id': idx,
                    'total_chunks': total,
                    'content': piece,
                    'content_length': len(piece),
                }
                for idx, piece in enumerate(pieces)
            )

            file_count += 1
            logger.info(f"Loaded {os.path.basename(file_path)}: {len(pieces)} chunks")

        except Exception as e:
            logger.error(f"Error reading {file_path}: {e}")

    df = pd.DataFrame(rows)
    if not df.empty:
        logger.info(f"Total: {file_count} files → {len(df)} chunks")
        logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
    return df
104
+
105
def load_single_document(file_path, chunk_size=400, overlap=75):
    """
    Load a single .txt document and return a DataFrame with one row per chunk.

    Args:
        file_path: Path to the .txt file.
        chunk_size: Size of each chunk in characters.
        overlap: Overlap between chunks in characters.

    Returns:
        pandas.DataFrame with columns path, chunk_id, total_chunks,
        content, content_length; empty DataFrame on read error or
        empty content.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = clean_text(handle.read())

        if not content:
            logger.warning(f"Empty content in {file_path}")
            return pd.DataFrame()

        pieces = split_into_chunks(content, chunk_size, overlap)
        total = len(pieces)

        # Build the chunk records in one pass.
        records = [
            {
                'path': file_path,
                'chunk_id': idx,
                'total_chunks': total,
                'content': piece,
                'content_length': len(piece),
            }
            for idx, piece in enumerate(pieces)
        ]

        logger.info(f"Loaded {os.path.basename(file_path)}: {len(pieces)} chunks")
        return pd.DataFrame(records)

    except Exception as e:
        logger.error(f"Error reading {file_path}: {e}")
        return pd.DataFrame()