Ahmed-Alghamdi committed on
Commit
c19463a
·
verified ·
1 Parent(s): 5b8e218

Update document_processor.py

Browse files
Files changed (1) hide show
  1. document_processor.py +16 -19
document_processor.py CHANGED
@@ -1,20 +1,18 @@
1
- # document_processor.py
2
  import os
3
- import glob
4
- from tqdm import tqdm
5
  import pandas as pd
6
  from utils import clean_text, setup_logger
7
 
8
  logger = setup_logger('document_processor')
9
 
10
- def split_into_chunks(text, chunk_size=400, overlap=75):
 
11
  """
12
  Split text into overlapping chunks
13
 
14
  Args:
15
  text: The text to split
16
- chunk_size: Number of characters per chunk
17
- overlap: Number of characters to overlap between chunks
18
  """
19
  chunks = []
20
  start = 0
@@ -36,6 +34,7 @@ def split_into_chunks(text, chunk_size=400, overlap=75):
36
  break_point = max(last_period, last_question, last_exclamation, last_newline)
37
 
38
  # Only break if we're past halfway through the chunk
 
39
  if break_point > chunk_size * 0.5:
40
  chunk = chunk[:break_point + 1]
41
  end = start + break_point + 1
@@ -44,25 +43,23 @@ def split_into_chunks(text, chunk_size=400, overlap=75):
44
  if chunk: # Only add non-empty chunks
45
  chunks.append(chunk)
46
 
47
- start = end - overlap # Move start with overlap
48
-
 
 
 
 
 
49
  return chunks
50
 
51
-
52
- if not df.empty:
53
- logger.info(f"Total: {file_count} files → {len(df)} chunks")
54
- logger.info(f"Average chunk size: {df['content_length'].mean():.0f} characters")
55
-
56
- return df
57
-
58
- def load_single_document(file_path, chunk_size=400, overlap=75):
59
  """
60
  Load a single document and split it into chunks
61
 
62
  Args:
63
  file_path: Path to the .txt file
64
- chunk_size: Size of each chunk in characters
65
- overlap: Overlap between chunks in characters
66
  """
67
  try:
68
  with open(file_path, 'r', encoding='utf-8') as file:
@@ -72,7 +69,7 @@ def load_single_document(file_path, chunk_size=400, overlap=75):
72
  logger.warning(f"Empty content in {file_path}")
73
  return pd.DataFrame()
74
 
75
- # Split into chunks
76
  chunks = split_into_chunks(content, chunk_size, overlap)
77
 
78
  # Create dataframe with chunks
 
 
1
  import os
 
 
2
  import pandas as pd
3
  from utils import clean_text, setup_logger
4
 
5
  logger = setup_logger('document_processor')
6
 
7
+ # Default values were adjusted here to suit long texts
8
+ def split_into_chunks(text, chunk_size=1000, overlap=200):
9
  """
10
  Split text into overlapping chunks
11
 
12
  Args:
13
  text: The text to split
14
+ chunk_size: Number of characters per chunk (increased to 1000)
15
+ overlap: Number of characters to overlap (increased to 200)
16
  """
17
  chunks = []
18
  start = 0
 
34
  break_point = max(last_period, last_question, last_exclamation, last_newline)
35
 
36
  # Only break if we're past halfway through the chunk
37
+ # This ensures we don't create very small chunks
38
  if break_point > chunk_size * 0.5:
39
  chunk = chunk[:break_point + 1]
40
  end = start + break_point + 1
 
43
  if chunk: # Only add non-empty chunks
44
  chunks.append(chunk)
45
 
46
+ # Move start pointer, ensuring we overlap
47
+ # If stepping back by the overlap would not advance start, jump to end to avoid an infinite loop
48
+ if start >= end - overlap:
49
+ start = end
50
+ else:
51
+ start = end - overlap
52
+
53
  return chunks
54
 
55
+ def load_single_document(file_path, chunk_size=1000, overlap=200):
 
 
 
 
 
 
 
56
  """
57
  Load a single document and split it into chunks
58
 
59
  Args:
60
  file_path: Path to the .txt file
61
+ chunk_size: Size of each chunk in characters (Default: 1000)
62
+ overlap: Overlap between chunks in characters (Default: 200)
63
  """
64
  try:
65
  with open(file_path, 'r', encoding='utf-8') as file:
 
69
  logger.warning(f"Empty content in {file_path}")
70
  return pd.DataFrame()
71
 
72
+ # Split into chunks using the new sizes
73
  chunks = split_into_chunks(content, chunk_size, overlap)
74
 
75
  # Create dataframe with chunks