MrSimple07 committed on
Commit
38ed4e9
·
1 Parent(s): d013631

new documents prep

Browse files
Files changed (2)
  1. table_prep.py +16 -23
  2. utils.py +3 -4
table_prep.py CHANGED
@@ -32,15 +32,14 @@ def create_table_content(table_data):
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
- def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
36
-
37
  lines = doc.text.strip().split('\n')
38
-
39
  # Separate header and data rows
40
  header_lines = []
41
  data_rows = []
42
  in_data = False
43
-
44
  for line in lines:
45
  if line.startswith('Данные таблицы:'):
46
  in_data = True
@@ -49,40 +48,34 @@ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
49
  data_rows.append(line)
50
  elif not in_data:
51
  header_lines.append(line)
52
-
53
  header = '\n'.join(header_lines) + '\n'
54
-
55
- # No rows to chunk
56
  if not data_rows:
57
  return [doc]
58
-
59
- # Chunk the data rows
60
  chunks = []
61
  current_rows = []
62
  current_size = len(header)
63
-
64
  for row in data_rows:
65
- row_size = len(row) + 1
66
- if (len(current_rows) >= max_rows_per_chunk or
67
- current_size + row_size > max_chunk_size) and current_rows:
68
-
69
- # Save current chunk
70
  chunk_text = header + '\n'.join(current_rows)
71
  chunks.append(chunk_text)
72
  log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
73
-
74
- current_rows = [current_rows[-1]]
75
- log_message(f"Перенос строки для перекрытия: {current_rows[-1]}")
76
- current_size = len(header) + len(current_rows[0]) + 1
77
-
78
  current_rows.append(row)
79
  current_size += row_size
80
-
81
  # Add final chunk
82
  if current_rows:
83
  chunk_text = header + '\n'.join(current_rows)
84
  chunks.append(chunk_text)
85
-
86
  # Create Document objects
87
  chunked_docs = []
88
  for i, chunk_text in enumerate(chunks):
@@ -99,7 +92,7 @@ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
99
  }
100
  )
101
  chunked_docs.append(chunk_doc)
102
-
103
  return chunked_docs
104
 
105
 
 
32
  from llama_index.core.text_splitter import SentenceSplitter
33
  from config import CHUNK_SIZE, CHUNK_OVERLAP
34
 
35
+ def chunk_table_document(doc, max_chunk_size=2000):
 
36
  lines = doc.text.strip().split('\n')
37
+
38
  # Separate header and data rows
39
  header_lines = []
40
  data_rows = []
41
  in_data = False
42
+
43
  for line in lines:
44
  if line.startswith('Данные таблицы:'):
45
  in_data = True
 
48
  data_rows.append(line)
49
  elif not in_data:
50
  header_lines.append(line)
51
+
52
  header = '\n'.join(header_lines) + '\n'
53
+
 
54
  if not data_rows:
55
  return [doc]
56
+
 
57
  chunks = []
58
  current_rows = []
59
  current_size = len(header)
60
+
61
  for row in data_rows:
62
+ row_size = len(row) + 1
63
+ # If adding this row would exceed max_chunk_size, save current chunk
64
+ if current_size + row_size > max_chunk_size and current_rows:
 
 
65
  chunk_text = header + '\n'.join(current_rows)
66
  chunks.append(chunk_text)
67
  log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
68
+ current_rows = []
69
+ current_size = len(header)
70
+
 
 
71
  current_rows.append(row)
72
  current_size += row_size
73
+
74
  # Add final chunk
75
  if current_rows:
76
  chunk_text = header + '\n'.join(current_rows)
77
  chunks.append(chunk_text)
78
+
79
  # Create Document objects
80
  chunked_docs = []
81
  for i, chunk_text in enumerate(chunks):
 
92
  }
93
  )
94
  chunked_docs.append(chunk_doc)
95
+
96
  return chunked_docs
97
 
98
 
utils.py CHANGED
@@ -139,19 +139,18 @@ def deduplicate_nodes(nodes):
139
  unique_nodes = []
140
 
141
  for node in nodes:
142
- # Create unique identifier from metadata
143
  doc_id = node.metadata.get('document_id', '')
144
  section_id = node.metadata.get('section_id', '')
145
- chunk_id = node.metadata.get('chunk_id', 0)
146
  node_type = node.metadata.get('type', 'text')
147
 
148
- if node_type == 'table':
149
  table_num = node.metadata.get('table_number', '')
150
- identifier = f"{doc_id}|table|{table_num}|{chunk_id}"
151
  elif node_type == 'image':
152
  img_num = node.metadata.get('image_number', '')
153
  identifier = f"{doc_id}|image|{img_num}"
154
  else:
 
155
  identifier = f"{doc_id}|{section_id}|{chunk_id}"
156
 
157
  if identifier not in seen:
 
139
  unique_nodes = []
140
 
141
  for node in nodes:
 
142
  doc_id = node.metadata.get('document_id', '')
143
  section_id = node.metadata.get('section_id', '')
 
144
  node_type = node.metadata.get('type', 'text')
145
 
146
+ if node_type == 'table' or node_type == 'table_row':
147
  table_num = node.metadata.get('table_number', '')
148
+ identifier = f"{doc_id}|table|{table_num}"
149
  elif node_type == 'image':
150
  img_num = node.metadata.get('image_number', '')
151
  identifier = f"{doc_id}|image|{img_num}"
152
  else:
153
+ chunk_id = node.metadata.get('chunk_id', 0)
154
  identifier = f"{doc_id}|{section_id}|{chunk_id}"
155
 
156
  if identifier not in seen: