Spaces:
Sleeping
Sleeping
Commit
·
38ed4e9
1
Parent(s):
d013631
new documents prep
Browse files
- table_prep.py +16 -23
- utils.py +3 -4
table_prep.py
CHANGED
|
@@ -32,15 +32,14 @@ def create_table_content(table_data):
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
-
def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
|
| 36 |
-
|
| 37 |
lines = doc.text.strip().split('\n')
|
| 38 |
-
|
| 39 |
# Separate header and data rows
|
| 40 |
header_lines = []
|
| 41 |
data_rows = []
|
| 42 |
in_data = False
|
| 43 |
-
|
| 44 |
for line in lines:
|
| 45 |
if line.startswith('Данные таблицы:'):
|
| 46 |
in_data = True
|
|
@@ -49,40 +48,34 @@ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
|
|
| 49 |
data_rows.append(line)
|
| 50 |
elif not in_data:
|
| 51 |
header_lines.append(line)
|
| 52 |
-
|
| 53 |
header = '\n'.join(header_lines) + '\n'
|
| 54 |
-
|
| 55 |
-
# No rows to chunk
|
| 56 |
if not data_rows:
|
| 57 |
return [doc]
|
| 58 |
-
|
| 59 |
-
# Chunk the data rows
|
| 60 |
chunks = []
|
| 61 |
current_rows = []
|
| 62 |
current_size = len(header)
|
| 63 |
-
|
| 64 |
for row in data_rows:
|
| 65 |
-
row_size = len(row) + 1
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
# Save current chunk
|
| 70 |
chunk_text = header + '\n'.join(current_rows)
|
| 71 |
chunks.append(chunk_text)
|
| 72 |
log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
current_size = len(header) + len(current_rows[0]) + 1
|
| 77 |
-
|
| 78 |
current_rows.append(row)
|
| 79 |
current_size += row_size
|
| 80 |
-
|
| 81 |
# Add final chunk
|
| 82 |
if current_rows:
|
| 83 |
chunk_text = header + '\n'.join(current_rows)
|
| 84 |
chunks.append(chunk_text)
|
| 85 |
-
|
| 86 |
# Create Document objects
|
| 87 |
chunked_docs = []
|
| 88 |
for i, chunk_text in enumerate(chunks):
|
|
@@ -99,7 +92,7 @@ def chunk_table_document(doc, max_rows_per_chunk=3, max_chunk_size=2000):
|
|
| 99 |
}
|
| 100 |
)
|
| 101 |
chunked_docs.append(chunk_doc)
|
| 102 |
-
|
| 103 |
return chunked_docs
|
| 104 |
|
| 105 |
|
|
|
|
| 32 |
from llama_index.core.text_splitter import SentenceSplitter
|
| 33 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 34 |
|
| 35 |
+
def chunk_table_document(doc, max_chunk_size=2000):
|
|
|
|
| 36 |
lines = doc.text.strip().split('\n')
|
| 37 |
+
|
| 38 |
# Separate header and data rows
|
| 39 |
header_lines = []
|
| 40 |
data_rows = []
|
| 41 |
in_data = False
|
| 42 |
+
|
| 43 |
for line in lines:
|
| 44 |
if line.startswith('Данные таблицы:'):
|
| 45 |
in_data = True
|
|
|
|
| 48 |
data_rows.append(line)
|
| 49 |
elif not in_data:
|
| 50 |
header_lines.append(line)
|
| 51 |
+
|
| 52 |
header = '\n'.join(header_lines) + '\n'
|
| 53 |
+
|
|
|
|
| 54 |
if not data_rows:
|
| 55 |
return [doc]
|
| 56 |
+
|
|
|
|
| 57 |
chunks = []
|
| 58 |
current_rows = []
|
| 59 |
current_size = len(header)
|
| 60 |
+
|
| 61 |
for row in data_rows:
|
| 62 |
+
row_size = len(row) + 1
|
| 63 |
+
# If adding this row would exceed max_chunk_size, save current chunk
|
| 64 |
+
if current_size + row_size > max_chunk_size and current_rows:
|
|
|
|
|
|
|
| 65 |
chunk_text = header + '\n'.join(current_rows)
|
| 66 |
chunks.append(chunk_text)
|
| 67 |
log_message(f"Создана часть таблицы размером {len(chunk_text)} символов с {len(current_rows)} строками")
|
| 68 |
+
current_rows = []
|
| 69 |
+
current_size = len(header)
|
| 70 |
+
|
|
|
|
|
|
|
| 71 |
current_rows.append(row)
|
| 72 |
current_size += row_size
|
| 73 |
+
|
| 74 |
# Add final chunk
|
| 75 |
if current_rows:
|
| 76 |
chunk_text = header + '\n'.join(current_rows)
|
| 77 |
chunks.append(chunk_text)
|
| 78 |
+
|
| 79 |
# Create Document objects
|
| 80 |
chunked_docs = []
|
| 81 |
for i, chunk_text in enumerate(chunks):
|
|
|
|
| 92 |
}
|
| 93 |
)
|
| 94 |
chunked_docs.append(chunk_doc)
|
| 95 |
+
|
| 96 |
return chunked_docs
|
| 97 |
|
| 98 |
|
utils.py
CHANGED
|
@@ -139,19 +139,18 @@ def deduplicate_nodes(nodes):
|
|
| 139 |
unique_nodes = []
|
| 140 |
|
| 141 |
for node in nodes:
|
| 142 |
-
# Create unique identifier from metadata
|
| 143 |
doc_id = node.metadata.get('document_id', '')
|
| 144 |
section_id = node.metadata.get('section_id', '')
|
| 145 |
-
chunk_id = node.metadata.get('chunk_id', 0)
|
| 146 |
node_type = node.metadata.get('type', 'text')
|
| 147 |
|
| 148 |
-
if node_type == 'table':
|
| 149 |
table_num = node.metadata.get('table_number', '')
|
| 150 |
-
identifier = f"{doc_id}|table|{table_num}"
|
| 151 |
elif node_type == 'image':
|
| 152 |
img_num = node.metadata.get('image_number', '')
|
| 153 |
identifier = f"{doc_id}|image|{img_num}"
|
| 154 |
else:
|
|
|
|
| 155 |
identifier = f"{doc_id}|{section_id}|{chunk_id}"
|
| 156 |
|
| 157 |
if identifier not in seen:
|
|
|
|
| 139 |
unique_nodes = []
|
| 140 |
|
| 141 |
for node in nodes:
|
|
|
|
| 142 |
doc_id = node.metadata.get('document_id', '')
|
| 143 |
section_id = node.metadata.get('section_id', '')
|
|
|
|
| 144 |
node_type = node.metadata.get('type', 'text')
|
| 145 |
|
| 146 |
+
if node_type == 'table' or node_type == 'table_row':
|
| 147 |
table_num = node.metadata.get('table_number', '')
|
| 148 |
+
identifier = f"{doc_id}|table|{table_num}"
|
| 149 |
elif node_type == 'image':
|
| 150 |
img_num = node.metadata.get('image_number', '')
|
| 151 |
identifier = f"{doc_id}|image|{img_num}"
|
| 152 |
else:
|
| 153 |
+
chunk_id = node.metadata.get('chunk_id', 0)
|
| 154 |
identifier = f"{doc_id}|{section_id}|{chunk_id}"
|
| 155 |
|
| 156 |
if identifier not in seen:
|