MrSimple07 commited on
Commit
9da507d
·
1 Parent(s): a42e1ff

Reverted to the previous state, with utils simplified

Browse files
Files changed (2) hide show
  1. index_retriever.py +2 -2
  2. table_prep.py +59 -66
index_retriever.py CHANGED
@@ -46,12 +46,12 @@ def create_query_engine(vector_index):
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
- similarity_top_k=40
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
- similarity_top_k=40,
55
  similarity_cutoff=0.65
56
  )
57
 
 
46
 
47
  bm25_retriever = BM25Retriever.from_defaults(
48
  docstore=vector_index.docstore,
49
+ similarity_top_k=50
50
  )
51
 
52
  vector_retriever = VectorIndexRetriever(
53
  index=vector_index,
54
+ similarity_top_k=50,
55
  similarity_cutoff=0.65
56
  )
57
 
table_prep.py CHANGED
@@ -35,128 +35,121 @@ from config import CHUNK_SIZE, CHUNK_OVERLAP
35
  def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
36
  """Simple table chunking: max 5 rows or 2000 chars per chunk"""
37
 
38
- table_num = doc.metadata.get('table_number', 'unknown')
39
-
40
- # Parse table
41
  lines = doc.text.strip().split('\n')
42
 
43
- table_header_lines = []
 
44
  data_rows = []
45
  in_data = False
46
 
47
  for line in lines:
48
  if line.startswith('Данные таблицы:'):
49
  in_data = True
50
- table_header_lines.append(line)
51
  elif in_data and line.startswith('Строка'):
52
  data_rows.append(line)
53
  elif not in_data:
54
- table_header_lines.append(line)
55
 
56
- table_header = '\n'.join(table_header_lines) + '\n'
57
 
 
58
  if not data_rows:
59
- # No rows, return as is
60
  return [doc]
61
 
62
- log_message(f"Таблица {table_num}: {len(data_rows)} строк")
63
-
64
- # Simple chunking
65
  chunks = []
66
- current_chunk_rows = []
67
- current_size = len(table_header)
68
 
69
  for row in data_rows:
70
- row_size = len(row) + 1
71
 
72
- # Check if adding this row exceeds limits
73
- if (len(current_chunk_rows) >= max_rows_per_chunk or
74
- current_size + row_size > max_chunk_size) and current_chunk_rows:
75
 
76
  # Save current chunk
77
- chunk_text = table_header + '\n'.join(current_chunk_rows)
78
  chunks.append(chunk_text)
79
- log_message(f" Чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
80
 
81
- # Start new chunk with overlap of 1 row
82
- if len(current_chunk_rows) > 0:
83
- current_chunk_rows = [current_chunk_rows[-1]]
84
- current_size = len(table_header) + len(current_chunk_rows[0]) + 1
85
- else:
86
- current_chunk_rows = []
87
- current_size = len(table_header)
88
 
89
- current_chunk_rows.append(row)
90
  current_size += row_size
91
 
92
- # Final chunk
93
- if current_chunk_rows:
94
- chunk_text = table_header + '\n'.join(current_chunk_rows)
95
  chunks.append(chunk_text)
96
- log_message(f" Последний чанк: {len(current_chunk_rows)} строк")
97
 
98
- log_message(f"Таблица {table_num} разделена на {len(chunks)} чанков")
99
-
100
- # Create documents
101
  chunked_docs = []
102
  for i, chunk_text in enumerate(chunks):
103
- chunk_metadata = doc.metadata.copy()
104
- chunk_metadata.update({
105
- "chunk_id": i,
106
- "total_chunks": len(chunks),
107
- "chunk_size": len(chunk_text),
108
- "is_chunked": True
109
- })
110
-
111
- chunked_doc = Document(text=chunk_text, metadata=chunk_metadata)
112
- chunked_docs.append(chunked_doc)
 
 
 
113
 
114
  return chunked_docs
115
 
116
 
117
  def table_to_document(table_data, document_id=None):
 
 
118
  if not isinstance(table_data, dict):
119
- log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
120
  return []
121
 
122
  doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
123
  table_num = table_data.get('table_number', 'Неизвестно')
124
  table_title = table_data.get('table_title', 'Неизвестно')
125
  section = table_data.get('section', 'Неизвестно')
126
-
127
  table_rows = table_data.get('data', [])
128
- if not table_rows or len(table_rows) == 0:
129
- log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
130
  return []
131
 
132
- content = create_table_content(table_data)
133
- content_size = len(content)
134
- row_count = len(table_rows)
 
 
 
 
 
 
 
 
 
 
 
 
135
 
 
136
  base_doc = Document(
137
  text=content,
138
  metadata={
139
  "type": "table",
140
  "table_number": table_num,
141
- "table_title": table_title,
142
  "document_id": doc_id,
143
- "section": section,
144
- "section_id": section,
145
- "total_rows": row_count,
146
- "content_size": content_size
147
  }
148
  )
 
 
149
 
150
- if content_size > CHUNK_SIZE:
151
- chunked_docs = chunk_table_document(base_doc)
152
- log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
153
- for i, chunk_doc in enumerate(chunked_docs):
154
- log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
155
- return chunked_docs
156
- else:
157
- log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
158
- f"Размер: {content_size} символов | Строк: {row_count}")
159
- return [base_doc]
160
 
161
 
162
  def load_table_data(repo_id, hf_token, table_data_dir):
 
35
def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
    """Simple table chunking: max 5 rows or 2000 chars per chunk"""

    # Split the rendered table text into its header part and the
    # "Строка ..." data rows that follow the "Данные таблицы:" marker.
    head_parts = []
    rows = []
    seen_data_marker = False
    for text_line in doc.text.strip().split('\n'):
        if text_line.startswith('Данные таблицы:'):
            seen_data_marker = True
            head_parts.append(text_line)
        elif seen_data_marker and text_line.startswith('Строка'):
            rows.append(text_line)
        elif not seen_data_marker:
            head_parts.append(text_line)

    head = '\n'.join(head_parts) + '\n'

    # Nothing to split — keep the document whole.
    if not rows:
        return [doc]

    # Greedily pack rows into chunks, repeating the last row of each
    # finished chunk at the start of the next one as a 1-row overlap.
    texts = []
    pending = []
    size = len(head)

    for row in rows:
        extra = len(row) + 1  # +1 for the joining newline
        limit_hit = (len(pending) >= max_rows_per_chunk
                     or size + extra > max_chunk_size)

        if pending and limit_hit:
            # Flush the current chunk.
            texts.append(head + '\n'.join(pending))
            # Carry the last row over for context.
            overlap = pending[-1]
            pending = [overlap]
            size = len(head) + len(overlap) + 1

        pending.append(row)
        size += extra

    # Flush whatever is left.
    if pending:
        texts.append(head + '\n'.join(pending))

    # Wrap every chunk in its own Document, carrying over key metadata.
    return [
        Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(texts),
                "is_chunked": True
            }
        )
        for i, chunk_text in enumerate(texts)
    ]
106
 
107
 
108
def table_to_document(table_data, document_id=None):
    """Convert table data to Document(s), chunking when the text is large.

    Builds a Russian-language text rendering of the table (number, title,
    source document, section, optional headers, then one "Строка N: ..."
    line per dict row) and wraps it in a Document.  Content longer than
    2000 characters is split via chunk_table_document().

    Args:
        table_data: dict with keys like 'table_number', 'table_title',
            'section', 'headers' and 'data' (list of row dicts).
        document_id: optional override for the source document id; falls
            back to table_data['document_id'] / table_data['document'].

    Returns:
        List of Document objects (possibly chunked), or [] when
        table_data is not a dict or contains no rows.
    """
    if not isinstance(table_data, dict):
        return []

    doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    table_rows = table_data.get('data', [])

    if not table_rows:
        return []

    # Build the textual rendering of the table.
    content = f"Таблица: {table_num}\n"
    content += f"Название: {table_title}\n"
    content += f"Документ: {doc_id}\n"
    content += f"Раздел: {section}\n"

    headers = table_data.get('headers', [])
    if headers:
        content += f"\nЗаголовки: {' | '.join(headers)}\n"

    content += "\nДанные таблицы:\n"
    for row_idx, row in enumerate(table_rows, start=1):
        if isinstance(row, dict):
            # Fix: filtering on plain truthiness (`if v`) silently dropped
            # legitimate 0 / 0.0 cells; only skip missing/empty values.
            row_text = " | ".join(
                f"{k}: {v}" for k, v in row.items()
                if v is not None and v != ''
            )
            content += f"Строка {row_idx}: {row_text}\n"

    # Create the base document; chunk only oversized content.
    base_doc = Document(
        text=content,
        metadata={
            "type": "table",
            "table_number": table_num,
            "document_id": doc_id,
            "section": section
        }
    )
    if len(content) > 2000:
        return chunk_table_document(base_doc)

    return [base_doc]
 
 
 
 
 
 
 
 
 
153
 
154
 
155
  def load_table_data(repo_id, hf_token, table_data_dir):