MrSimple07 committed on
Commit
0b28542
·
1 Parent(s): 9985d37

simplest version

Browse files
Files changed (1) hide show
  1. documents_prep.py +131 -23
documents_prep.py CHANGED
@@ -1,4 +1,5 @@
1
  import json
 
2
  import pandas as pd
3
  from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
@@ -41,6 +42,10 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
41
  if not rows:
42
  return []
43
 
 
 
 
 
44
  # Small table: keep whole
45
  if len(rows) <= max_rows:
46
  content = format_table_content(table_data, headers, rows)
@@ -60,8 +65,12 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
60
  chunks = []
61
  for i in range(0, len(rows), max_rows):
62
  chunk_rows = rows[i:i+max_rows]
63
- content = format_table_content(table_data, headers, chunk_rows,
64
- chunk_info=f"Rows {i+1}-{i+len(chunk_rows)}")
 
 
 
 
65
 
66
  chunks.append(Document(
67
  text=content,
@@ -78,18 +87,24 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
78
  }
79
  ))
80
 
81
- log_message(f" 📊 Table {table_num}: {len(rows)} rows → {len(chunks)} chunks")
82
  return chunks
83
 
84
 
85
  def format_table_content(table_data, headers, rows, chunk_info=""):
86
  """Format table for semantic search"""
87
- doc_id = table_data.get('document_id', 'unknown')
88
  table_num = table_data.get('table_number', 'unknown')
89
  table_title = table_data.get('table_title', '')
90
  section = table_data.get('section', '')
91
 
92
- content = f"Документ: {doc_id}\n"
 
 
 
 
 
 
93
  content += f"Таблица: {table_num}\n"
94
  if table_title:
95
  content += f"Название: {table_title}\n"
@@ -97,29 +112,52 @@ def format_table_content(table_data, headers, rows, chunk_info=""):
97
  content += f"Раздел: {section}\n"
98
  if chunk_info:
99
  content += f"{chunk_info}\n"
100
- content += f"\nКолонки: {' | '.join(str(h) for h in headers)}\n\n"
 
 
 
 
 
 
 
 
101
 
102
- # Add rows
 
 
 
 
 
103
  for row in rows:
104
  if isinstance(row, dict):
105
  parts = [f"{k}: {v}" for k, v in row.items()
106
  if v and str(v).strip() and str(v) != 'nan']
107
- content += ' | '.join(parts) + "\n"
 
108
  elif isinstance(row, list):
109
  parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
110
- content += ' | '.join(parts) + "\n"
 
111
 
112
  return content
113
 
114
 
115
  def load_json_documents(repo_id, hf_token, json_dir):
116
- """Load text sections from JSON"""
 
 
 
117
  log_message("Loading JSON documents...")
118
 
119
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
120
  json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
 
 
 
121
 
122
  documents = []
 
 
123
  for file_path in json_files:
124
  try:
125
  local_path = hf_hub_download(
@@ -129,26 +167,91 @@ def load_json_documents(repo_id, hf_token, json_dir):
129
  token=hf_token
130
  )
131
 
132
- with open(local_path, 'r', encoding='utf-8') as f:
133
- data = json.load(f)
134
 
135
- doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- # Extract sections
138
- for section in data.get('sections', []):
139
- if section.get('section_text', '').strip():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  documents.append(Document(
141
- text=section['section_text'],
142
  metadata={
143
  'type': 'text',
144
  'document_id': doc_id,
145
- 'section_id': section.get('section_id', '')
146
  }
147
  ))
148
- except Exception as e:
149
- log_message(f"Error loading {file_path}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
- log_message(f"✓ Loaded {len(documents)} text sections")
152
  return documents
153
 
154
 
@@ -172,10 +275,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
172
  with open(local_path, 'r', encoding='utf-8') as f:
173
  data = json.load(f)
174
 
175
- doc_id = data.get('document_id', 'unknown')
 
176
 
177
  for sheet in data.get('sheets', []):
178
- chunks = chunk_table_by_rows(sheet, doc_id)
 
 
 
 
179
  all_chunks.extend(chunks)
180
 
181
  except Exception as e:
 
1
  import json
2
+ import zipfile
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
 
42
  if not rows:
43
  return []
44
 
45
+ # Ensure table_data has document_id for format_table_content
46
+ if 'document_id' not in table_data:
47
+ table_data['document_id'] = doc_id
48
+
49
  # Small table: keep whole
50
  if len(rows) <= max_rows:
51
  content = format_table_content(table_data, headers, rows)
 
65
  chunks = []
66
  for i in range(0, len(rows), max_rows):
67
  chunk_rows = rows[i:i+max_rows]
68
+ content = format_table_content(
69
+ table_data,
70
+ headers,
71
+ chunk_rows,
72
+ chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
73
+ )
74
 
75
  chunks.append(Document(
76
  text=content,
 
87
  }
88
  ))
89
 
90
+ log_message(f" 📊 Table {table_num} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
91
  return chunks
92
 
93
 
94
  def format_table_content(table_data, headers, rows, chunk_info=""):
95
  """Format table for semantic search"""
96
+ doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
97
  table_num = table_data.get('table_number', 'unknown')
98
  table_title = table_data.get('table_title', '')
99
  section = table_data.get('section', '')
100
 
101
+ # Normalize table number
102
+ if table_num and table_num != 'unknown':
103
+ if not str(table_num).startswith('№'):
104
+ table_num = f"№{table_num}"
105
+
106
+ content = f"=== ТАБЛИЦА ===\n"
107
+ content += f"Документ: {doc_id}\n"
108
  content += f"Таблица: {table_num}\n"
109
  if table_title:
110
  content += f"Название: {table_title}\n"
 
112
  content += f"Раздел: {section}\n"
113
  if chunk_info:
114
  content += f"{chunk_info}\n"
115
+ content += f"================\n\n"
116
+
117
+ # Add searchable description
118
+ content += f"Это таблица {table_num} из документа {doc_id}. "
119
+ if table_title:
120
+ content += f"{table_title}. "
121
+ if section:
122
+ content += f"Находится в разделе: {section}. "
123
+ content += f"\n\n"
124
 
125
+ # Headers
126
+ if headers:
127
+ header_str = ' | '.join(str(h) for h in headers)
128
+ content += f"Колонки: {header_str}\n\n"
129
+
130
+ # Rows
131
  for row in rows:
132
  if isinstance(row, dict):
133
  parts = [f"{k}: {v}" for k, v in row.items()
134
  if v and str(v).strip() and str(v) != 'nan']
135
+ if parts:
136
+ content += ' | '.join(parts) + "\n"
137
  elif isinstance(row, list):
138
  parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
139
+ if parts:
140
+ content += ' | '.join(parts) + "\n"
141
 
142
  return content
143
 
144
 
145
  def load_json_documents(repo_id, hf_token, json_dir):
146
+ """Load text sections from JSON (including ZIPs)"""
147
+ import zipfile
148
+ import tempfile
149
+
150
  log_message("Loading JSON documents...")
151
 
152
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
153
  json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
154
+ zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
155
+
156
+ log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
157
 
158
  documents = []
159
+
160
+ # Load direct JSON files
161
  for file_path in json_files:
162
  try:
163
  local_path = hf_hub_download(
 
167
  token=hf_token
168
  )
169
 
170
+ docs = extract_sections_from_json(local_path)
171
+ documents.extend(docs)
172
 
173
+ except Exception as e:
174
+ log_message(f"Error loading {file_path}: {e}")
175
+
176
+ # Extract and load ZIP files
177
+ for zip_path in zip_files:
178
+ try:
179
+ local_zip = hf_hub_download(
180
+ repo_id=repo_id,
181
+ filename=zip_path,
182
+ repo_type="dataset",
183
+ token=hf_token
184
+ )
185
 
186
+ with zipfile.ZipFile(local_zip, 'r') as zf:
187
+ for json_file in zf.namelist():
188
+ if json_file.endswith('.json') and not json_file.startswith('__MACOSX'):
189
+ with zf.open(json_file) as f:
190
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
191
+ tmp.write(f.read())
192
+ tmp_path = tmp.name
193
+
194
+ docs = extract_sections_from_json(tmp_path)
195
+ documents.extend(docs)
196
+
197
+ import os
198
+ os.unlink(tmp_path)
199
+
200
+ except Exception as e:
201
+ log_message(f"Error loading ZIP {zip_path}: {e}")
202
+
203
+ log_message(f"✓ Loaded {len(documents)} text sections")
204
+ return documents
205
+
206
def extract_sections_from_json(json_path):
    """Extract text sections from a single parsed-document JSON file.

    Walks three nesting levels — sections, their subsections, and their
    sub-subsections — and builds one Document per non-empty text body.
    The document_id is read from data['document_metadata']['document_id'],
    falling back to 'unknown'.

    Args:
        json_path: Path to a JSON file on local disk (UTF-8 encoded).

    Returns:
        list[Document]: one Document per non-blank section text, with
        metadata keys 'type' ('text'), 'document_id', and 'section_id'.
        Returns whatever was collected so far (possibly []) if parsing
        fails — errors are logged, never raised (best-effort loader).
    """
    # Each level is described by (children key, text key, id key); this
    # replaces three duplicated append blocks with one shared routine.
    _LEVELS = (
        ('subsections', 'subsection_text', 'subsection_id'),
        ('sub_subsections', 'sub_subsection_text', 'sub_subsection_id'),
    )

    documents = []
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        doc_id = data.get('document_metadata', {}).get('document_id', 'unknown')

        def _append(text, section_id):
            # Shared Document construction for every nesting level.
            documents.append(Document(
                text=text,
                metadata={
                    'type': 'text',
                    'document_id': doc_id,
                    'section_id': section_id,
                }
            ))

        for section in data.get('sections', []):
            if section.get('section_text', '').strip():
                _append(section['section_text'], section.get('section_id', ''))

            # NOTE(review): children are traversed even when the parent's
            # own text is blank — a container node may still hold content.
            child_key, text_key, id_key = _LEVELS[0]
            for subsection in section.get(child_key, []):
                if subsection.get(text_key, '').strip():
                    _append(subsection[text_key], subsection.get(id_key, ''))

                gchild_key, gtext_key, gid_key = _LEVELS[1]
                for sub_sub in subsection.get(gchild_key, []):
                    if sub_sub.get(gtext_key, '').strip():
                        _append(sub_sub[gtext_key], sub_sub.get(gid_key, ''))

    except Exception as e:
        # Best-effort: a single malformed file must not abort the whole
        # ingestion run; log and return what we have.
        log_message(f"Error extracting from {json_path}: {e}")

    return documents
256
 
257
 
 
275
  with open(local_path, 'r', encoding='utf-8') as f:
276
  data = json.load(f)
277
 
278
+ # Extract file-level document_id
279
+ file_doc_id = data.get('document_id', data.get('document', 'unknown'))
280
 
281
  for sheet in data.get('sheets', []):
282
+ # Use sheet-level document_id if available, otherwise use file-level
283
+ sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
284
+
285
+ # CRITICAL: Pass document_id to chunk function
286
+ chunks = chunk_table_by_rows(sheet, sheet_doc_id)
287
  all_chunks.extend(chunks)
288
 
289
  except Exception as e: