Spaces:
Sleeping
Sleeping
Commit
·
0b28542
1
Parent(s):
9985d37
simplest version
Browse files- documents_prep.py +131 -23
documents_prep.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 4 |
from llama_index.core import Document
|
|
@@ -41,6 +42,10 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 41 |
if not rows:
|
| 42 |
return []
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Small table: keep whole
|
| 45 |
if len(rows) <= max_rows:
|
| 46 |
content = format_table_content(table_data, headers, rows)
|
|
@@ -60,8 +65,12 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 60 |
chunks = []
|
| 61 |
for i in range(0, len(rows), max_rows):
|
| 62 |
chunk_rows = rows[i:i+max_rows]
|
| 63 |
-
content = format_table_content(
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
chunks.append(Document(
|
| 67 |
text=content,
|
|
@@ -78,18 +87,24 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 78 |
}
|
| 79 |
))
|
| 80 |
|
| 81 |
-
log_message(f" 📊 Table {table_num}: {len(rows)} rows → {len(chunks)} chunks")
|
| 82 |
return chunks
|
| 83 |
|
| 84 |
|
| 85 |
def format_table_content(table_data, headers, rows, chunk_info=""):
|
| 86 |
"""Format table for semantic search"""
|
| 87 |
-
doc_id = table_data.get('document_id', 'unknown')
|
| 88 |
table_num = table_data.get('table_number', 'unknown')
|
| 89 |
table_title = table_data.get('table_title', '')
|
| 90 |
section = table_data.get('section', '')
|
| 91 |
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
content += f"Таблица: {table_num}\n"
|
| 94 |
if table_title:
|
| 95 |
content += f"Название: {table_title}\n"
|
|
@@ -97,29 +112,52 @@ def format_table_content(table_data, headers, rows, chunk_info=""):
|
|
| 97 |
content += f"Раздел: {section}\n"
|
| 98 |
if chunk_info:
|
| 99 |
content += f"{chunk_info}\n"
|
| 100 |
-
content += f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
for row in rows:
|
| 104 |
if isinstance(row, dict):
|
| 105 |
parts = [f"{k}: {v}" for k, v in row.items()
|
| 106 |
if v and str(v).strip() and str(v) != 'nan']
|
| 107 |
-
|
|
|
|
| 108 |
elif isinstance(row, list):
|
| 109 |
parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
|
| 110 |
-
|
|
|
|
| 111 |
|
| 112 |
return content
|
| 113 |
|
| 114 |
|
| 115 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 116 |
-
"""Load text sections from JSON"""
|
|
|
|
|
|
|
|
|
|
| 117 |
log_message("Loading JSON documents...")
|
| 118 |
|
| 119 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 120 |
json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
documents = []
|
|
|
|
|
|
|
| 123 |
for file_path in json_files:
|
| 124 |
try:
|
| 125 |
local_path = hf_hub_download(
|
|
@@ -129,26 +167,91 @@ def load_json_documents(repo_id, hf_token, json_dir):
|
|
| 129 |
token=hf_token
|
| 130 |
)
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
documents.append(Document(
|
| 141 |
-
text=
|
| 142 |
metadata={
|
| 143 |
'type': 'text',
|
| 144 |
'document_id': doc_id,
|
| 145 |
-
'section_id':
|
| 146 |
}
|
| 147 |
))
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
log_message(f"✓ Loaded {len(documents)} text sections")
|
| 152 |
return documents
|
| 153 |
|
| 154 |
|
|
@@ -172,10 +275,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 172 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 173 |
data = json.load(f)
|
| 174 |
|
| 175 |
-
|
|
|
|
| 176 |
|
| 177 |
for sheet in data.get('sheets', []):
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
all_chunks.extend(chunks)
|
| 180 |
|
| 181 |
except Exception as e:
|
|
|
|
| 1 |
import json
|
| 2 |
+
import zipfile
|
| 3 |
import pandas as pd
|
| 4 |
from huggingface_hub import hf_hub_download, list_repo_files
|
| 5 |
from llama_index.core import Document
|
|
|
|
| 42 |
if not rows:
|
| 43 |
return []
|
| 44 |
|
| 45 |
+
# Ensure table_data has document_id for format_table_content
|
| 46 |
+
if 'document_id' not in table_data:
|
| 47 |
+
table_data['document_id'] = doc_id
|
| 48 |
+
|
| 49 |
# Small table: keep whole
|
| 50 |
if len(rows) <= max_rows:
|
| 51 |
content = format_table_content(table_data, headers, rows)
|
|
|
|
| 65 |
chunks = []
|
| 66 |
for i in range(0, len(rows), max_rows):
|
| 67 |
chunk_rows = rows[i:i+max_rows]
|
| 68 |
+
content = format_table_content(
|
| 69 |
+
table_data,
|
| 70 |
+
headers,
|
| 71 |
+
chunk_rows,
|
| 72 |
+
chunk_info=f"Строки {i+1}-{i+len(chunk_rows)} из {len(rows)}"
|
| 73 |
+
)
|
| 74 |
|
| 75 |
chunks.append(Document(
|
| 76 |
text=content,
|
|
|
|
| 87 |
}
|
| 88 |
))
|
| 89 |
|
| 90 |
+
log_message(f" 📊 Table {table_num} ({doc_id}): {len(rows)} rows → {len(chunks)} chunks")
|
| 91 |
return chunks
|
| 92 |
|
| 93 |
|
| 94 |
def format_table_content(table_data, headers, rows, chunk_info=""):
|
| 95 |
"""Format table for semantic search"""
|
| 96 |
+
doc_id = table_data.get('document_id', table_data.get('document', 'unknown'))
|
| 97 |
table_num = table_data.get('table_number', 'unknown')
|
| 98 |
table_title = table_data.get('table_title', '')
|
| 99 |
section = table_data.get('section', '')
|
| 100 |
|
| 101 |
+
# Normalize table number
|
| 102 |
+
if table_num and table_num != 'unknown':
|
| 103 |
+
if not str(table_num).startswith('№'):
|
| 104 |
+
table_num = f"№{table_num}"
|
| 105 |
+
|
| 106 |
+
content = f"=== ТАБЛИЦА ===\n"
|
| 107 |
+
content += f"Документ: {doc_id}\n"
|
| 108 |
content += f"Таблица: {table_num}\n"
|
| 109 |
if table_title:
|
| 110 |
content += f"Название: {table_title}\n"
|
|
|
|
| 112 |
content += f"Раздел: {section}\n"
|
| 113 |
if chunk_info:
|
| 114 |
content += f"{chunk_info}\n"
|
| 115 |
+
content += f"================\n\n"
|
| 116 |
+
|
| 117 |
+
# Add searchable description
|
| 118 |
+
content += f"Это таблица {table_num} из документа {doc_id}. "
|
| 119 |
+
if table_title:
|
| 120 |
+
content += f"{table_title}. "
|
| 121 |
+
if section:
|
| 122 |
+
content += f"Находится в разделе: {section}. "
|
| 123 |
+
content += f"\n\n"
|
| 124 |
|
| 125 |
+
# Headers
|
| 126 |
+
if headers:
|
| 127 |
+
header_str = ' | '.join(str(h) for h in headers)
|
| 128 |
+
content += f"Колонки: {header_str}\n\n"
|
| 129 |
+
|
| 130 |
+
# Rows
|
| 131 |
for row in rows:
|
| 132 |
if isinstance(row, dict):
|
| 133 |
parts = [f"{k}: {v}" for k, v in row.items()
|
| 134 |
if v and str(v).strip() and str(v) != 'nan']
|
| 135 |
+
if parts:
|
| 136 |
+
content += ' | '.join(parts) + "\n"
|
| 137 |
elif isinstance(row, list):
|
| 138 |
parts = [str(v) for v in row if v and str(v).strip() and str(v) != 'nan']
|
| 139 |
+
if parts:
|
| 140 |
+
content += ' | '.join(parts) + "\n"
|
| 141 |
|
| 142 |
return content
|
| 143 |
|
| 144 |
|
| 145 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 146 |
+
"""Load text sections from JSON (including ZIPs)"""
|
| 147 |
+
import zipfile
|
| 148 |
+
import tempfile
|
| 149 |
+
|
| 150 |
log_message("Loading JSON documents...")
|
| 151 |
|
| 152 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 153 |
json_files = [f for f in files if f.startswith(json_dir) and f.endswith('.json')]
|
| 154 |
+
zip_files = [f for f in files if f.startswith(json_dir) and f.endswith('.zip')]
|
| 155 |
+
|
| 156 |
+
log_message(f"Found {len(json_files)} JSON files and {len(zip_files)} ZIP files")
|
| 157 |
|
| 158 |
documents = []
|
| 159 |
+
|
| 160 |
+
# Load direct JSON files
|
| 161 |
for file_path in json_files:
|
| 162 |
try:
|
| 163 |
local_path = hf_hub_download(
|
|
|
|
| 167 |
token=hf_token
|
| 168 |
)
|
| 169 |
|
| 170 |
+
docs = extract_sections_from_json(local_path)
|
| 171 |
+
documents.extend(docs)
|
| 172 |
|
| 173 |
+
except Exception as e:
|
| 174 |
+
log_message(f"Error loading {file_path}: {e}")
|
| 175 |
+
|
| 176 |
+
# Extract and load ZIP files
|
| 177 |
+
for zip_path in zip_files:
|
| 178 |
+
try:
|
| 179 |
+
local_zip = hf_hub_download(
|
| 180 |
+
repo_id=repo_id,
|
| 181 |
+
filename=zip_path,
|
| 182 |
+
repo_type="dataset",
|
| 183 |
+
token=hf_token
|
| 184 |
+
)
|
| 185 |
|
| 186 |
+
with zipfile.ZipFile(local_zip, 'r') as zf:
|
| 187 |
+
for json_file in zf.namelist():
|
| 188 |
+
if json_file.endswith('.json') and not json_file.startswith('__MACOSX'):
|
| 189 |
+
with zf.open(json_file) as f:
|
| 190 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
|
| 191 |
+
tmp.write(f.read())
|
| 192 |
+
tmp_path = tmp.name
|
| 193 |
+
|
| 194 |
+
docs = extract_sections_from_json(tmp_path)
|
| 195 |
+
documents.extend(docs)
|
| 196 |
+
|
| 197 |
+
import os
|
| 198 |
+
os.unlink(tmp_path)
|
| 199 |
+
|
| 200 |
+
except Exception as e:
|
| 201 |
+
log_message(f"Error loading ZIP {zip_path}: {e}")
|
| 202 |
+
|
| 203 |
+
log_message(f"✓ Loaded {len(documents)} text sections")
|
| 204 |
+
return documents
|
| 205 |
+
|
| 206 |
+
def extract_sections_from_json(json_path):
    """Extract text sections from a single JSON file as ``Document`` objects.

    The file is expected to hold an object with an optional
    ``document_metadata.document_id`` and a ``sections`` list. Three nesting
    levels are walked: ``sections`` -> ``subsections`` -> ``sub_subsections``.
    Every node whose text field (``section_text`` / ``subsection_text`` /
    ``sub_subsection_text``) is a non-blank string yields one ``Document``.

    Malformed parts of the file (``null`` or non-string text, ``null`` child
    lists, non-dict list entries) are skipped individually, so one bad node
    does not discard the remaining sections of the file.

    Args:
        json_path: Path to a UTF-8 encoded JSON file.

    Returns:
        A list of ``Document`` objects whose metadata contains ``type``
        (always ``'text'``), ``document_id`` and ``section_id``. If the file
        cannot be read or parsed the error is logged and the documents
        collected so far (possibly none) are returned; nothing is raised.
    """
    documents = []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # ``or {}`` also covers an explicit JSON null, which ``dict.get``
        # returns as None instead of falling back to the default.
        doc_metadata = data.get('document_metadata') or {}
        doc_id = doc_metadata.get('document_id', 'unknown')

        for section in _dict_items(data.get('sections')):
            _append_text_document(
                documents, doc_id, section, 'section_text', 'section_id'
            )

            for subsection in _dict_items(section.get('subsections')):
                _append_text_document(
                    documents, doc_id, subsection,
                    'subsection_text', 'subsection_id'
                )

                for sub_sub in _dict_items(subsection.get('sub_subsections')):
                    _append_text_document(
                        documents, doc_id, sub_sub,
                        'sub_subsection_text', 'sub_subsection_id'
                    )

    except Exception as e:
        # Deliberate best-effort boundary: callers load many files in a loop
        # and rely on this function never raising.
        log_message(f"Error extracting from {json_path}: {e}")

    return documents


def _dict_items(value):
    """Return the dict entries of *value* if it is a list, else an empty list."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]


def _append_text_document(documents, doc_id, node, text_key, id_key):
    """Append a text ``Document`` for *node* when its text is a non-blank string."""
    text = node.get(text_key)
    if not isinstance(text, str) or not text.strip():
        return
    documents.append(Document(
        text=text,
        metadata={
            'type': 'text',
            'document_id': doc_id,
            'section_id': node.get(id_key, '')
        }
    ))
|
| 256 |
|
| 257 |
|
|
|
|
| 275 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 276 |
data = json.load(f)
|
| 277 |
|
| 278 |
+
# Extract file-level document_id
|
| 279 |
+
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 280 |
|
| 281 |
for sheet in data.get('sheets', []):
|
| 282 |
+
# Use sheet-level document_id if available, otherwise use file-level
|
| 283 |
+
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 284 |
+
|
| 285 |
+
# CRITICAL: Pass document_id to chunk function
|
| 286 |
+
chunks = chunk_table_by_rows(sheet, sheet_doc_id)
|
| 287 |
all_chunks.extend(chunks)
|
| 288 |
|
| 289 |
except Exception as e:
|