MrSimple07 committed on
Commit
a85d6bf
·
1 Parent(s): 65025a2

new version of rag

Browse files
Files changed (3) hide show
  1. app.py +304 -496
  2. documents_prep.py +263 -493
  3. table_prep.py +57 -199
app.py CHANGED
@@ -1,546 +1,354 @@
1
- import os
2
- import json
3
- import zipfile
4
- import logging
5
- from typing import List, Dict, Any
6
- import pandas as pd
7
- from huggingface_hub import hf_hub_download, list_repo_files
8
- from llama_index.core import Document, VectorStoreIndex, KeywordTableIndex, Settings
9
- from llama_index.core.retrievers import VectorIndexRetriever, QueryFusionRetriever
10
- from llama_index.retrievers.bm25 import BM25Retriever
11
- from llama_index.core.query_engine import RetrieverQueryEngine
12
- from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
13
- from llama_index.core.text_splitter import SentenceSplitter
14
- from sentence_transformers import SentenceTransformer
15
- from llama_index.llms.google_genai import GoogleGenAI
16
- from llama_index.llms.openai import OpenAI
17
  import gradio as gr
 
 
 
 
 
 
18
  import sys
19
-
20
- logging.basicConfig(
21
- level=logging.INFO,
22
- format='%(asctime)s - %(levelname)s - %(message)s',
23
- handlers=[
24
- logging.FileHandler('rag_system.log'),
25
- logging.StreamHandler(sys.stdout)
26
- ]
27
  )
28
- logger = logging.getLogger(__name__)
29
-
30
- GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
31
- OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
32
- HF_REPO_ID = "MrSimple01/AIEXP_RAG_FILES"
33
- HF_TOKEN = os.getenv('HF_TOKEN')
34
-
35
- AVAILABLE_MODELS = {
36
- "Gemini 2.5 Flash": {
37
- "provider": "google",
38
- "model_name": "gemini-2.5-flash",
39
- "api_key": GOOGLE_API_KEY
40
- },
41
- "Gemini 2.5 Pro": {
42
- "provider": "google",
43
- "model_name": "gemini-2.5-pro",
44
- "api_key": GOOGLE_API_KEY
45
- },
46
- "GPT-4o": {
47
- "provider": "openai",
48
- "model_name": "gpt-4o",
49
- "api_key": OPENAI_API_KEY
50
- },
51
- "GPT-4o Mini": {
52
- "provider": "openai",
53
- "model_name": "gpt-4o-mini",
54
- "api_key": OPENAI_API_KEY
55
- },
56
- "GPT-5": {
57
- "provider": "openai",
58
- "model_name": "gpt-5",
59
- "api_key": OPENAI_API_KEY
60
- }
61
- }
62
 
63
- DEFAULT_MODEL = "Gemini 2.5 Flash"
64
- DOWNLOAD_DIR = "rag_files"
65
- JSON_FILES_DIR = "JSON"
66
- TABLE_DATA_DIR = "Табличные данные_JSON"
67
- IMAGE_DATA_DIR = "Изображения"
68
- CHUNK_SIZE = 512
69
- CHUNK_OVERLAP = 50
70
- TABLE_MAX_ROWS_PER_CHUNK = 30
71
-
72
- os.makedirs(DOWNLOAD_DIR, exist_ok=True)
73
-
74
- stats = {
75
- 'total_documents': 0,
76
- 'total_text_chunks': 0,
77
- 'total_tables': 0,
78
- 'total_table_chunks': 0,
79
- 'total_images': 0,
80
- 'failed_files': 0,
81
- 'encoding_errors': []
82
- }
83
-
84
- def get_llm_model(model_name):
85
- try:
86
- logger.info(f"Initializing LLM model: {model_name}")
87
- model_config = AVAILABLE_MODELS.get(model_name)
88
- if not model_config:
89
- logger.warning(f"Model {model_name} not found, using default: {DEFAULT_MODEL}")
90
- model_config = AVAILABLE_MODELS[DEFAULT_MODEL]
91
 
92
- if not model_config.get("api_key"):
93
- raise Exception(f"API key not found for model {model_name}")
 
94
 
95
- if model_config["provider"] == "google":
96
- llm = GoogleGenAI(
97
- model=model_config["model_name"],
98
- api_key=model_config["api_key"]
99
- )
100
- logger.info(f"Successfully initialized Google model: {model_config['model_name']}")
101
- return llm
102
- elif model_config["provider"] == "openai":
103
- llm = OpenAI(
104
- model=model_config["model_name"],
105
- api_key=model_config["api_key"]
106
- )
107
- logger.info(f"Successfully initialized OpenAI model: {model_config['model_name']}")
108
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  else:
110
- raise Exception(f"Unsupported provider: {model_config['provider']}")
111
 
112
- except Exception as e:
113
- logger.error(f"Error initializing model {model_name}: {e}")
114
- logger.info("Falling back to default Gemini model")
115
- return GoogleGenAI(
116
- model="gemini-2.0-flash",
117
- api_key=GOOGLE_API_KEY
118
- )
119
-
120
- def get_embedding_model():
121
- logger.info("Initializing embedding model: all-MiniLM-L6-v2")
122
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
123
- embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
124
- logger.info("Embedding model initialized successfully")
125
- return embed_model
126
-
127
- def list_zip_files_in_repo(repo_id: str) -> List[str]:
128
- logger.info(f"Listing files in repository: {repo_id}")
129
- files = list_repo_files(repo_id, repo_type="dataset", token=HF_TOKEN)
130
- zip_files = [f for f in files if f.startswith(JSON_FILES_DIR) and f.endswith('.zip')]
131
- logger.info(f"Found {len(zip_files)} zip files in {JSON_FILES_DIR} directory")
132
- return zip_files
133
 
134
- def download_file_from_hf(repo_id: str, path_in_repo: str, dest_dir: str) -> str:
135
- logger.info(f"Downloading file: {path_in_repo}")
136
- local_path = hf_hub_download(
137
- repo_id=repo_id,
138
- filename=path_in_repo,
139
- repo_type="dataset",
140
- token=HF_TOKEN,
141
- local_dir=dest_dir
142
- )
143
- logger.info(f"File downloaded to: {local_path}")
144
- return local_path
145
-
146
- def read_jsons_from_zip(zip_path: str) -> List[Dict[str, Any]]:
147
- logger.info(f"Reading JSON files from zip: {zip_path}")
148
- docs = []
149
- json_count = 0
150
- failed_count = 0
151
-
152
- with zipfile.ZipFile(zip_path, 'r') as z:
153
- json_files = [name for name in z.namelist() if name.lower().endswith('.json')]
154
- logger.info(f"Found {len(json_files)} JSON files in zip")
155
 
156
- for name in json_files:
157
- try:
158
- with z.open(name) as f:
159
- raw_bytes = f.read()
160
-
161
- for encoding in ['utf-8', 'utf-8-sig', 'latin-1', 'cp1251', 'windows-1251']:
162
- try:
163
- text = raw_bytes.decode(encoding)
164
- data = json.loads(text)
165
- docs.append(data)
166
- json_count += 1
167
- logger.debug(f"Successfully loaded {name} with encoding {encoding}")
168
- break
169
- except (UnicodeDecodeError, json.JSONDecodeError):
170
- continue
171
- else:
172
- failed_count += 1
173
- stats['failed_files'] += 1
174
- stats['encoding_errors'].append(name)
175
- logger.warning(f"Failed to load {name} - tried all encodings")
176
-
177
- except Exception as e:
178
- failed_count += 1
179
- stats['failed_files'] += 1
180
- logger.error(f"Error processing {name}: {e}")
181
-
182
- logger.info(f"Successfully loaded {json_count} JSON files, failed: {failed_count}")
183
- return docs
184
-
185
- def chunk_text_field(text: str, doc_meta: Dict[str, Any], splitter: SentenceSplitter) -> List[Document]:
186
- nodes = splitter.split_text(text)
187
- chunks = []
188
-
189
- for i, node_text in enumerate(nodes):
190
- md = dict(doc_meta)
191
- md.update({
192
- 'chunk_id': f"{md.get('document_id','unknown')}_text_{i}",
193
- 'chunk_type': 'text'
194
- })
195
- chunks.append(Document(text=node_text, metadata=md))
196
-
197
- stats['total_text_chunks'] += len(chunks)
198
- logger.debug(f"Created {len(chunks)} text chunks for document {doc_meta.get('document_id')}")
199
- return chunks
200
-
201
- def chunk_table(table: Dict[str, Any], table_meta: Dict[str, Any], max_rows: int = TABLE_MAX_ROWS_PER_CHUNK) -> List[Document]:
202
- headers = table.get('headers') or []
203
- rows = table.get('data') or []
204
-
205
- stats['total_tables'] += 1
206
-
207
- if not rows:
208
- text = table.get('table_description') or table.get('table_title') or ''
209
- md = {**table_meta, 'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_single"}
210
- stats['total_table_chunks'] += 1
211
- logger.debug(f"Created single chunk for empty table: {table_meta.get('table_title')}")
212
- return [Document(text=text, metadata=md)]
213
-
214
- chunks = []
215
- num_chunks = (len(rows) + max_rows - 1) // max_rows
216
-
217
- for i in range(0, len(rows), max_rows):
218
- block = rows[i:i+max_rows]
219
- lines = []
220
- lines.append(f"Table {table_meta.get('table_number','?')} - {table_meta.get('table_title','')}")
221
- lines.append(f"Headers: {headers}")
222
- for r in block:
223
- row_items = [f"{k}: {v}" for k, v in r.items()]
224
- lines.append(" | ".join(row_items))
225
- chunk_text = "\n".join(lines)
226
- md = dict(table_meta)
227
- md.update({'chunk_type': 'table', 'chunk_id': f"{table_meta.get('document_id')}_table_{i // max_rows}"})
228
- chunks.append(Document(text=chunk_text, metadata=md))
229
-
230
- stats['total_table_chunks'] += len(chunks)
231
- logger.debug(f"Table '{table_meta.get('table_title')}': {len(rows)} rows split into {len(chunks)} chunks")
232
- return chunks
233
-
234
- def chunk_image(image_entry: Dict[str, Any], image_meta: Dict[str, Any]) -> Document:
235
- txt = f"Image: {image_entry.get('Название изображения') or image_entry.get('title','')}. "
236
- txt += f"Описание: {image_entry.get('Описание изображение') or image_entry.get('description','')}. "
237
- txt += f"Файл: {image_entry.get('Файл изображения') or image_entry.get('file','')}."
238
- md = dict(image_meta)
239
- md.update({'chunk_type': 'image', 'chunk_id': f"{image_meta.get('document_id')}_image_{image_entry.get('№ Изображения','0')}"})
240
-
241
- stats['total_images'] += 1
242
- logger.debug(f"Created image chunk: {image_entry.get('Название изображения', 'unknown')}")
243
- return Document(text=txt, metadata=md)
244
-
245
- def build_chunks_from_repo(repo_id: str) -> List[Document]:
246
- logger.info("=" * 80)
247
- logger.info("Starting document processing from repository")
248
- logger.info("=" * 80)
249
-
250
- zip_paths = list_zip_files_in_repo(repo_id)
251
- logger.info(f"Total zip files to process: {len(zip_paths)}")
252
-
253
- splitter = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
254
- logger.info(f"Text splitter configured: chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}")
255
-
256
- all_chunks = []
257
 
258
- for zip_idx, remote_path in enumerate(zip_paths, 1):
259
- logger.info(f"\n[{zip_idx}/{len(zip_paths)}] Processing zip file: {remote_path}")
260
- local_zip = download_file_from_hf(repo_id, remote_path, DOWNLOAD_DIR)
261
- json_docs = read_jsons_from_zip(local_zip)
262
 
263
- logger.info(f"Processing {len(json_docs)} documents from {remote_path}")
264
- stats['total_documents'] += len(json_docs)
 
 
 
 
 
 
 
 
265
 
266
- for doc_idx, doc in enumerate(json_docs, 1):
267
- doc_meta = doc.get('document_metadata', {})
268
- doc_id = doc_meta.get('document_id') or doc_meta.get('document_name') or f'unknown_doc_{doc_idx}'
269
- base_meta = {'document_id': doc_id, 'document_name': doc_meta.get('document_name','')}
270
 
271
- logger.info(f" Document [{doc_idx}/{len(json_docs)}]: {doc_id}")
 
 
 
 
 
 
 
 
272
 
273
- sections = doc.get('sections', [])
274
- if sections:
275
- logger.info(f" Processing {len(sections)} text sections")
276
- for sec in sections:
277
- sec_meta = dict(base_meta)
278
- sec_meta.update({'section_id': sec.get('section_id'), 'section_title': None})
279
- text = sec.get('section_text') or sec.get('text') or ''
280
- if text and text.strip():
281
- chunks = chunk_text_field(text, sec_meta, splitter)
282
- all_chunks.extend(chunks)
283
-
284
- tables = doc.get('sheets', []) + doc.get('tables', []) if (doc.get('sheets') or doc.get('tables')) else []
285
- if tables:
286
- logger.info(f" Processing {len(tables)} tables")
287
- for tbl_idx, sheet in enumerate(tables, 1):
288
- table_meta = dict(base_meta)
289
- table_meta.update({
290
- 'sheet_name': sheet.get('sheet_name') or sheet.get('table_title'),
291
- 'section': sheet.get('section'),
292
- 'table_number': sheet.get('table_number'),
293
- 'table_title': sheet.get('table_title')
294
- })
295
- table_chunks = chunk_table(sheet, table_meta, max_rows=TABLE_MAX_ROWS_PER_CHUNK)
296
- all_chunks.extend(table_chunks)
297
-
298
- images = doc.get('images', []) or doc.get('image_data', []) or doc.get('image_entries', [])
299
- if images:
300
- logger.info(f" Processing {len(images)} images")
301
- for img in images:
302
- img_meta = dict(base_meta)
303
- chunk = chunk_image(img, img_meta)
304
- all_chunks.append(chunk)
305
-
306
- logger.info("\n" + "=" * 80)
307
- logger.info("PROCESSING SUMMARY")
308
- logger.info("=" * 80)
309
- logger.info(f"Total documents processed: {stats['total_documents']}")
310
- logger.info(f"Total text chunks created: {stats['total_text_chunks']}")
311
- logger.info(f"Total tables processed: {stats['total_tables']}")
312
- logger.info(f"Total table chunks created: {stats['total_table_chunks']}")
313
- logger.info(f"Total images processed: {stats['total_images']}")
314
- logger.info(f"Total chunks created: {len(all_chunks)}")
315
- logger.info(f"Failed files: {stats['failed_files']}")
316
-
317
- if stats['encoding_errors']:
318
- logger.warning(f"Files with encoding errors ({len(stats['encoding_errors'])}):")
319
- for err_file in stats['encoding_errors'][:10]:
320
- logger.warning(f" - {err_file}")
321
- if len(stats['encoding_errors']) > 10:
322
- logger.warning(f" ... and {len(stats['encoding_errors']) - 10} more")
323
-
324
- logger.info("=" * 80)
325
- return all_chunks
326
-
327
- def create_hybrid_index(documents):
328
- logger.info("Creating hybrid index system")
329
- logger.info(f"Building vector index from {len(documents)} documents")
330
- vector_index = VectorStoreIndex.from_documents(documents)
331
- logger.info("Vector index created successfully")
332
-
333
- logger.info("Building keyword index")
334
- keyword_index = KeywordTableIndex.from_documents(documents)
335
- logger.info("Keyword index created successfully")
336
-
337
- return vector_index, keyword_index
338
-
339
- def create_fusion_retriever(vector_index, keyword_index, documents):
340
- logger.info("Creating fusion retriever with multiple retrieval strategies")
341
-
342
- vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)
343
- logger.info("Vector retriever configured (top_k=5)")
344
-
345
- bm25_retriever = BM25Retriever.from_defaults(
346
- docstore=vector_index.docstore,
347
- similarity_top_k=5
348
- )
349
- logger.info("BM25 retriever configured (top_k=5)")
350
-
351
- fusion_retriever = QueryFusionRetriever(
352
- [vector_retriever, bm25_retriever],
353
- similarity_top_k=5,
354
- num_queries=1,
355
- mode="reciprocal_rerank",
356
- use_async=False
357
- )
358
- logger.info("Fusion retriever created with reciprocal rerank mode")
359
-
360
- return fusion_retriever
361
-
362
- def create_query_engine(vector_index, keyword_index, documents):
363
- logger.info("Creating query engine")
364
- fusion_retriever = create_fusion_retriever(vector_index, keyword_index, documents)
365
-
366
- response_synthesizer = get_response_synthesizer(
367
- response_mode=ResponseMode.COMPACT,
368
- use_async=False
369
- )
370
- logger.info("Response synthesizer configured (COMPACT mode)")
371
-
372
- query_engine = RetrieverQueryEngine(
373
- retriever=fusion_retriever,
374
- response_synthesizer=response_synthesizer
375
- )
376
- logger.info("Query engine created successfully")
377
-
378
- return query_engine
379
-
380
- def initialize_system():
381
- logger.info("\n" + "=" * 80)
382
- logger.info("INITIALIZING AIEXP RAG SYSTEM")
383
- logger.info("=" * 80)
384
-
385
- embed_model = get_embedding_model()
386
- llm = get_llm_model(DEFAULT_MODEL)
387
-
388
- Settings.embed_model = embed_model
389
- Settings.llm = llm
390
- Settings.chunk_size = CHUNK_SIZE
391
- Settings.chunk_overlap = CHUNK_OVERLAP
392
- logger.info("Global settings configured")
393
-
394
- documents = build_chunks_from_repo(HF_REPO_ID)
395
-
396
- vector_index, keyword_index = create_hybrid_index(documents)
397
-
398
- query_engine = create_query_engine(vector_index, keyword_index, documents)
399
-
400
- logger.info("=" * 80)
401
- logger.info("SYSTEM INITIALIZATION COMPLETE")
402
- logger.info("=" * 80)
403
- return query_engine, vector_index, keyword_index, documents
404
-
405
- def answer_question(question, query_engine):
406
- if not question.strip():
407
- return "<div style='color: black;'>Please enter a question</div>"
408
-
409
- try:
410
- logger.info(f"Processing query: {question[:100]}...")
411
- response = query_engine.query(question)
412
- logger.info(f"Query processed, found {len(response.source_nodes)} source nodes")
413
 
414
- answer_html = f"""
415
- <div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; color: black;'>
416
- <h3 style='color: #007bff;'>Answer:</h3>
417
- <p>{response.response}</p>
418
- </div>
419
- """
420
 
421
- sources_html = "<div style='background-color: #e9ecef; padding: 15px; border-radius: 8px; color: black;'>"
422
- sources_html += "<h4>Sources:</h4>"
423
- for i, node in enumerate(response.source_nodes):
424
- sources_html += f"""
425
- <div style='margin: 10px 0; padding: 10px; background-color: white; border-left: 3px solid #007bff;'>
426
- <strong>Document {i+1}:</strong> {node.metadata.get('document_id', 'unknown')}<br>
427
- <strong>Score:</strong> {node.score:.3f}<br>
428
- <strong>Text:</strong> {node.text[:200]}...
429
- </div>
430
- """
431
- sources_html += "</div>"
432
 
433
- return answer_html, sources_html
 
434
 
435
  except Exception as e:
436
- logger.error(f"Error processing query: {e}", exc_info=True)
437
- error_html = f"<div style='color: red;'>Error: {str(e)}</div>"
438
- return error_html, error_html
439
 
440
- def switch_model(model_name, vector_index, keyword_index, documents):
 
 
 
441
  try:
442
- logger.info(f"Switching to model: {model_name}")
 
443
  new_llm = get_llm_model(model_name)
444
  Settings.llm = new_llm
445
 
446
- new_query_engine = create_query_engine(vector_index, keyword_index, documents)
447
- logger.info(f"Successfully switched to model: {model_name}")
448
- return new_query_engine, f" Model switched to: {model_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  except Exception as e:
450
- logger.error(f"Error switching model: {e}")
451
- return None, f" Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
 
453
  query_engine = None
 
 
454
  vector_index = None
455
- keyword_index = None
456
- documents = None
457
  current_model = DEFAULT_MODEL
458
 
459
  def main_answer_question(question):
460
- global query_engine
461
- return answer_question(question, query_engine)
 
 
 
462
 
463
  def main_switch_model(model_name):
464
- global query_engine, vector_index, keyword_index, documents, current_model
465
- new_query_engine, status = switch_model(model_name, vector_index, keyword_index, documents)
 
466
  if new_query_engine:
467
  query_engine = new_query_engine
468
  current_model = model_name
469
- return status
470
-
471
- def create_interface():
472
- with gr.Blocks(title="AIEXP - RAG System", theme=gr.themes.Soft()) as demo:
473
- gr.Markdown("# AIEXP - AI Expert for Regulatory Documentation")
474
-
475
- with gr.Row():
476
- model_dropdown = gr.Dropdown(
477
- choices=list(AVAILABLE_MODELS.keys()),
478
- value=current_model,
479
- label="Select Language Model"
480
- )
481
- switch_btn = gr.Button("Switch Model")
482
- model_status = gr.Textbox(
483
- value=f"Current model: {current_model}",
484
- label="Model Status",
485
- interactive=False
486
- )
487
-
488
- with gr.Row():
489
- question_input = gr.Textbox(
490
- label="Your Question",
491
- placeholder="Ask a question about the documents...",
492
- lines=3
493
- )
494
-
495
- ask_btn = gr.Button("Get Answer", variant="primary")
496
-
497
- with gr.Row():
498
- answer_output = gr.HTML(
499
- label="Answer",
500
- value="<div style='padding: 20px; text-align: center;'>Answer will appear here...</div>"
501
- )
502
- sources_output = gr.HTML(
503
- label="Sources",
504
- value="<div style='padding: 20px; text-align: center;'>Sources will appear here...</div>"
505
- )
506
-
507
- switch_btn.click(
508
- fn=main_switch_model,
509
- inputs=[model_dropdown],
510
- outputs=[model_status]
511
- )
512
-
513
- ask_btn.click(
514
- fn=main_answer_question,
515
- inputs=[question_input],
516
- outputs=[answer_output, sources_output]
517
- )
518
-
519
- question_input.submit(
520
- fn=main_answer_question,
521
- inputs=[question_input],
522
- outputs=[answer_output, sources_output]
523
- )
524
 
525
- return demo
526
 
527
  def main():
528
- global query_engine, vector_index, keyword_index, documents
529
-
530
- logger.info("Starting AIEXP - AI Expert for Regulatory Documentation")
531
-
532
- query_engine, vector_index, keyword_index, documents = initialize_system()
 
 
 
 
 
 
 
 
533
 
534
  if query_engine:
535
- logger.info("Launching web interface on port 7860")
536
- demo = create_interface()
 
 
 
 
 
537
  demo.launch(
538
  server_name="0.0.0.0",
539
  server_port=7860,
540
- share=True
 
541
  )
542
  else:
543
- logger.error("Failed to initialize system")
544
  sys.exit(1)
545
 
546
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import os
3
+ from llama_index.core import Settings
4
+ from documents_prep import load_json_documents, load_table_data, load_image_data, load_csv_chunks
5
+ from utils import get_llm_model, get_embedding_model, get_reranker_model, answer_question
6
+ from my_logging import log_message
7
+ from index_retriever import create_vector_index, create_query_engine
8
  import sys
9
+ from config import (
10
+ HF_REPO_ID, HF_TOKEN, DOWNLOAD_DIR, CHUNKS_FILENAME,
11
+ JSON_FILES_DIR, TABLE_DATA_DIR, IMAGE_DATA_DIR, DEFAULT_MODEL, AVAILABLE_MODELS
 
 
 
 
 
12
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
def create_chunks_display_html(chunk_info):
    """Render retrieved chunks as an HTML panel for the Gradio UI.

    Args:
        chunk_info: list of chunk-metadata dicts (expects at least
            'document_id'; other keys such as 'section_id', 'chunk_text',
            'type' are consumed by the helper formatters).

    Returns:
        str: an HTML fragment listing every chunk, or a placeholder
        message when chunk_info is empty.
    """
    if not chunk_info:
        return "<div style='padding: 20px; text-align: center; color: black;'>Нет данных о чанках</div>"

    # Scrollable container with a header showing the number of hits.
    html = "<div style='max-height: 500px; overflow-y: auto; padding: 10px; color: black;'>"
    html += f"<h4 style='color: black;'>Найдено релевантных чанков: {len(chunk_info)}</h4>"

    for i, chunk in enumerate(chunk_info):
        # Zebra striping: alternate row background for readability.
        bg_color = "#f8f9fa" if i % 2 == 0 else "#e9ecef"

        # Get section display info
        section_display = get_section_display(chunk)
        formatted_content = get_formatted_content(chunk)

        # NOTE(review): chunk['document_id'] is a hard key access and will
        # raise KeyError if a chunk dict lacks it — confirm upstream always
        # sets 'document_id'.
        html += f"""
        <div style='background-color: {bg_color}; padding: 10px; margin: 5px 0; border-radius: 5px; border-left: 4px solid #007bff; color: black;'>
            <strong style='color: black;'>Документ:</strong> <span style='color: black;'>{chunk['document_id']}</span><br>
            <strong style='color: black;'>Раздел:</strong> <span style='color: black;'>{section_display}</span><br>
            <strong style='color: black;'>Содержание:</strong><br>
            <div style='background-color: white; padding: 8px; margin-top: 5px; border-radius: 3px; font-family: monospace; font-size: 12px; color: black; max-height: 200px; overflow-y: auto;'>
                {formatted_content}
            </div>
        </div>
        """

    html += "</div>"
    return html
41
+
42
def get_section_display(chunk):
    """Build a human-readable section label for a retrieved chunk.

    Tables render as "таблица №N" and images as "рисунок №N"; any other
    chunk falls back to its section path, then its section id.
    """
    path = chunk.get('section_path', '')
    sec_id = chunk.get('section_id', 'unknown')
    kind = chunk.get('type', 'text')

    def _numbered(value):
        # Normalize a table/image number so it always carries the "№" prefix.
        text = str(value)
        return text if text.startswith('№') else f"№{text}"

    if kind == 'table' and chunk.get('table_number'):
        return f"таблица {_numbered(chunk.get('table_number'))}"

    if kind == 'image' and chunk.get('image_number'):
        return f"рисунок {_numbered(chunk.get('image_number'))}"

    # Text chunk: prefer the full section path; the section id is both the
    # secondary choice and the final fallback (including 'unknown').
    return path if path else sec_id
65
+
66
def get_formatted_content(chunk):
    """Format a chunk's text with its document/section context for display.

    Produces a Russian-language sentence of the form
    "В разделе <section> в документе <doc>, пункт <title>: <text>".

    Args:
        chunk: chunk-metadata dict; reads 'document_id', 'section_path',
            'section_id', 'section_text', 'parent_section', 'parent_title',
            'level' and 'chunk_text' (all optional).

    Returns:
        str: the formatted sentence.
    """
    document_id = chunk.get('document_id', 'unknown')
    section_path = chunk.get('section_path', '')
    section_id = chunk.get('section_id', 'unknown')
    section_text = chunk.get('section_text', '')
    parent_section = chunk.get('parent_section', '')
    parent_title = chunk.get('parent_title', '')
    level = chunk.get('level', '')
    chunk_text = chunk.get('chunk_text', '')
    # Fix: dropped unused local `doc_type` (`chunk.get('type', 'text')`) —
    # it was assigned and never read.

    # Prefer the full dotted path; fall back to the bare id.
    current_section = section_path if section_path else section_id

    # Nested text sections: mention the parent section (with its title when
    # known) so the reader sees where the clause lives.
    if level in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section:
        parent_info = f"{parent_section} ({parent_title})" if parent_title else parent_section
        return f"В разделе {parent_info} в документе {document_id}, пункт {current_section}: {chunk_text}"

    clean_text = chunk_text
    if section_text and chunk_text.startswith(section_text):
        # The chunk already begins with the section heading — use it as-is.
        section_title = section_text
    elif chunk_text.startswith(f"{current_section} "):
        # Strip the leading section number and synthesize a title from the
        # first sentence (or the first 50 chars when there is no period).
        clean_text = chunk_text[len(f"{current_section} "):].strip()
        section_title = section_text if section_text else f"{current_section} {clean_text.split('.')[0] if '.' in clean_text else clean_text[:50]}"
    else:
        section_title = section_text if section_text else current_section

    return f"В разделе {current_section} в документе {document_id}, пункт {section_title}: {clean_text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
def initialize_system(repo_id, hf_token, download_dir, chunks_filename=None,
                      json_files_dir=None, table_data_dir=None, image_data_dir=None,
                      use_json_instead_csv=False):
    """Bootstrap the RAG stack: models, splitter, documents, index, engine.

    Args:
        repo_id: HF dataset repo holding the source files.
        hf_token: HF access token used for downloads.
        download_dir: local directory for downloaded files (created if absent).
        chunks_filename: CSV with precomputed chunks (CSV path only).
        json_files_dir: repo dir with JSON documents (JSON path only).
        table_data_dir: repo dir with tabular-data JSON files.
        image_data_dir: repo dir with image-description JSON files.
        use_json_instead_csv: when True (and json_files_dir given), load
            documents from JSON instead of the CSV chunks file.

    Returns:
        (query_engine, chunks_df, reranker, vector_index, chunk_info);
        on any failure returns (None, None, None, None, []).
    """
    try:
        # Local import to avoid a circular dependency at module import time
        # — presumably documents_prep imports from this module; confirm.
        from documents_prep import process_documents_with_chunking
        log_message("Инициализация системы")
        os.makedirs(download_dir, exist_ok=True)
        from config import CHUNK_SIZE, CHUNK_OVERLAP
        from llama_index.core.text_splitter import TokenTextSplitter

        # Global llama_index configuration: embedder, LLM and token splitter.
        embed_model = get_embedding_model()
        llm = get_llm_model(DEFAULT_MODEL)
        reranker = get_reranker_model()

        Settings.embed_model = embed_model
        Settings.llm = llm
        Settings.text_splitter = TokenTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separator=" ",
            backup_separators=["\n", ".", "!", "?"]
        )

        log_message(f"Configured chunk size: {CHUNK_SIZE} tokens")
        log_message(f"Configured chunk overlap: {CHUNK_OVERLAP} tokens")

        all_documents = []
        chunks_df = None   # only populated on the CSV path
        chunk_info = []    # per-chunk metadata used by the UI chunk display

        if use_json_instead_csv and json_files_dir:
            # JSON path: documents and their chunk metadata come pre-split.
            log_message("Используем JSON файлы вместо CSV")
            json_documents, json_chunk_info = load_json_documents(repo_id, hf_token, json_files_dir, download_dir)
            all_documents.extend(json_documents)
            chunk_info.extend(json_chunk_info)
        else:
            if chunks_filename:
                # CSV path: precomputed chunks plus the raw DataFrame.
                log_message("Загружаем данные из CSV")
                csv_documents, chunks_df = load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir)
                all_documents.extend(csv_documents)

            # NOTE(review): the table/image supplements are reconstructed as
            # part of the CSV branch; the original indentation is ambiguous in
            # the diff — confirm they should not also run on the JSON path.
            if table_data_dir:
                log_message("Добавляю табличные данные")
                table_documents = load_table_data(repo_id, hf_token, table_data_dir)
                log_message(f"Загружено {len(table_documents)} табличных документов")

                # Process table documents through chunking
                chunked_table_docs, table_chunk_info = process_documents_with_chunking(table_documents)
                all_documents.extend(chunked_table_docs)
                chunk_info.extend(table_chunk_info)

            if image_data_dir:
                log_message("Добавляю данные изображений")
                image_documents = load_image_data(repo_id, hf_token, image_data_dir)
                log_message(f"Загружено {len(image_documents)} документов изображений")

                # Process image documents through chunking
                chunked_image_docs, image_chunk_info = process_documents_with_chunking(image_documents)
                all_documents.extend(chunked_image_docs)
                chunk_info.extend(image_chunk_info)

        log_message(f"Всего документов после всей обработки: {len(all_documents)}")

        # Build the vector index over everything, then wrap it in an engine.
        vector_index = create_vector_index(all_documents)
        query_engine = create_query_engine(vector_index)

        log_message(f"Система успешно инициализирована")
        return query_engine, chunks_df, reranker, vector_index, chunk_info

    except Exception as e:
        # Broad catch by design: any init failure is reported via the
        # all-None tuple so the caller can abort gracefully.
        log_message(f"Ошибка инициализации: {str(e)}")
        return None, None, None, None, []
 
167
 
168
def switch_model(model_name, vector_index):
    """Swap the active LLM and rebuild the query engine on the existing index.

    Returns (query_engine, status_message); query_engine is None on failure
    or when the system has not been initialized yet.
    """
    from llama_index.core import Settings
    from index_retriever import create_query_engine

    try:
        log_message(f"Переключение на модель: {model_name}")

        Settings.llm = get_llm_model(model_name)

        # Guard clause: without an index there is nothing to rebuild.
        if vector_index is None:
            return None, "❌ Ошибка: система не инициализирована"

        engine = create_query_engine(vector_index)
        log_message(f"Модель успешно переключена на: {model_name}")
        return engine, f"✅ Модель переключена на: {model_name}"

    except Exception as e:
        error_msg = f"Ошибка переключения модели: {str(e)}"
        log_message(error_msg)
        return None, f"❌ {error_msg}"
189
+
190
def main_answer_question(question):
    """Gradio callback: answer *question* using the global query engine.

    Returns a 3-tuple of HTML fragments (answer, sources, chunks); on an
    empty question or an error, placeholder/error fragments are returned.
    """
    global query_engine, reranker, current_model, chunks_df

    # Guard clause: nothing to do for a blank question.
    if not question.strip():
        return ("<div style='color: black;'>Пожалуйста, введите вопрос</div>",
                "<div style='color: black;'>Источники появятся после обработки запроса</div>",
                "<div style='color: black;'>Чанки появятся после обработки запроса</div>")

    try:
        # answer_question yields the three HTML panels in display order.
        result = answer_question(question, query_engine, reranker, current_model, chunks_df)
        answer_html, sources_html, chunks_html = result
        return answer_html, sources_html, chunks_html

    except Exception as e:
        log_message(f"Ошибка при ответе на вопрос: {str(e)}")
        error_html = f"<div style='color: red;'>Ошибка: {str(e)}</div>"
        return (error_html,
                "<div style='color: black;'>Источники недоступны из-за ошибки</div>",
                "<div style='color: black;'>Чанки недоступны из-за ошибки</div>")
207
+
208
+
209
+
210
+ def create_demo_interface(answer_question_func, switch_model_func, current_model, chunk_info=None):
211
+ with gr.Blocks(title="AIEXP - AI Expert для нормативной документации", theme=gr.themes.Soft()) as demo:
212
+
213
+ gr.Markdown("""
214
+ # AIEXP - Artificial Intelligence Expert
215
+
216
+ ## Инструмент для работы с нормативной документацией
217
+ """)
218
+
219
+ with gr.Tab("Поиск по нормативным документам"):
220
+ gr.Markdown("### Задайте вопрос по нормативной документации")
221
+
222
+ with gr.Row():
223
+ with gr.Column(scale=2):
224
+ model_dropdown = gr.Dropdown(
225
+ choices=list(AVAILABLE_MODELS.keys()),
226
+ value=current_model,
227
+ label="Выберите языковую модель",
228
+ info="Выберите модель для генерации ответов"
229
+ )
230
+ with gr.Column(scale=1):
231
+ switch_btn = gr.Button("Переключить модель", variant="secondary")
232
+ model_status = gr.Textbox(
233
+ value=f"Текущая модель: {current_model}",
234
+ label="Статус модели",
235
+ interactive=False
236
+ )
237
+
238
+ with gr.Row():
239
+ with gr.Column(scale=3):
240
+ question_input = gr.Textbox(
241
+ label="Ваш вопрос к базе знаний",
242
+ placeholder="Введите вопрос по нормативным документам...",
243
+ lines=3
244
+ )
245
+ ask_btn = gr.Button("Найти ответ", variant="primary", size="lg")
246
+
247
+ gr.Examples(
248
+ examples=[
249
+ "О чем этот рисунок: ГОСТ Р 50.04.07-2022 Приложение Л. Л.1.5 Рисунок Л.2",
250
+ "Л.9 Формула в ГОСТ Р 50.04.07 - 2022 что и о чем там?",
251
+ "Какой стандарт устанавливает порядок признания протоколов испытаний продукции в области использования атомной энергии?",
252
+ "Кто несет ответственность за организацию и проведение признания протоколов испытаний продукции?",
253
+ "В каких случаях могут быть признаны протоколы испытаний, проведенные лабораториями?",
254
+ "В какой таблице можно найти информацию о методы исследований при аттестационных испытаниях технологии термической обработки заготовок из легированных сталей? Какой документ и какой раздел?"
255
+ ],
256
+ inputs=question_input
257
+ )
258
+
259
+ with gr.Row():
260
+ with gr.Column(scale=2):
261
+ answer_output = gr.HTML(
262
+ label="",
263
+ value=f"<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появится ответ на ваш вопрос...<br><small>Текущая модель: {current_model}</small></div>",
264
+ )
265
+
266
+ with gr.Column(scale=1):
267
+ sources_output = gr.HTML(
268
+ label="",
269
+ value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
270
+ )
271
+
272
+ with gr.Column(scale=1):
273
+ chunks_output = gr.HTML(
274
+ label="Релевантные чанки",
275
+ value="<div style='background-color: #2d3748; color: white; padding: 20px; border-radius: 10px; text-align: center;'>Здесь появятся релевантные чанки...</div>",
276
+ )
277
+
278
+ switch_btn.click(
279
+ fn=switch_model_func,
280
+ inputs=[model_dropdown],
281
+ outputs=[model_status]
282
+ )
283
+
284
+ ask_btn.click(
285
+ fn=answer_question_func,
286
+ inputs=[question_input],
287
+ outputs=[answer_output, sources_output, chunks_output]
288
+ )
289
+
290
+ question_input.submit(
291
+ fn=answer_question_func,
292
+ inputs=[question_input],
293
+ outputs=[answer_output, sources_output, chunks_output]
294
+ )
295
+ return demo
296
+
297
 
298
  query_engine = None
299
+ chunks_df = None
300
+ reranker = None
301
  vector_index = None
 
 
302
  current_model = DEFAULT_MODEL
303
 
304
  def main_answer_question(question):
305
+ global query_engine, reranker, current_model, chunks_df
306
+ answer_html, sources_html, chunks_html = answer_question(
307
+ question, query_engine, reranker, current_model, chunks_df
308
+ )
309
+ return answer_html, sources_html, chunks_html
310
 
311
  def main_switch_model(model_name):
312
+ global query_engine, vector_index, current_model
313
+
314
+ new_query_engine, status_message = switch_model(model_name, vector_index)
315
  if new_query_engine:
316
  query_engine = new_query_engine
317
  current_model = model_name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
+ return status_message
320
 
321
  def main():
322
+ global query_engine, chunks_df, reranker, vector_index, current_model
323
+
324
+ log_message("Запуск AIEXP - AI Expert для нормативной документации")
325
+
326
+ query_engine, chunks_df, reranker, vector_index, chunk_info = initialize_system(
327
+ repo_id=HF_REPO_ID,
328
+ hf_token=HF_TOKEN,
329
+ download_dir=DOWNLOAD_DIR,
330
+ json_files_dir=JSON_FILES_DIR,
331
+ table_data_dir=TABLE_DATA_DIR,
332
+ image_data_dir=IMAGE_DATA_DIR,
333
+ use_json_instead_csv=True,
334
+ )
335
 
336
  if query_engine:
337
+ log_message("Запуск веб-интерфейса")
338
+ demo = create_demo_interface(
339
+ answer_question_func=main_answer_question,
340
+ switch_model_func=main_switch_model,
341
+ current_model=current_model,
342
+ chunk_info=chunk_info
343
+ )
344
  demo.launch(
345
  server_name="0.0.0.0",
346
  server_port=7860,
347
+ share=True,
348
+ debug=False
349
  )
350
  else:
351
+ log_message("Невозможно запустить приложение из-за ошибки инициализации")
352
  sys.exit(1)
353
 
354
  if __name__ == "__main__":
documents_prep.py CHANGED
@@ -1,381 +1,229 @@
1
  import json
2
  import zipfile
3
  import pandas as pd
4
- from collections import Counter
5
  from huggingface_hub import hf_hub_download, list_repo_files
6
  from llama_index.core import Document
7
- from llama_index.core.text_splitter import SentenceSplitter
8
  from my_logging import log_message
 
9
  from config import CHUNK_SIZE, CHUNK_OVERLAP
 
10
 
11
 
12
- # ============================================================================
13
- # TEXT CHUNKING
14
- # ============================================================================
15
-
16
- def chunk_text_document(doc):
17
- """Split text document into chunks using sentence splitter"""
 
 
 
 
 
18
  text_splitter = SentenceSplitter(
19
- chunk_size=CHUNK_SIZE,
20
- chunk_overlap=CHUNK_OVERLAP,
21
  separator=" "
22
  )
23
 
24
  text_chunks = text_splitter.split_text(doc.text)
25
- chunked_docs = []
26
-
27
- for i, chunk_text in enumerate(text_chunks):
28
- chunk_metadata = doc.metadata.copy()
29
- chunk_metadata.update({
30
- "chunk_id": i,
31
- "total_chunks": len(text_chunks),
32
- "chunk_size": len(chunk_text)
33
- })
34
-
35
- chunked_docs.append(Document(text=chunk_text, metadata=chunk_metadata))
36
-
37
- return chunked_docs
38
-
39
-
40
- # ============================================================================
41
- # TABLE PROCESSING
42
- # ============================================================================
43
-
44
- def extract_table_metadata(table_text):
45
- """Extract key terms from table for enrichment"""
46
- words = table_text.split()
47
-
48
- # Filter stopwords and short words
49
- stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
50
- filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
51
-
52
- # Get top 15 most common terms
53
- common = Counter(filtered).most_common(15)
54
- key_terms = [w for w, _ in common]
55
-
56
- return {
57
- "summary": f"Таблица содержит {len(words)} слов",
58
- "key_terms": key_terms
59
- }
60
-
61
-
62
- def create_table_content(table_data):
63
- """Format table data as text"""
64
- doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
65
- table_num = table_data.get('table_number', 'Неизвестно')
66
- table_title = table_data.get('table_title', 'Неизвестно')
67
- section = table_data.get('section', 'Неизвестно')
68
-
69
- content = f"Таблица: {table_num}\n"
70
- content += f"Название: {table_title}\n"
71
- content += f"Документ: {doc_id}\n"
72
- content += f"Раздел: {section}\n"
73
-
74
- # Add headers
75
- headers = table_data.get('headers', [])
76
- if headers:
77
- content += f"\nЗаголовки: {' | '.join(headers)}\n"
78
-
79
- # Add data rows
80
- if 'data' in table_data and isinstance(table_data['data'], list):
81
- content += "\nДанные таблицы:\n"
82
- for row_idx, row in enumerate(table_data['data'], start=1):
83
- if isinstance(row, dict):
84
- row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
85
- content += f"Строка {row_idx}: {row_text}\n"
86
-
87
- return content
88
-
89
-
90
- def chunk_table_by_rows(doc):
91
- """Split large table into chunks by rows, preserving headers"""
92
- # Extract metadata
93
- table_metadata = extract_table_metadata(doc.text)
94
- table_num = doc.metadata.get('table_number', 'unknown')
95
- table_title = doc.metadata.get('table_title', 'unknown')
96
-
97
- # Parse table structure
98
- lines = doc.text.strip().split('\n')
99
 
100
- # Separate header and data rows
101
- table_header_lines = []
102
- data_rows = []
103
- in_data = False
104
-
105
- for line in lines:
106
- if line.startswith('Данные таблицы:'):
107
- in_data = True
108
- table_header_lines.append(line)
109
- elif in_data and line.startswith('Строка'):
110
- data_rows.append(line)
111
- elif not in_data:
112
- table_header_lines.append(line)
113
-
114
- table_header = '\n'.join(table_header_lines) + '\n'
115
-
116
- # If no rows, use standard text splitting
117
- if not data_rows:
118
- log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
119
- return chunk_text_document(doc)
120
-
121
- log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
122
-
123
- # Row-based chunking
124
- header_size = len(table_header)
125
- available_size = CHUNK_SIZE - header_size - 300 # Reserve space for enrichment
126
-
127
- text_chunks = []
128
- current_chunk_rows = []
129
- current_size = 0
130
-
131
- for row in data_rows:
132
- row_size = len(row) + 1
133
-
134
- # If adding this row exceeds limit, create chunk
135
- if current_size + row_size > available_size and current_chunk_rows:
136
- chunk_text = table_header + '\n'.join(current_chunk_rows)
137
- text_chunks.append(chunk_text)
138
- log_message(f" ✂️ Создан чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
139
-
140
- # Keep last 2 rows for overlap
141
- overlap_count = min(2, len(current_chunk_rows))
142
- current_chunk_rows = current_chunk_rows[-overlap_count:]
143
- current_size = sum(len(r) + 1 for r in current_chunk_rows)
144
-
145
- current_chunk_rows.append(row)
146
- current_size += row_size
147
-
148
- # Final chunk
149
- if current_chunk_rows:
150
- chunk_text = table_header + '\n'.join(current_chunk_rows)
151
- text_chunks.append(chunk_text)
152
- log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
153
-
154
- log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
155
-
156
- # Create enriched chunks with metadata
157
  chunked_docs = []
158
- key_terms = table_metadata.get("key_terms", [])
159
-
160
  for i, chunk_text in enumerate(text_chunks):
161
  chunk_metadata = doc.metadata.copy()
162
  chunk_metadata.update({
163
  "chunk_id": i,
164
  "total_chunks": len(text_chunks),
165
  "chunk_size": len(chunk_text),
166
- "is_chunked": True,
167
- "key_terms": key_terms
168
  })
169
 
170
- # Add enrichment prefix
171
- terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
172
- enriched_text = f"""[Таблица {table_num}: {table_title}]
173
- [Ключевые термины: {terms_str}]
174
-
175
- {chunk_text}"""
176
-
177
- chunked_docs.append(Document(text=enriched_text, metadata=chunk_metadata))
178
 
179
  return chunked_docs
180
 
181
 
182
- def table_to_document(table_data, document_id=None):
183
- """Convert table data to Document, chunking if needed"""
184
- if not isinstance(table_data, dict):
185
- log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
186
- return []
187
-
188
- doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
189
- table_num = table_data.get('table_number', 'Неизвестно')
190
- table_title = table_data.get('table_title', 'Неизвестно')
191
- section = table_data.get('section', 'Неизвестно')
192
-
193
- table_rows = table_data.get('data', [])
194
- if not table_rows:
195
- log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} - нет данных")
196
- return []
197
-
198
- content = create_table_content(table_data)
199
- content_size = len(content)
200
-
201
- base_doc = Document(
202
- text=content,
203
- metadata={
204
- "type": "table",
205
- "table_number": table_num,
206
- "table_title": table_title,
207
- "document_id": doc_id,
208
- "section": section,
209
- "section_id": section,
210
- "total_rows": len(table_rows),
211
- "content_size": content_size
212
- }
213
- )
214
-
215
- # Chunk if needed
216
- if content_size > CHUNK_SIZE:
217
- log_message(f"📊 CHUNKING: Таблица {table_num} | Размер: {content_size} > {CHUNK_SIZE}")
218
- return chunk_table_by_rows(base_doc)
219
- else:
220
- log_message(f"✓ Таблица {table_num} | Размер: {content_size} символов | Строк: {len(table_rows)}")
221
- return [base_doc]
222
-
223
-
224
- def load_table_data(repo_id, hf_token, table_data_dir):
225
- """Load all table data from HuggingFace repo"""
226
- log_message("=" * 60)
227
- log_message("ЗАГРУЗКА ТАБЛИЧНЫХ ДАННЫХ")
228
- log_message("=" * 60)
229
 
230
- try:
231
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
232
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
233
-
234
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
235
-
236
- table_documents = []
237
-
238
- for file_path in table_files:
239
- try:
240
- local_path = hf_hub_download(
241
- repo_id=repo_id,
242
- filename=file_path,
243
- local_dir='',
244
- repo_type="dataset",
245
- token=hf_token
246
- )
247
-
248
- log_message(f"\nОбработка файла: {file_path}")
249
-
250
- with open(local_path, 'r', encoding='utf-8') as f:
251
- table_data = json.load(f)
252
-
253
- if isinstance(table_data, dict):
254
- document_id = table_data.get('document', 'unknown')
255
-
256
- # Process sheets if present
257
- if 'sheets' in table_data:
258
- sorted_sheets = sorted(
259
- table_data['sheets'],
260
- key=lambda sheet: sheet.get('table_number', '')
261
- )
262
-
263
- for sheet in sorted_sheets:
264
- sheet['document'] = document_id
265
- docs_list = table_to_document(sheet, document_id)
266
- table_documents.extend(docs_list)
267
- else:
268
- docs_list = table_to_document(table_data, document_id)
269
- table_documents.extend(docs_list)
270
-
271
- except Exception as e:
272
- log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
273
- continue
274
 
275
- log_message(f"\n{'='*60}")
276
- log_message(f"Загружено {len(table_documents)} табличных документов")
277
- log_message("=" * 60)
 
 
 
 
278
 
279
- return table_documents
 
 
 
 
 
 
 
 
 
 
280
 
281
- except Exception as e:
282
- log_message(f"❌ ОШИБКА загрузки таблиц: {str(e)}")
283
- return []
284
-
285
-
286
- # ============================================================================
287
- # JSON TEXT DOCUMENTS
288
- # ============================================================================
289
-
290
- def extract_section_title(section_text):
291
- """Extract clean title from section text"""
292
- if not section_text.strip():
293
- return ""
294
-
295
- first_line = section_text.strip().split('\n')[0].strip()
296
-
297
- if len(first_line) < 200 and not first_line.endswith('.'):
298
- return first_line
299
 
300
- sentences = first_line.split('.')
301
- if len(sentences) > 1:
302
- return sentences[0].strip()
 
 
 
 
 
 
303
 
304
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
305
 
306
 
307
  def extract_text_from_json(data, document_id, document_name):
308
- """Extract text documents from JSON structure"""
309
  documents = []
310
 
311
- if 'sections' not in data:
312
- return documents
313
-
314
- for section in data['sections']:
315
- section_id = section.get('section_id', 'Unknown')
316
- section_text = section.get('section_text', '')
317
-
318
- if section_text.strip():
319
  section_title = extract_section_title(section_text)
320
- doc = Document(
321
- text=section_text,
322
- metadata={
323
- "type": "text",
324
- "document_id": document_id,
325
- "document_name": document_name,
326
- "section_id": section_id,
327
- "section_text": section_title[:200],
328
- "section_path": section_id,
329
- "level": "section"
330
- }
331
- )
332
- documents.append(doc)
333
-
334
- # Process subsections recursively
335
- if 'subsections' in section:
336
- for subsection in section['subsections']:
337
- subsection_id = subsection.get('subsection_id', 'Unknown')
338
- subsection_text = subsection.get('subsection_text', '')
339
-
340
- if subsection_text.strip():
341
  subsection_title = extract_section_title(subsection_text)
342
- doc = Document(
343
- text=subsection_text,
344
- metadata={
345
- "type": "text",
346
- "document_id": document_id,
347
- "document_name": document_name,
348
- "section_id": subsection_id,
349
- "section_text": subsection_title[:200],
350
- "section_path": f"{section_id}.{subsection_id}",
351
- "level": "subsection",
352
- "parent_section": section_id
353
- }
354
- )
355
- documents.append(doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
  return documents
358
 
359
-
360
  def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
361
- """Load JSON documents from HuggingFace repo"""
362
- log_message("=" * 60)
363
- log_message("ЗАГРУЗКА JSON ДОКУМЕНТОВ")
364
- log_message("=" * 60)
365
 
366
  try:
367
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
368
  zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
369
  json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
370
 
371
- log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} JSON файлов")
372
 
373
  all_documents = []
374
 
375
- # Process ZIP files
376
  for zip_file_path in zip_files:
377
  try:
378
- log_message(f"Загружаю ZIP: {zip_file_path}")
379
  local_zip_path = hf_hub_download(
380
  repo_id=repo_id,
381
  filename=zip_file_path,
@@ -384,30 +232,17 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
384
  token=hf_token
385
  )
386
 
387
- with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
388
- json_files_in_zip = [f for f in zip_ref.namelist()
389
- if f.endswith('.json') and not f.startswith('__MACOSX')]
390
-
391
- for json_file in json_files_in_zip:
392
- with zip_ref.open(json_file) as f:
393
- json_data = json.load(f)
394
-
395
- metadata = json_data.get('document_metadata', {})
396
- doc_id = metadata.get('document_id', 'unknown')
397
- doc_name = metadata.get('document_name', 'unknown')
398
-
399
- docs = extract_text_from_json(json_data, doc_id, doc_name)
400
- all_documents.extend(docs)
401
-
402
- log_message(f"Извлечено документов из ZIP: {len(all_documents)}")
403
 
404
  except Exception as e:
405
- log_message(f" ОШИБКА ZIP {zip_file_path}: {str(e)}")
406
  continue
407
 
408
- # Process direct JSON files
409
  for file_path in json_files:
410
  try:
 
411
  local_path = hf_hub_download(
412
  repo_id=repo_id,
413
  filename=file_path,
@@ -419,52 +254,100 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
419
  with open(local_path, 'r', encoding='utf-8') as f:
420
  json_data = json.load(f)
421
 
422
- metadata = json_data.get('document_metadata', {})
423
- doc_id = metadata.get('document_id', 'unknown')
424
- doc_name = metadata.get('document_name', 'unknown')
 
 
 
425
 
426
- docs = extract_text_from_json(json_data, doc_id, doc_name)
427
- all_documents.extend(docs)
428
 
429
  except Exception as e:
430
- log_message(f" ОШИБКА JSON {file_path}: {str(e)}")
431
  continue
432
 
433
- log_message(f"Всего загружено {len(all_documents)} текстовых документов")
434
 
435
- # Chunk all documents
436
  chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
437
 
438
- log_message(f"После chunking: {len(chunked_documents)} чанков")
439
- log_message("=" * 60)
440
 
441
  return chunked_documents, chunk_info
442
 
443
  except Exception as e:
444
- log_message(f" ОШИБКА загрузки JSON: {str(e)}")
445
  return [], []
446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
- # ============================================================================
449
- # IMAGE DATA
450
- # ============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
 
452
  def load_image_data(repo_id, hf_token, image_data_dir):
453
- """Load image metadata from CSV files"""
454
- log_message("=" * 60)
455
- log_message("ЗАГРУЗК�� ДАННЫХ ИЗОБРАЖЕНИЙ")
456
- log_message("=" * 60)
457
 
 
458
  try:
459
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
460
- image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.csv')]
 
 
461
 
462
  log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
463
 
464
  image_documents = []
465
-
466
  for file_path in image_files:
467
  try:
 
468
  local_path = hf_hub_download(
469
  repo_id=repo_id,
470
  filename=file_path,
@@ -474,14 +357,18 @@ def load_image_data(repo_id, hf_token, image_data_dir):
474
  )
475
 
476
  df = pd.read_csv(local_path)
477
- log_message(f"Загружено {len(df)} изображений из {file_path}")
478
 
 
479
  for _, row in df.iterrows():
 
 
480
  content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
481
  content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
482
- content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"
483
  content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
484
- content += f"Раздел: {row.get('Раздел документа', 'Неизвестно')}\n"
 
485
 
486
  doc = Document(
487
  text=content,
@@ -489,147 +376,29 @@ def load_image_data(repo_id, hf_token, image_data_dir):
489
  "type": "image",
490
  "image_number": str(row.get('№ Изображения', 'unknown')),
491
  "image_title": str(row.get('Название изображения', 'unknown')),
 
492
  "document_id": str(row.get('Обозначение документа', 'unknown')),
493
- "section": str(row.get('Раздел документа', 'unknown'))
 
 
494
  }
495
  )
496
  image_documents.append(doc)
497
 
498
  except Exception as e:
499
- log_message(f" ОШИБКА файла {file_path}: {str(e)}")
500
  continue
501
 
502
- log_message(f"Загружено {len(image_documents)} документов изображений")
503
- log_message("=" * 60)
504
-
505
  return image_documents
506
 
507
  except Exception as e:
508
- log_message(f" ОШИБКА загрузки изображений: {str(e)}")
509
  return []
510
 
511
 
512
- # ============================================================================
513
- # DOCUMENT PROCESSING WITH CHUNKING
514
- # ============================================================================
515
-
516
- def process_documents_with_chunking(documents):
517
- """Process all documents and chunk if needed"""
518
- all_chunked_docs = []
519
- chunk_info = []
520
-
521
- stats = {
522
- 'text_chunks': 0,
523
- 'table_whole': 0,
524
- 'table_chunks': 0,
525
- 'image_whole': 0,
526
- 'image_chunks': 0
527
- }
528
-
529
- for doc in documents:
530
- doc_type = doc.metadata.get('type', 'text')
531
- is_already_chunked = doc.metadata.get('is_chunked', False)
532
- doc_size = len(doc.text)
533
-
534
- # Tables - already chunked or whole
535
- if doc_type == 'table':
536
- if is_already_chunked:
537
- stats['table_chunks'] += 1
538
- else:
539
- stats['table_whole'] += 1
540
-
541
- all_chunked_docs.append(doc)
542
- chunk_info.append({
543
- 'document_id': doc.metadata.get('document_id', 'unknown'),
544
- 'section_id': doc.metadata.get('section_id', 'unknown'),
545
- 'chunk_id': doc.metadata.get('chunk_id', 0),
546
- 'total_chunks': doc.metadata.get('total_chunks', 1),
547
- 'chunk_size': doc_size,
548
- 'chunk_preview': doc.text[:200] + "..." if doc_size > 200 else doc.text,
549
- 'type': 'table',
550
- 'table_number': doc.metadata.get('table_number', 'unknown')
551
- })
552
-
553
- # Images - chunk if too large
554
- elif doc_type == 'image':
555
- if doc_size > CHUNK_SIZE:
556
- log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | Размер: {doc_size}")
557
- chunked_docs = chunk_text_document(doc)
558
- stats['image_chunks'] += len(chunked_docs)
559
- all_chunked_docs.extend(chunked_docs)
560
-
561
- for i, chunk_doc in enumerate(chunked_docs):
562
- chunk_info.append({
563
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
564
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
565
- 'chunk_id': i,
566
- 'chunk_size': len(chunk_doc.text),
567
- 'chunk_preview': chunk_doc.text[:200] + "...",
568
- 'type': 'image',
569
- 'image_number': chunk_doc.metadata.get('image_number', 'unknown')
570
- })
571
- else:
572
- stats['image_whole'] += 1
573
- all_chunked_docs.append(doc)
574
- chunk_info.append({
575
- 'document_id': doc.metadata.get('document_id', 'unknown'),
576
- 'section_id': doc.metadata.get('section_id', 'unknown'),
577
- 'chunk_id': 0,
578
- 'chunk_size': doc_size,
579
- 'chunk_preview': doc.text[:200] + "...",
580
- 'type': 'image',
581
- 'image_number': doc.metadata.get('image_number', 'unknown')
582
- })
583
-
584
- # Text - chunk if too large
585
- else:
586
- if doc_size > CHUNK_SIZE:
587
- log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | Размер: {doc_size}")
588
- chunked_docs = chunk_text_document(doc)
589
- stats['text_chunks'] += len(chunked_docs)
590
- all_chunked_docs.extend(chunked_docs)
591
-
592
- for i, chunk_doc in enumerate(chunked_docs):
593
- chunk_info.append({
594
- 'document_id': chunk_doc.metadata.get('document_id', 'unknown'),
595
- 'section_id': chunk_doc.metadata.get('section_id', 'unknown'),
596
- 'chunk_id': i,
597
- 'chunk_size': len(chunk_doc.text),
598
- 'chunk_preview': chunk_doc.text[:200] + "...",
599
- 'type': 'text'
600
- })
601
- else:
602
- all_chunked_docs.append(doc)
603
- chunk_info.append({
604
- 'document_id': doc.metadata.get('document_id', 'unknown'),
605
- 'section_id': doc.metadata.get('section_id', 'unknown'),
606
- 'chunk_id': 0,
607
- 'chunk_size': doc_size,
608
- 'chunk_preview': doc.text[:200] + "...",
609
- 'type': 'text'
610
- })
611
-
612
- # Log summary
613
- log_message(f"\n{'='*60}")
614
- log_message("ИТОГОВАЯ СТАТИСТИКА:")
615
- log_message(f" • Текстовые чанки: {stats['text_chunks']}")
616
- log_message(f" • Таблицы (целые): {stats['table_whole']}")
617
- log_message(f" • Таблицы (чанки): {stats['table_chunks']}")
618
- log_message(f" • Изображения (целые): {stats['image_whole']}")
619
- log_message(f" • Изображения (чанки): {stats['image_chunks']}")
620
- log_message(f" • ВСЕГО ДОКУМЕНТОВ: {len(all_chunked_docs)}")
621
- log_message(f"{'='*60}\n")
622
-
623
- return all_chunked_docs, chunk_info
624
-
625
-
626
- # ============================================================================
627
- # CSV CHUNKS (Legacy support)
628
- # ============================================================================
629
-
630
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
631
- """Load pre-chunked data from CSV (legacy support)"""
632
- log_message("Загрузка данны�� из CSV")
633
 
634
  try:
635
  chunks_csv_path = hf_hub_download(
@@ -643,16 +412,17 @@ def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
643
  chunks_df = pd.read_csv(chunks_csv_path)
644
  log_message(f"Загружено {len(chunks_df)} чанков из CSV")
645
 
646
- # Find text column
647
  text_column = None
648
  for col in chunks_df.columns:
649
- if any(keyword in col.lower() for keyword in ['text', 'content', 'chunk']):
650
  text_column = col
651
  break
652
 
653
  if text_column is None:
654
  text_column = chunks_df.columns[0]
655
 
 
 
656
  documents = []
657
  for i, (_, row) in enumerate(chunks_df.iterrows()):
658
  doc = Document(
@@ -665,9 +435,9 @@ def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
665
  )
666
  documents.append(doc)
667
 
668
- log_message(f"Создано {len(documents)} документов из CSV")
669
  return documents, chunks_df
670
 
671
  except Exception as e:
672
- log_message(f" ОШИБКА загрузки CSV: {str(e)}")
673
  return [], None
 
1
  import json
2
  import zipfile
3
  import pandas as pd
 
4
  from huggingface_hub import hf_hub_download, list_repo_files
5
  from llama_index.core import Document
 
6
  from my_logging import log_message
7
+ from llama_index.core.text_splitter import SentenceSplitter
8
  from config import CHUNK_SIZE, CHUNK_OVERLAP
9
+ from table_prep import table_to_document, load_table_data
10
 
11
 
12
+ def chunk_document(doc, chunk_size=None, chunk_overlap=None):
13
+ """
14
+ Universal chunking for text and images.
15
+ Tables use their own row-block chunking.
16
+ """
17
+ if chunk_size is None:
18
+ chunk_size = CHUNK_SIZE
19
+ if chunk_overlap is None:
20
+ chunk_overlap = CHUNK_OVERLAP
21
+
22
+ # Use sentence-aware splitting
23
  text_splitter = SentenceSplitter(
24
+ chunk_size=chunk_size,
25
+ chunk_overlap=chunk_overlap,
26
  separator=" "
27
  )
28
 
29
  text_chunks = text_splitter.split_text(doc.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  chunked_docs = []
 
 
32
  for i, chunk_text in enumerate(text_chunks):
33
  chunk_metadata = doc.metadata.copy()
34
  chunk_metadata.update({
35
  "chunk_id": i,
36
  "total_chunks": len(text_chunks),
37
  "chunk_size": len(chunk_text),
38
+ "original_doc_id": doc.id_ if hasattr(doc, 'id_') else None
 
39
  })
40
 
41
+ chunked_doc = Document(
42
+ text=chunk_text,
43
+ metadata=chunk_metadata
44
+ )
45
+ chunked_docs.append(chunked_doc)
 
 
 
46
 
47
  return chunked_docs
48
 
49
 
50
+ def process_documents_with_chunking(documents):
51
+ """
52
+ Process all document types with appropriate chunking.
53
+ Tables: row-block chunking (handled in table_prep.py)
54
+ Text/Images: sentence-aware chunking
55
+ """
56
+ all_chunked_docs = []
57
+ stats = {
58
+ 'table_whole': 0,
59
+ 'table_chunks': 0,
60
+ 'image_whole': 0,
61
+ 'image_chunks': 0,
62
+ 'text_chunks': 0
63
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ for doc in documents:
66
+ doc_type = doc.metadata.get('type', 'text')
67
+ is_already_chunked = doc.metadata.get('is_chunked', False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ # Tables: already chunked in table_prep.py if needed
70
+ if doc_type == 'table':
71
+ if is_already_chunked:
72
+ stats['table_chunks'] += 1
73
+ else:
74
+ stats['table_whole'] += 1
75
+ all_chunked_docs.append(doc)
76
 
77
+ # Images: chunk if too large
78
+ elif doc_type == 'image':
79
+ doc_size = len(doc.text)
80
+ if doc_size > CHUNK_SIZE:
81
+ log_message(f"📷 CHUNKING: Изображение {doc.metadata.get('image_number')} | {doc_size} > {CHUNK_SIZE}")
82
+ chunked_docs = chunk_document(doc)
83
+ stats['image_chunks'] += len(chunked_docs)
84
+ all_chunked_docs.extend(chunked_docs)
85
+ else:
86
+ stats['image_whole'] += 1
87
+ all_chunked_docs.append(doc)
88
 
89
+ # Text: chunk if too large
90
+ else:
91
+ doc_size = len(doc.text)
92
+ if doc_size > CHUNK_SIZE:
93
+ log_message(f"📝 CHUNKING: Текст '{doc.metadata.get('document_id')}' | {doc_size} > {CHUNK_SIZE}")
94
+ chunked_docs = chunk_document(doc)
95
+ stats['text_chunks'] += len(chunked_docs)
96
+ all_chunked_docs.extend(chunked_docs)
97
+ else:
98
+ all_chunked_docs.append(doc)
 
 
 
 
 
 
 
 
99
 
100
+ log_message(f"\n{'='*60}")
101
+ log_message(f"СТАТИСТИКА ОБРАБОТКИ:")
102
+ log_message(f" • Таблицы (целые): {stats['table_whole']}")
103
+ log_message(f" • Таблицы (чанки): {stats['table_chunks']}")
104
+ log_message(f" • Изображения (целые): {stats['image_whole']}")
105
+ log_message(f" • Изображения (чанки): {stats['image_chunks']}")
106
+ log_message(f" • Текстовые чанки: {stats['text_chunks']}")
107
+ log_message(f" • ВСЕГО: {len(all_chunked_docs)}")
108
+ log_message(f"{'='*60}\n")
109
 
110
+ return all_chunked_docs, [] # Second return value for backward compatibility
111
 
112
 
113
  def extract_text_from_json(data, document_id, document_name):
 
114
  documents = []
115
 
116
+ if 'sections' in data:
117
+ for section in data['sections']:
118
+ section_id = section.get('section_id', 'Unknown')
119
+ section_text = section.get('section_text', '')
120
+
121
+ section_path = f"{section_id}"
 
 
122
  section_title = extract_section_title(section_text)
123
+
124
+ if section_text.strip():
125
+ doc = Document(
126
+ text=section_text,
127
+ metadata={
128
+ "type": "text",
129
+ "document_id": document_id,
130
+ "document_name": document_name,
131
+ "section_id": section_id,
132
+ "section_text": section_title[:200],
133
+ "section_path": section_path,
134
+ "level": "section"
135
+ }
136
+ )
137
+ documents.append(doc)
138
+
139
+ if 'subsections' in section:
140
+ for subsection in section['subsections']:
141
+ subsection_id = subsection.get('subsection_id', 'Unknown')
142
+ subsection_text = subsection.get('subsection_text', '')
 
143
  subsection_title = extract_section_title(subsection_text)
144
+ subsection_path = f"{section_path}.{subsection_id}"
145
+
146
+ if subsection_text.strip():
147
+ doc = Document(
148
+ text=subsection_text,
149
+ metadata={
150
+ "type": "text",
151
+ "document_id": document_id,
152
+ "document_name": document_name,
153
+ "section_id": subsection_id,
154
+ "section_text": subsection_title[:200],
155
+ "section_path": subsection_path,
156
+ "level": "subsection",
157
+ "parent_section": section_id,
158
+ "parent_title": section_title[:100]
159
+ }
160
+ )
161
+ documents.append(doc)
162
+
163
+ if 'sub_subsections' in subsection:
164
+ for sub_subsection in subsection['sub_subsections']:
165
+ sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
166
+ sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
167
+ sub_subsection_title = extract_section_title(sub_subsection_text)
168
+ sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
169
+
170
+ if sub_subsection_text.strip():
171
+ doc = Document(
172
+ text=sub_subsection_text,
173
+ metadata={
174
+ "type": "text",
175
+ "document_id": document_id,
176
+ "document_name": document_name,
177
+ "section_id": sub_subsection_id,
178
+ "section_text": sub_subsection_title[:200],
179
+ "section_path": sub_subsection_path,
180
+ "level": "sub_subsection",
181
+ "parent_section": subsection_id,
182
+ "parent_title": subsection_title[:100]
183
+ }
184
+ )
185
+ documents.append(doc)
186
+
187
+ if 'sub_sub_subsections' in sub_subsection:
188
+ for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
189
+ sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
190
+ sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
191
+ sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
192
+
193
+ if sub_sub_subsection_text.strip():
194
+ doc = Document(
195
+ text=sub_sub_subsection_text,
196
+ metadata={
197
+ "type": "text",
198
+ "document_id": document_id,
199
+ "document_name": document_name,
200
+ "section_id": sub_sub_subsection_id,
201
+ "section_text": sub_sub_subsection_title[:200],
202
+ "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
203
+ "level": "sub_sub_subsection",
204
+ "parent_section": sub_subsection_id,
205
+ "parent_title": sub_subsection_title[:100]
206
+ }
207
+ )
208
+ documents.append(doc)
209
 
210
  return documents
211
 
 
212
  def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
213
+ log_message("Начинаю загрузку JSON документов")
 
 
 
214
 
215
  try:
216
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
217
  zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
218
  json_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.json')]
219
 
220
+ log_message(f"Найдено {len(zip_files)} ZIP файлов и {len(json_files)} прямых JSON файлов")
221
 
222
  all_documents = []
223
 
 
224
  for zip_file_path in zip_files:
225
  try:
226
+ log_message(f"Загружаю ZIP архив: {zip_file_path}")
227
  local_zip_path = hf_hub_download(
228
  repo_id=repo_id,
229
  filename=zip_file_path,
 
232
  token=hf_token
233
  )
234
 
235
+ documents = extract_zip_and_process_json(local_zip_path)
236
+ all_documents.extend(documents)
237
+ log_message(f"Извлечено {len(documents)} документов из ZIP архива {zip_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
  except Exception as e:
240
+ log_message(f"Ошибка обработки ZIP файла {zip_file_path}: {str(e)}")
241
  continue
242
 
 
243
  for file_path in json_files:
244
  try:
245
+ log_message(f"Обрабатываю прямой JSON файл: {file_path}")
246
  local_path = hf_hub_download(
247
  repo_id=repo_id,
248
  filename=file_path,
 
254
  with open(local_path, 'r', encoding='utf-8') as f:
255
  json_data = json.load(f)
256
 
257
+ document_metadata = json_data.get('document_metadata', {})
258
+ document_id = document_metadata.get('document_id', 'unknown')
259
+ document_name = document_metadata.get('document_name', 'unknown')
260
+
261
+ documents = extract_text_from_json(json_data, document_id, document_name)
262
+ all_documents.extend(documents)
263
 
264
+ log_message(f"Извлечено {len(documents)} документов из {file_path}")
 
265
 
266
  except Exception as e:
267
+ log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
268
  continue
269
 
270
+ log_message(f"Всего создано {len(all_documents)} исходных документов из JSON файлов")
271
 
272
+ # Process documents through chunking function
273
  chunked_documents, chunk_info = process_documents_with_chunking(all_documents)
274
 
275
+ log_message(f"После chunking получено {len(chunked_documents)} чанков из JSON данных")
 
276
 
277
  return chunked_documents, chunk_info
278
 
279
  except Exception as e:
280
+ log_message(f"Ошибка загрузки JSON документов: {str(e)}")
281
  return [], []
282
 
283
+ def extract_section_title(section_text):
284
+ if not section_text.strip():
285
+ return ""
286
+
287
+ lines = section_text.strip().split('\n')
288
+ first_line = lines[0].strip()
289
+
290
+ if len(first_line) < 200 and not first_line.endswith('.'):
291
+ return first_line
292
+
293
+ # Otherwise, extract first sentence
294
+ sentences = first_line.split('.')
295
+ if len(sentences) > 1:
296
+ return sentences[0].strip()
297
+
298
+ return first_line[:100] + "..." if len(first_line) > 100 else first_line
299
 
300
+ def extract_zip_and_process_json(zip_path):
301
+ documents = []
302
+
303
+ try:
304
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
305
+ zip_files = zip_ref.namelist()
306
+ json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
307
+
308
+ log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
309
+
310
+ for json_file in json_files:
311
+ try:
312
+ log_message(f"Обрабатываю файл из архива: {json_file}")
313
+
314
+ with zip_ref.open(json_file) as f:
315
+ json_data = json.load(f)
316
+
317
+ document_metadata = json_data.get('document_metadata', {})
318
+ document_id = document_metadata.get('document_id', 'unknown')
319
+ document_name = document_metadata.get('document_name', 'unknown')
320
+
321
+ docs = extract_text_from_json(json_data, document_id, document_name)
322
+ documents.extend(docs)
323
+
324
+ log_message(f"Извлечено {len(docs)} документов из {json_file}")
325
+
326
+ except Exception as e:
327
+ log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
328
+ continue
329
+
330
+ except Exception as e:
331
+ log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
332
+
333
+ return documents
334
 
335
  def load_image_data(repo_id, hf_token, image_data_dir):
336
+ log_message("Начинаю загрузку данных изображений")
 
 
 
337
 
338
+ image_files = []
339
  try:
340
  files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
341
+ for file in files:
342
+ if file.startswith(image_data_dir) and file.endswith('.csv'):
343
+ image_files.append(file)
344
 
345
  log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
346
 
347
  image_documents = []
 
348
  for file_path in image_files:
349
  try:
350
+ log_message(f"Обрабатываю файл изображений: {file_path}")
351
  local_path = hf_hub_download(
352
  repo_id=repo_id,
353
  filename=file_path,
 
357
  )
358
 
359
  df = pd.read_csv(local_path)
360
+ log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
361
 
362
+ # Обработка с правильными названиями колонок
363
  for _, row in df.iterrows():
364
+ section_value = row.get('Раздел документа', 'Неизвестно')
365
+
366
  content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
367
  content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
368
+ content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n" # Опечатка в названии колонки
369
  content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
370
+ content += f"Раздел: {section_value}\n"
371
+ content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
372
 
373
  doc = Document(
374
  text=content,
 
376
  "type": "image",
377
  "image_number": str(row.get('№ Изображения', 'unknown')),
378
  "image_title": str(row.get('Название изображения', 'unknown')),
379
+ "image_description": str(row.get('Описание изображение', 'unknown')),
380
  "document_id": str(row.get('Обозначение документа', 'unknown')),
381
+ "file_path": str(row.get('Файл изображения', 'unknown')),
382
+ "section": str(section_value),
383
+ "section_id": str(section_value)
384
  }
385
  )
386
  image_documents.append(doc)
387
 
388
  except Exception as e:
389
+ log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
390
  continue
391
 
392
+ log_message(f"Создано {len(image_documents)} документов из изображений")
 
 
393
  return image_documents
394
 
395
  except Exception as e:
396
+ log_message(f"Ошибка загрузки данных изображений: {str(e)}")
397
  return []
398
 
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
401
+ log_message("Загружаю данные чанков из CSV")
 
402
 
403
  try:
404
  chunks_csv_path = hf_hub_download(
 
412
  chunks_df = pd.read_csv(chunks_csv_path)
413
  log_message(f"Загружено {len(chunks_df)} чанков из CSV")
414
 
 
415
  text_column = None
416
  for col in chunks_df.columns:
417
+ if 'text' in col.lower() or 'content' in col.lower() or 'chunk' in col.lower():
418
  text_column = col
419
  break
420
 
421
  if text_column is None:
422
  text_column = chunks_df.columns[0]
423
 
424
+ log_message(f"Использую колонку: {text_column}")
425
+
426
  documents = []
427
  for i, (_, row) in enumerate(chunks_df.iterrows()):
428
  doc = Document(
 
435
  )
436
  documents.append(doc)
437
 
438
+ log_message(f"Создано {len(documents)} текстовых документов из CSV")
439
  return documents, chunks_df
440
 
441
  except Exception as e:
442
+ log_message(f"Ошибка загрузки CSV данных: {str(e)}")
443
  return [], None
table_prep.py CHANGED
@@ -1,7 +1,6 @@
1
- from collections import defaultdict
2
- import json
3
- from huggingface_hub import hf_hub_download, list_repo_files
4
  from llama_index.core import Document
 
5
  from my_logging import log_message
6
 
7
  def create_table_content(table_data):
@@ -11,6 +10,7 @@ def create_table_content(table_data):
11
  table_title = table_data.get('table_title', 'Неизвестно')
12
  section = table_data.get('section', 'Неизвестно')
13
 
 
14
  content = f"Таблица: {table_num}\n"
15
  content += f"Название: {table_title}\n"
16
  content += f"Документ: {doc_id}\n"
@@ -20,6 +20,7 @@ def create_table_content(table_data):
20
  if headers:
21
  content += f"\nЗаголовки: {' | '.join(headers)}\n"
22
 
 
23
  if 'data' in table_data and isinstance(table_data['data'], list):
24
  content += "\nДанные таблицы:\n"
25
  for row_idx, row in enumerate(table_data['data'], start=1):
@@ -29,42 +30,24 @@ def create_table_content(table_data):
29
 
30
  return content
31
 
32
- from llama_index.core.text_splitter import SentenceSplitter
33
- from config import CHUNK_SIZE, CHUNK_OVERLAP
34
-
35
- def extract_table_metadata(table_text: str) -> dict:
36
- words = table_text.split()
37
- unique_words = set(words)
38
-
39
- from collections import Counter
40
- stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
41
- filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
42
- common = Counter(filtered).most_common(15)
43
- key_terms = [w for w, _ in common]
44
-
45
- return {
46
- "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
47
- "materials": [], # if you want to extract material names, hook in regex or LLM here
48
- "key_terms": key_terms
49
- }
50
 
51
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
 
 
 
 
 
52
  if chunk_size is None:
53
  chunk_size = CHUNK_SIZE
54
  if chunk_overlap is None:
55
  chunk_overlap = CHUNK_OVERLAP
56
 
57
- # Extract critical metadata from table before chunking
58
- table_metadata = extract_table_metadata(doc.text)
59
  table_num = doc.metadata.get('table_number', 'unknown')
60
- table_title = doc.metadata.get('table_title', 'unknown')
61
  doc_id = doc.metadata.get('document_id', 'unknown')
62
- section = doc.metadata.get('section', 'unknown')
63
 
64
- # Parse table structure from your create_table_content format
65
  lines = doc.text.strip().split('\n')
66
 
67
- # Find where data rows start
68
  table_header_lines = []
69
  data_rows = []
70
  in_data = False
@@ -80,96 +63,68 @@ def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
80
 
81
  table_header = '\n'.join(table_header_lines) + '\n'
82
 
83
- if not data_rows:
84
- log_message(f" ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
85
- text_splitter = SentenceSplitter(
86
- chunk_size=chunk_size,
87
- chunk_overlap=chunk_overlap,
88
- separator="\n"
89
- )
90
- text_chunks = text_splitter.split_text(doc.text)
91
- log_message(f" 📊 Стандартное разбиение: {len(text_chunks)} чанков")
92
- else:
93
- # Row-based chunking
94
- log_message(f" 📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
95
-
96
- header_size = len(table_header)
97
- # Reserve space for enrichment prefix
98
- available_size = chunk_size - header_size - 300
99
-
100
- text_chunks = []
101
- current_chunk_rows = []
102
- current_size = 0
103
-
104
- for row in data_rows:
105
- row_size = len(row) + 1
106
-
107
- # Check if adding this row exceeds limit
108
- if current_size + row_size > available_size and current_chunk_rows:
109
- # Create chunk
110
- chunk_text = table_header + '\n'.join(current_chunk_rows)
111
- text_chunks.append(chunk_text)
112
- log_message(f" ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
113
-
114
- # Overlap: keep last 2 rows
115
- overlap_count = min(2, len(current_chunk_rows))
116
- current_chunk_rows = current_chunk_rows[-overlap_count:]
117
- current_size = sum(len(r) + 1 for r in current_chunk_rows)
118
-
119
- current_chunk_rows.append(row)
120
- current_size += row_size
121
 
122
- # Final chunk
123
- if current_chunk_rows:
 
124
  chunk_text = table_header + '\n'.join(current_chunk_rows)
125
  text_chunks.append(chunk_text)
126
- log_message(f" ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
 
 
 
 
 
 
 
127
 
128
- log_message(f" 📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
 
 
 
129
 
130
- # Create enriched chunks
131
- chunked_docs = []
132
- materials = table_metadata.get("materials", [])
133
- key_terms = table_metadata.get("key_terms", [])
134
 
 
 
135
  for i, chunk_text in enumerate(text_chunks):
136
  chunk_metadata = doc.metadata.copy()
137
  chunk_metadata.update({
138
  "chunk_id": i,
139
  "total_chunks": len(text_chunks),
140
  "chunk_size": len(chunk_text),
141
- "is_chunked": True,
142
- "materials": materials,
143
- "key_terms": key_terms,
144
- "table_summary": table_metadata.get("summary", "")
145
  })
146
 
147
- # Enrichment prefix
148
- materials_str = ', '.join(materials[:10]) if materials else 'нет'
149
- terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
150
-
151
- enriched_text = f"""[Таблица {table_num}: {table_title}]
152
- [Материалы в таблице: {materials_str}]
153
- [Ключевые термины: {terms_str}]
154
-
155
- {chunk_text}"""
156
-
157
- log_message(f" ✓ Чанк {i+1}/{len(text_chunks)}: "
158
- f"размер={len(enriched_text)}, "
159
- f"материалов={len(materials)}, "
160
- f"терминов={len(key_terms)}")
161
-
162
  chunked_doc = Document(
163
- text=enriched_text,
164
  metadata=chunk_metadata
165
  )
166
  chunked_docs.append(chunked_doc)
167
 
168
  return chunked_docs
169
 
 
170
  def table_to_document(table_data, document_id=None):
 
171
  if not isinstance(table_data, dict):
172
- log_message(f"⚠️ ПРОПУЩЕНА: table_data не является словарем")
173
  return []
174
 
175
  doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
@@ -178,13 +133,12 @@ def table_to_document(table_data, document_id=None):
178
  section = table_data.get('section', 'Неизвестно')
179
 
180
  table_rows = table_data.get('data', [])
181
- if not table_rows or len(table_rows) == 0:
182
- log_message(f"⚠️ ПРОПУЩЕНА: Таблица {table_num} из '{doc_id}' - нет данных в 'data'")
183
  return []
184
 
185
  content = create_table_content(table_data)
186
  content_size = len(content)
187
- row_count = len(table_rows)
188
 
189
  base_doc = Document(
190
  text=content,
@@ -195,111 +149,15 @@ def table_to_document(table_data, document_id=None):
195
  "document_id": doc_id,
196
  "section": section,
197
  "section_id": section,
198
- "total_rows": row_count,
199
  "content_size": content_size
200
  }
201
  )
202
 
 
203
  if content_size > CHUNK_SIZE:
204
- log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
205
- f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
206
- chunked_docs = chunk_table_document(base_doc)
207
- log_message(f" ✂️ Разделена на {len(chunked_docs)} чанков")
208
- for i, chunk_doc in enumerate(chunked_docs):
209
- log_message(f" Чанк {i+1}: {chunk_doc.metadata['chunk_size']} символов")
210
- return chunked_docs
211
  else:
212
- log_message(f"✓ ДОБАВЛЕНА: Таблица {table_num} из документа '{doc_id}' | "
213
- f"Размер: {content_size} символов | Строк: {row_count}")
214
- return [base_doc]
215
-
216
-
217
- def load_table_data(repo_id, hf_token, table_data_dir):
218
- log_message("=" * 60)
219
- log_message("НАЧАЛО ЗАГРУЗКИ ТАБЛИЧНЫХ ДАННЫХ")
220
- log_message("=" * 60)
221
-
222
- try:
223
- files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
224
- table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
225
-
226
- log_message(f"Найдено {len(table_files)} JSON файлов с таблицами")
227
-
228
- table_documents = []
229
- stats = {
230
- 'total_tables': 0,
231
- 'total_size': 0,
232
- 'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
233
- }
234
-
235
- for file_path in table_files:
236
- try:
237
- local_path = hf_hub_download(
238
- repo_id=repo_id,
239
- filename=file_path,
240
- local_dir='',
241
- repo_type="dataset",
242
- token=hf_token
243
- )
244
-
245
- log_message(f"\nОбработка файла: {file_path}")
246
-
247
- with open(local_path, 'r', encoding='utf-8') as f:
248
- table_data = json.load(f)
249
-
250
- if isinstance(table_data, dict):
251
- document_id = table_data.get('document', 'unknown')
252
-
253
- if 'sheets' in table_data:
254
- sorted_sheets = sorted(
255
- table_data['sheets'],
256
- key=lambda sheet: sheet.get('table_number', '') # or use 'table_number'
257
- )
258
-
259
- for sheet in sorted_sheets:
260
- sheet['document'] = document_id
261
- docs_list = table_to_document(sheet, document_id)
262
- table_documents.extend(docs_list)
263
-
264
- for doc in docs_list:
265
- stats['total_tables'] += 1
266
- size = doc.metadata.get('content_size', 0)
267
- stats['total_size'] += size
268
- stats['by_document'][document_id]['count'] += 1
269
- stats['by_document'][document_id]['size'] += size
270
- else:
271
- docs_list = table_to_document(table_data, document_id)
272
- table_documents.extend(docs_list)
273
-
274
- for doc in docs_list:
275
- stats['total_tables'] += 1
276
- size = doc.metadata.get('content_size', 0)
277
- stats['total_size'] += size
278
- stats['by_document'][document_id]['count'] += 1
279
- stats['by_document'][document_id]['size'] += size
280
-
281
-
282
- except Exception as e:
283
- log_message(f"❌ ОШИБКА файла {file_path}: {str(e)}")
284
- continue
285
-
286
- # Log summary statistics
287
- log_message("\n" + "=" * 60)
288
- log_message("СТАТИСТИКА ПО ТАБЛИЦАМ")
289
- log_message("=" * 60)
290
- log_message(f"Всего таблиц добавлено: {stats['total_tables']}")
291
- log_message(f"Общий размер: {stats['total_size']:,} символов")
292
- log_message(f"Средний размер таблицы: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} символов")
293
-
294
- log_message("\nПо документам:")
295
- for doc_id, doc_stats in sorted(stats['by_document'].items()):
296
- log_message(f" • {doc_id}: {doc_stats['count']} таблиц, "
297
- f"{doc_stats['size']:,} символов")
298
-
299
- log_message("=" * 60)
300
-
301
- return table_documents
302
-
303
- except Exception as e:
304
- log_message(f"❌ КРИТИЧЕСКАЯ ОШИБКА загрузки табличных данных: {str(e)}")
305
- return []
 
1
+ from llama_index.core.text_splitter import SentenceSplitter
 
 
2
  from llama_index.core import Document
3
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
4
  from my_logging import log_message
5
 
6
  def create_table_content(table_data):
 
10
  table_title = table_data.get('table_title', 'Неизвестно')
11
  section = table_data.get('section', 'Неизвестно')
12
 
13
+ # Header section
14
  content = f"Таблица: {table_num}\n"
15
  content += f"Название: {table_title}\n"
16
  content += f"Документ: {doc_id}\n"
 
20
  if headers:
21
  content += f"\nЗаголовки: {' | '.join(headers)}\n"
22
 
23
+ # Data section
24
  if 'data' in table_data and isinstance(table_data['data'], list):
25
  content += "\nДанные таблицы:\n"
26
  for row_idx, row in enumerate(table_data['data'], start=1):
 
30
 
31
  return content
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def chunk_table_document(doc, chunk_size=None, chunk_overlap=None):
35
+ """
36
+ Smart table chunking:
37
+ - Small tables: keep whole
38
+ - Large tables: split by row-blocks, preserve headers in each chunk
39
+ """
40
  if chunk_size is None:
41
  chunk_size = CHUNK_SIZE
42
  if chunk_overlap is None:
43
  chunk_overlap = CHUNK_OVERLAP
44
 
 
 
45
  table_num = doc.metadata.get('table_number', 'unknown')
 
46
  doc_id = doc.metadata.get('document_id', 'unknown')
 
47
 
48
+ # Parse table structure
49
  lines = doc.text.strip().split('\n')
50
 
 
51
  table_header_lines = []
52
  data_rows = []
53
  in_data = False
 
63
 
64
  table_header = '\n'.join(table_header_lines) + '\n'
65
 
66
+ # If no data rows or small table, use standard splitting
67
+ if not data_rows or len(doc.text) < chunk_size * 1.5:
68
+ log_message(f" 📊 Таблица {table_num}: малая, без разбиения")
69
+ return [doc]
70
+
71
+ # Row-block chunking for large tables
72
+ log_message(f" 📋 Таблица {table_num}: {len(data_rows)} строк → row-block chunking")
73
+
74
+ header_size = len(table_header)
75
+ available_size = chunk_size - header_size - 100 # Reserve space
76
+
77
+ text_chunks = []
78
+ current_chunk_rows = []
79
+ current_size = 0
80
+
81
+ for row in data_rows:
82
+ row_size = len(row) + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
+ # Check if adding this row exceeds limit
85
+ if current_size + row_size > available_size and current_chunk_rows:
86
+ # Create chunk with header + rows
87
  chunk_text = table_header + '\n'.join(current_chunk_rows)
88
  text_chunks.append(chunk_text)
89
+
90
+ # Overlap: keep last 2 rows for context continuity
91
+ overlap_count = min(2, len(current_chunk_rows))
92
+ current_chunk_rows = current_chunk_rows[-overlap_count:]
93
+ current_size = sum(len(r) + 1 for r in current_chunk_rows)
94
+
95
+ current_chunk_rows.append(row)
96
+ current_size += row_size
97
 
98
+ # Final chunk
99
+ if current_chunk_rows:
100
+ chunk_text = table_header + '\n'.join(current_chunk_rows)
101
+ text_chunks.append(chunk_text)
102
 
103
+ log_message(f" ✂️ Таблица {table_num} → {len(text_chunks)} чанков")
 
 
 
104
 
105
+ # Create Document objects
106
+ chunked_docs = []
107
  for i, chunk_text in enumerate(text_chunks):
108
  chunk_metadata = doc.metadata.copy()
109
  chunk_metadata.update({
110
  "chunk_id": i,
111
  "total_chunks": len(text_chunks),
112
  "chunk_size": len(chunk_text),
113
+ "is_chunked": True
 
 
 
114
  })
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  chunked_doc = Document(
117
+ text=chunk_text,
118
  metadata=chunk_metadata
119
  )
120
  chunked_docs.append(chunked_doc)
121
 
122
  return chunked_docs
123
 
124
+
125
  def table_to_document(table_data, document_id=None):
126
+ """Convert table data to Document, with smart chunking if needed"""
127
  if not isinstance(table_data, dict):
 
128
  return []
129
 
130
  doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
 
133
  section = table_data.get('section', 'Неизвестно')
134
 
135
  table_rows = table_data.get('data', [])
136
+ if not table_rows:
137
+ log_message(f"⚠️ Таблица {table_num} пропущена: нет данных")
138
  return []
139
 
140
  content = create_table_content(table_data)
141
  content_size = len(content)
 
142
 
143
  base_doc = Document(
144
  text=content,
 
149
  "document_id": doc_id,
150
  "section": section,
151
  "section_id": section,
152
+ "total_rows": len(table_rows),
153
  "content_size": content_size
154
  }
155
  )
156
 
157
+ # Apply smart chunking if too large
158
  if content_size > CHUNK_SIZE:
159
+ log_message(f"📊 CHUNKING: Таблица {table_num} | {content_size} > {CHUNK_SIZE}")
160
+ return chunk_table_document(base_doc)
 
 
 
 
 
161
  else:
162
+ log_message(f"✓ Таблица {table_num} добавлена целиком ({content_size} символов)")
163
+ return [base_doc]