Commit 7504d82 · Parent: ec64429

max rows = 20, 100 + 100 bm25

Files changed:
- documents_prep.py  +62 -46
- index_retriever.py  +3 -3
documents_prep.py  CHANGED
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
     return chunked


-def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=…
+def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
    """
    Chunk tables by rows with fallback to character limit.
    Keeps 3-4 rows together, but splits individual rows if they're too large.
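The function body is outside the diff context, so only the signature change is visible. A minimal sketch of the behavior the docstring describes, assuming table_data is a list of pre-rendered row strings (the sketch's name, chunk-ID scheme, and row handling are illustrative, not the commit's implementation):

```python
# Hedged sketch of row-based table chunking with a character-limit fallback.
# Assumes table_data is a list of row strings; the real function may differ.
def chunk_table_by_rows_sketch(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
    chunks, current, current_len = [], [], 0
    for row in table_data:
        if len(row) > max_chars:
            # Oversized single row: flush the open chunk, then split the row itself.
            if current:
                chunks.append("\n".join(current))
                current, current_len = [], 0
            chunks.extend(row[i:i + max_chars] for i in range(0, len(row), max_chars))
            continue
        # Start a new chunk once the row budget or the character budget is hit.
        if len(current) >= rows_per_chunk or current_len + len(row) > max_chars:
            chunks.append("\n".join(current))
            current, current_len = [], 0
        current.append(row)
        current_len += len(row) + 1  # account for the joining newline
    if current:
        chunks.append("\n".join(current))
    return [(f"{doc_id}-{n}", text) for n, text in enumerate(chunks)]
```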
@@ -501,53 +501,69 @@ def extract_sections_from_json(json_path):
     return documents


-def …
-    "…
-    log_message("Loading images...")
-    […]
-                    content += f"Раздел: {row.get('Раздел документа', '')}\n"
-    […]
-                        }
-    […]
+def load_image_data(repo_id, hf_token, image_data_dir):
+    log_message("Starting image-data load")
+
+    image_files = []
+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        for file in files:
+            if file.startswith(image_data_dir) and file.endswith('.csv'):
+                image_files.append(file)
+
+        log_message(f"Found {len(image_files)} image CSV files")
+
+        image_documents = []
+        for file_path in image_files:
+            try:
+                log_message(f"Processing image file: {file_path}")
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir='',
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                df = pd.read_csv(local_path)
+                log_message(f"Loaded {len(df)} image records from {file_path}")
+
+                # Process rows using the correct (Russian) column names;
+                # they must match the CSV headers exactly.
+                for _, row in df.iterrows():
+                    section_value = row.get('Раздел документа', 'Неизвестно')
+
+                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
+                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
+                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"  # the typo is in the source column header
+                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
+                    content += f"Раздел: {section_value}\n"
+                    content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
+
+                    doc = Document(
+                        text=content,
+                        metadata={
+                            "type": "image",
+                            "image_number": str(row.get('№ Изображения', 'unknown')),
+                            "image_title": str(row.get('Название изображения', 'unknown')),
+                            "image_description": str(row.get('Описание изображение', 'unknown')),
+                            "document_id": str(row.get('Обозначение документа', 'unknown')),
+                            "file_path": str(row.get('Файл изображения', 'unknown')),
+                            "section": str(section_value),
+                            "section_id": str(section_value)
+                        }
+                    )
+                    image_documents.append(doc)
+
+            except Exception as e:
+                log_message(f"Error processing file {file_path}: {str(e)}")
+                continue
+
+        log_message(f"Created {len(image_documents)} image documents")
+        return image_documents
+
+    except Exception as e:
+        log_message(f"Error loading image data: {str(e)}")
+        return []
 
 
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     """Main loader - combines all document types"""
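The loader assumes one CSV schema across the image files. A hypothetical one-row file matching the headers the code reads (the header spellings, including the 'Описание изображение' typo, come from the diff; the row values are invented):

```python
import io

import pandas as pd

# Hypothetical sample; only the column headers are taken from the diff.
sample = io.StringIO(
    "№ Изображения,Название изображения,Описание изображение,"
    "Обозначение документа,Раздел документа,Файл изображения\n"
    "1,Diagram,General view,DOC-001,Section 1,img_001.png\n"
)
df = pd.read_csv(sample)
print(df.iloc[0]["Название изображения"])  # -> Diagram
```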
@@ -563,7 +579,7 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
 
     # Load images (no chunking needed)
-    image_docs = …
+    image_docs = load_image_data(repo_id, hf_token, image_dir)
 
     all_docs = text_chunks + table_chunks + image_docs
 
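A hypothetical call to the combined loader; the repo id, token source, and directory names are placeholders, not values from this Space:

```python
import os

# All arguments below are illustrative placeholders.
docs = load_all_documents(
    repo_id="user/dataset",
    hf_token=os.environ.get("HF_TOKEN"),
    json_dir="json",
    table_dir="tables",
    image_dir="images",
)
print(f"Loaded {len(docs)} documents")
```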
index_retriever.py  CHANGED
@@ -108,17 +108,17 @@ def create_query_engine(vector_index):
 
     vector_retriever = VectorIndexRetriever(
         index=vector_index,
-        similarity_top_k=…
+        similarity_top_k=200
     )
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
-        similarity_top_k=…
+        similarity_top_k=200,
         tokenizer=russian_tokenizer  # Add custom tokenizer
 
     )
     hybrid_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
-        similarity_top_k=…
+        similarity_top_k=100,
         num_queries=1
     )
 
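russian_tokenizer is defined elsewhere in index_retriever.py and is not shown in this diff. A minimal stand-in, assuming it only lowercases and extracts Cyrillic/Latin word tokens (the real one may stem or filter stopwords):

```python
import re

# Hypothetical stand-in for russian_tokenizer; not the Space's implementation.
def russian_tokenizer(text: str) -> list[str]:
    # Lowercase, then keep runs of Cyrillic/Latin letters and digits.
    return re.findall(r"[а-яёa-z0-9]+", text.lower())
```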
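With these settings each retriever hands up to 200 candidates to the fusion step, which keeps the top 100. QueryFusionRetriever supports several fusion modes; reciprocal-rank fusion, one common choice, can be sketched as a standalone function (illustrative of the technique, not the library's internals):

```python
# Reciprocal-rank fusion over ranked ID lists: documents ranked highly by
# several retrievers accumulate the largest scores.
def reciprocal_rank_fusion(ranked_lists, k=60, top_n=100):
    scores = {}
    for ranking in ranked_lists:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

# "b" appears in both lists, so it outranks either list's unique top pick.
print(reciprocal_rank_fusion([["a", "b"], ["b", "c"]], top_n=3))  # ['b', 'a', 'c']
```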