Spaces:
Sleeping
Sleeping
Commit
·
2eb8b63
1
Parent(s):
2b217eb
chunk size = 2048 + rows=15
Browse files- documents_prep.py +46 -62
- index_retriever.py +19 -15
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -501,69 +501,53 @@ def extract_sections_from_json(json_path):
|
|
| 501 |
return documents
|
| 502 |
|
| 503 |
|
| 504 |
-
def
|
| 505 |
-
|
|
|
|
| 506 |
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
)
|
|
|
|
| 527 |
|
| 528 |
-
|
| 529 |
-
log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
|
| 530 |
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
"file_path": str(row.get('Файл изображения', 'unknown')),
|
| 551 |
-
"section": str(section_value),
|
| 552 |
-
"section_id": str(section_value)
|
| 553 |
-
}
|
| 554 |
-
)
|
| 555 |
-
image_documents.append(doc)
|
| 556 |
-
|
| 557 |
-
except Exception as e:
|
| 558 |
-
log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
|
| 559 |
-
continue
|
| 560 |
-
|
| 561 |
-
log_message(f"Создано {len(image_documents)} документов из изображений")
|
| 562 |
-
return image_documents
|
| 563 |
-
|
| 564 |
-
except Exception as e:
|
| 565 |
-
log_message(f"Ошибка загрузки данных изображений: {str(e)}")
|
| 566 |
-
return []
|
| 567 |
|
| 568 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 569 |
"""Main loader - combines all document types"""
|
|
@@ -579,7 +563,7 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
|
| 579 |
table_chunks = load_table_documents(repo_id, hf_token, table_dir)
|
| 580 |
|
| 581 |
# Load images (no chunking needed)
|
| 582 |
-
image_docs =
|
| 583 |
|
| 584 |
all_docs = text_chunks + table_chunks + image_docs
|
| 585 |
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1024
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 501 |
return documents
|
| 502 |
|
| 503 |
|
| 504 |
+
def load_image_documents(repo_id, hf_token, image_dir):
    """Load image-description CSVs from a HF dataset repo and wrap each row as a Document.

    Scans the dataset repository for ``.csv`` files under ``image_dir``, downloads
    each one, and turns every row into a ``Document`` whose text concatenates the
    row's descriptive fields. No chunking is applied; each image row is one document.

    Args:
        repo_id: Hugging Face dataset repository id.
        hf_token: access token for the repository.
        image_dir: path prefix inside the repo that holds the image CSVs.

    Returns:
        list[Document]: one document per CSV row (possibly empty).
    """
    log_message("Loading images...")

    repo_files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    csv_paths = [p for p in repo_files if p.startswith(image_dir) and p.endswith('.csv')]

    documents = []
    for file_path in csv_paths:
        # Best-effort per file: a broken CSV is logged and skipped, not fatal.
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token,
            )
            frame = pd.read_csv(local_path)

            for _, row in frame.iterrows():
                # Column names are Russian headers from the source CSVs;
                # reproduce them exactly (including 'Описание изображение').
                content = (
                    f"Документ: {row.get('Обозначение документа', 'unknown')}\n"
                    f"Рисунок: {row.get('№ Изображения', 'unknown')}\n"
                    f"Название: {row.get('Название изображения', '')}\n"
                    f"Описание: {row.get('Описание изображение', '')}\n"
                    f"Раздел: {row.get('Раздел документа', '')}\n"
                )
                chunk_size = len(content)

                metadata = {
                    'type': 'image',
                    'document_id': str(row.get('Обозначение документа', 'unknown')),
                    'image_number': str(row.get('№ Изображения', 'unknown')),
                    'section': str(row.get('Раздел документа', '')),
                    'chunk_size': chunk_size,
                }
                documents.append(Document(text=content, metadata=metadata))
        except Exception as e:
            log_message(f"Error loading {file_path}: {e}")

    if documents:
        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")

    return documents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
|
| 553 |
"""Main loader - combines all document types"""
|
|
|
|
| 563 |
table_chunks = load_table_documents(repo_id, hf_token, table_dir)
|
| 564 |
|
| 565 |
# Load images (no chunking needed)
|
| 566 |
+
image_docs = load_image_documents(repo_id, hf_token, image_dir)
|
| 567 |
|
| 568 |
all_docs = text_chunks + table_chunks + image_docs
|
| 569 |
|
index_retriever.py
CHANGED
|
@@ -31,21 +31,20 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
|
|
| 31 |
|
| 32 |
|
| 33 |
def normalize_doc_id(doc_id: str) -> str:
|
| 34 |
-
"""Normalize document ID
|
| 35 |
doc_id = doc_id.upper().strip()
|
| 36 |
-
doc_id = re.sub(r'\
|
| 37 |
doc_id = doc_id.replace("ГОСТР", "ГОСТ")
|
| 38 |
doc_id = doc_id.replace("GOSTR", "ГОСТ")
|
| 39 |
return doc_id
|
| 40 |
|
| 41 |
def base_number(doc_id: str) -> str:
|
| 42 |
-
"""Extract
|
| 43 |
-
|
| 44 |
-
m = re.search(r'(\d+(?:\.\d+)*)', doc_id)
|
| 45 |
return m.group(1) if m else ""
|
| 46 |
|
| 47 |
-
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.
|
| 48 |
-
"""Filter nodes by document ID with
|
| 49 |
if not doc_ids:
|
| 50 |
return nodes
|
| 51 |
|
|
@@ -58,17 +57,22 @@ def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
|
|
| 58 |
node_base = base_number(node_doc_id)
|
| 59 |
|
| 60 |
for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
|
| 61 |
-
#
|
| 62 |
if q_base and node_base and q_base == node_base:
|
| 63 |
filtered.append(node)
|
| 64 |
break
|
| 65 |
-
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
filtered.append(node)
|
| 69 |
break
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
def extract_doc_id_from_query(query):
|
|
@@ -108,17 +112,17 @@ def create_query_engine(vector_index):
|
|
| 108 |
|
| 109 |
vector_retriever = VectorIndexRetriever(
|
| 110 |
index=vector_index,
|
| 111 |
-
similarity_top_k=
|
| 112 |
)
|
| 113 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 114 |
docstore=vector_index.docstore,
|
| 115 |
-
similarity_top_k=
|
| 116 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 117 |
|
| 118 |
)
|
| 119 |
hybrid_retriever = QueryFusionRetriever(
|
| 120 |
[vector_retriever, bm25_retriever],
|
| 121 |
-
similarity_top_k=
|
| 122 |
num_queries=1
|
| 123 |
)
|
| 124 |
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
def normalize_doc_id(doc_id: str) -> str:
    """Normalize a document identifier for consistent comparison.

    Uppercases, strips surrounding whitespace, removes every character that is
    not a word character or a dot (spaces, dashes, punctuation), and collapses
    the "ГОСТ Р" / "GOST R" prefixes to plain "ГОСТ".
    """
    cleaned = doc_id.upper().strip()
    # Drop separators: keep only word characters and dots.
    cleaned = re.sub(r'[^\w\d\.]+', '', cleaned)
    # Unify both Cyrillic and Latin "GOST R" spellings to the canonical prefix.
    for prefix in ("ГОСТР", "GOSTR"):
        cleaned = cleaned.replace(prefix, "ГОСТ")
    return cleaned
|
| 40 |
|
| 41 |
def base_number(doc_id: str) -> str:
    """Extract base numeric pattern (e.g., '59023.4' from 'ГОСТ Р 59023.4-2020').

    Requires at least one dotted component (``\\d+.\\d+``); a bare integer with
    no dot yields the empty string.
    """
    match = re.search(r'(\d+(?:\.\d+)+)', doc_id)
    if match is None:
        return ""
    return match.group(1)
|
| 45 |
|
| 46 |
+
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
|
| 47 |
+
"""Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
|
| 48 |
if not doc_ids:
|
| 49 |
return nodes
|
| 50 |
|
|
|
|
| 57 |
node_base = base_number(node_doc_id)
|
| 58 |
|
| 59 |
for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
|
| 60 |
+
# Strong match: same base number (e.g., 59023.4)
|
| 61 |
if q_base and node_base and q_base == node_base:
|
| 62 |
filtered.append(node)
|
| 63 |
break
|
| 64 |
+
|
| 65 |
+
# Medium match: similarity ratio > threshold
|
| 66 |
+
if SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
|
| 67 |
filtered.append(node)
|
| 68 |
break
|
| 69 |
|
| 70 |
+
# Weak fallback: contains or partial substring
|
| 71 |
+
if q_base in node_doc_id or q_doc in node_doc_id:
|
| 72 |
+
filtered.append(node)
|
| 73 |
+
break
|
| 74 |
+
|
| 75 |
+
return filtered if filtered else nodes # Fallback: keep all if none matched
|
| 76 |
|
| 77 |
|
| 78 |
def extract_doc_id_from_query(query):
|
|
|
|
| 112 |
|
| 113 |
vector_retriever = VectorIndexRetriever(
|
| 114 |
index=vector_index,
|
| 115 |
+
similarity_top_k=100
|
| 116 |
)
|
| 117 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 118 |
docstore=vector_index.docstore,
|
| 119 |
+
similarity_top_k=100,
|
| 120 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 121 |
|
| 122 |
)
|
| 123 |
hybrid_retriever = QueryFusionRetriever(
|
| 124 |
[vector_retriever, bm25_retriever],
|
| 125 |
+
similarity_top_k=60,
|
| 126 |
num_queries=1
|
| 127 |
)
|
| 128 |
|