Spaces:
Sleeping
Sleeping
Commit
·
9f55dc6
1
Parent(s):
b867de8
top k = 80 + max chunk size is 3000
Browse files
- config.py +1 -1
- documents_prep.py +41 -0
- index_retriever.py +3 -3
config.py
CHANGED
|
@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
|
|
| 52 |
CHUNK_SIZE = 1500
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
-
MAX_CHARS_TABLE =
|
| 56 |
MAX_ROWS_TABLE = 10
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
|
|
|
| 52 |
CHUNK_SIZE = 1500
|
| 53 |
CHUNK_OVERLAP = 128
|
| 54 |
|
| 55 |
+
MAX_CHARS_TABLE = 3000
|
| 56 |
MAX_ROWS_TABLE = 10
|
| 57 |
|
| 58 |
CUSTOM_PROMPT = """
|
documents_prep.py
CHANGED
|
@@ -196,8 +196,43 @@ def format_table_rows(rows):
|
|
| 196 |
|
| 197 |
|
| 198 |
def format_table_footer(table_identifier, doc_id):
|
|
|
|
| 199 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 202 |
import zipfile
|
| 203 |
import tempfile
|
|
@@ -327,6 +362,7 @@ def load_json_documents(repo_id, hf_token, json_dir):
|
|
| 327 |
return documents
|
| 328 |
|
| 329 |
def extract_sections_from_json(json_path):
|
|
|
|
| 330 |
documents = []
|
| 331 |
|
| 332 |
try:
|
|
@@ -378,6 +414,7 @@ def extract_sections_from_json(json_path):
|
|
| 378 |
|
| 379 |
|
| 380 |
def load_table_documents(repo_id, hf_token, table_dir):
|
|
|
|
| 381 |
log_message("Loading tables...")
|
| 382 |
|
| 383 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
@@ -395,11 +432,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
|
|
| 395 |
|
| 396 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 397 |
data = json.load(f)
|
|
|
|
|
|
|
| 398 |
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 399 |
|
| 400 |
for sheet in data.get('sheets', []):
|
|
|
|
| 401 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 402 |
|
|
|
|
| 403 |
chunks = chunk_table_by_content(sheet, sheet_doc_id)
|
| 404 |
all_chunks.extend(chunks)
|
| 405 |
|
|
|
|
| 196 |
|
| 197 |
|
| 198 |
def format_table_footer(table_identifier, doc_id):
|
| 199 |
+
"""Format table footer"""
|
| 200 |
return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
|
| 201 |
|
| 202 |
+
def load_table_documents(repo_id, hf_token, table_dir):
|
| 203 |
+
log_message("Loading tables...")
|
| 204 |
+
|
| 205 |
+
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
| 206 |
+
table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
|
| 207 |
+
|
| 208 |
+
all_chunks = []
|
| 209 |
+
for file_path in table_files:
|
| 210 |
+
try:
|
| 211 |
+
local_path = hf_hub_download(
|
| 212 |
+
repo_id=repo_id,
|
| 213 |
+
filename=file_path,
|
| 214 |
+
repo_type="dataset",
|
| 215 |
+
token=hf_token
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
with open(local_path, 'r', encoding='utf-8') as f:
|
| 219 |
+
data = json.load(f)
|
| 220 |
+
|
| 221 |
+
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 222 |
+
|
| 223 |
+
for sheet in data.get('sheets', []):
|
| 224 |
+
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 225 |
+
|
| 226 |
+
chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
|
| 227 |
+
all_chunks.extend(chunks)
|
| 228 |
+
|
| 229 |
+
except Exception as e:
|
| 230 |
+
log_message(f"Error loading {file_path}: {e}")
|
| 231 |
+
|
| 232 |
+
log_message(f"✓ Loaded {len(all_chunks)} table chunks")
|
| 233 |
+
return all_chunks
|
| 234 |
+
|
| 235 |
+
|
| 236 |
def load_json_documents(repo_id, hf_token, json_dir):
|
| 237 |
import zipfile
|
| 238 |
import tempfile
|
|
|
|
| 362 |
return documents
|
| 363 |
|
| 364 |
def extract_sections_from_json(json_path):
|
| 365 |
+
"""Extract sections from a single JSON file"""
|
| 366 |
documents = []
|
| 367 |
|
| 368 |
try:
|
|
|
|
| 414 |
|
| 415 |
|
| 416 |
def load_table_documents(repo_id, hf_token, table_dir):
|
| 417 |
+
"""Load and chunk tables"""
|
| 418 |
log_message("Loading tables...")
|
| 419 |
|
| 420 |
files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
|
|
|
|
| 432 |
|
| 433 |
with open(local_path, 'r', encoding='utf-8') as f:
|
| 434 |
data = json.load(f)
|
| 435 |
+
|
| 436 |
+
# Extract file-level document_id
|
| 437 |
file_doc_id = data.get('document_id', data.get('document', 'unknown'))
|
| 438 |
|
| 439 |
for sheet in data.get('sheets', []):
|
| 440 |
+
# Use sheet-level document_id if available, otherwise use file-level
|
| 441 |
sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
|
| 442 |
|
| 443 |
+
# CRITICAL: Pass document_id to chunk function
|
| 444 |
chunks = chunk_table_by_content(sheet, sheet_doc_id)
|
| 445 |
all_chunks.extend(chunks)
|
| 446 |
|
index_retriever.py
CHANGED
|
@@ -46,18 +46,18 @@ def create_query_engine(vector_index):
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
-
similarity_top_k=
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
-
similarity_top_k=
|
| 55 |
similarity_cutoff=0.55
|
| 56 |
)
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
-
similarity_top_k=
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|
|
|
|
| 46 |
|
| 47 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 48 |
docstore=vector_index.docstore,
|
| 49 |
+
similarity_top_k=80
|
| 50 |
)
|
| 51 |
|
| 52 |
vector_retriever = VectorIndexRetriever(
|
| 53 |
index=vector_index,
|
| 54 |
+
similarity_top_k=80,
|
| 55 |
similarity_cutoff=0.55
|
| 56 |
)
|
| 57 |
|
| 58 |
hybrid_retriever = QueryFusionRetriever(
|
| 59 |
[vector_retriever, bm25_retriever],
|
| 60 |
+
similarity_top_k=80,
|
| 61 |
num_queries=1
|
| 62 |
)
|
| 63 |
|