Spaces:
Sleeping
Sleeping
Commit
·
4c7b0a2
1
Parent(s):
2eb8b63
max rows = 20, 150 + 150 bm25
Browse files
- documents_prep.py +2 -6
- index_retriever.py +4 -4
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -38,11 +38,7 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=
|
| 42 |
-
"""
|
| 43 |
-
Chunk tables by rows with fallback to character limit.
|
| 44 |
-
Keeps 3-4 rows together, but splits individual rows if they're too large.
|
| 45 |
-
"""
|
| 46 |
headers = table_data.get('headers', [])
|
| 47 |
rows = table_data.get('data', [])
|
| 48 |
table_num = str(table_data.get('table_number', 'unknown')).strip()
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1500
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
headers = table_data.get('headers', [])
|
| 43 |
rows = table_data.get('data', [])
|
| 44 |
table_num = str(table_data.get('table_number', 'unknown')).strip()
|
index_retriever.py
CHANGED
|
@@ -43,7 +43,7 @@ def base_number(doc_id: str) -> str:
|
|
| 43 |
m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
|
| 44 |
return m.group(1) if m else ""
|
| 45 |
|
| 46 |
-
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.
|
| 47 |
"""Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
|
| 48 |
if not doc_ids:
|
| 49 |
return nodes
|
|
@@ -112,17 +112,17 @@ def create_query_engine(vector_index):
|
|
| 112 |
|
| 113 |
vector_retriever = VectorIndexRetriever(
|
| 114 |
index=vector_index,
|
| 115 |
-
similarity_top_k=
|
| 116 |
)
|
| 117 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 118 |
docstore=vector_index.docstore,
|
| 119 |
-
similarity_top_k=
|
| 120 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 121 |
|
| 122 |
)
|
| 123 |
hybrid_retriever = QueryFusionRetriever(
|
| 124 |
[vector_retriever, bm25_retriever],
|
| 125 |
-
similarity_top_k=
|
| 126 |
num_queries=1
|
| 127 |
)
|
| 128 |
|
|
|
|
| 43 |
m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
|
| 44 |
return m.group(1) if m else ""
|
| 45 |
|
| 46 |
+
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.5):
|
| 47 |
"""Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
|
| 48 |
if not doc_ids:
|
| 49 |
return nodes
|
|
|
|
| 112 |
|
| 113 |
vector_retriever = VectorIndexRetriever(
|
| 114 |
index=vector_index,
|
| 115 |
+
similarity_top_k=150
|
| 116 |
)
|
| 117 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 118 |
docstore=vector_index.docstore,
|
| 119 |
+
similarity_top_k=150,
|
| 120 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 121 |
|
| 122 |
)
|
| 123 |
hybrid_retriever = QueryFusionRetriever(
|
| 124 |
[vector_retriever, bm25_retriever],
|
| 125 |
+
similarity_top_k=80,
|
| 126 |
num_queries=1
|
| 127 |
)
|
| 128 |
|