MrSimple07 commited on
Commit
4c7b0a2
·
1 Parent(s): 2eb8b63

Tuning: raise table rows_per_chunk 10 → 20 and CHUNK_SIZE 1024 → 1500; increase retriever similarity_top_k to 150 (vector) + 150 (BM25), fusion top_k 60 → 80; relax doc-id filter threshold 0.75 → 0.5

Browse files
Files changed (2) hide show
  1. documents_prep.py +2 -6
  2. index_retriever.py +4 -4
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 1024
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
@@ -38,11 +38,7 @@ def chunk_text_documents(documents):
38
  return chunked
39
 
40
 
41
- def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=10, max_chars=2000):
42
- """
43
- Chunk tables by rows with fallback to character limit.
44
- Keeps 3-4 rows together, but splits individual rows if they're too large.
45
- """
46
  headers = table_data.get('headers', [])
47
  rows = table_data.get('data', [])
48
  table_num = str(table_data.get('table_number', 'unknown')).strip()
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1500
11
  CHUNK_OVERLAP = 128
12
 
13
  def chunk_text_documents(documents):
 
38
  return chunked
39
 
40
 
41
+ def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
 
 
 
 
42
  headers = table_data.get('headers', [])
43
  rows = table_data.get('data', [])
44
  table_num = str(table_data.get('table_number', 'unknown')).strip()
index_retriever.py CHANGED
@@ -43,7 +43,7 @@ def base_number(doc_id: str) -> str:
43
  m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
44
  return m.group(1) if m else ""
45
 
46
- def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
47
  """Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
48
  if not doc_ids:
49
  return nodes
@@ -112,17 +112,17 @@ def create_query_engine(vector_index):
112
 
113
  vector_retriever = VectorIndexRetriever(
114
  index=vector_index,
115
- similarity_top_k=100
116
  )
117
  bm25_retriever = BM25Retriever.from_defaults(
118
  docstore=vector_index.docstore,
119
- similarity_top_k=100,
120
  tokenizer=russian_tokenizer # Add custom tokenizer
121
 
122
  )
123
  hybrid_retriever = QueryFusionRetriever(
124
  [vector_retriever, bm25_retriever],
125
- similarity_top_k=60,
126
  num_queries=1
127
  )
128
 
 
43
  m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
44
  return m.group(1) if m else ""
45
 
46
+ def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.5):
47
  """Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
48
  if not doc_ids:
49
  return nodes
 
112
 
113
  vector_retriever = VectorIndexRetriever(
114
  index=vector_index,
115
+ similarity_top_k=150
116
  )
117
  bm25_retriever = BM25Retriever.from_defaults(
118
  docstore=vector_index.docstore,
119
+ similarity_top_k=150,
120
  tokenizer=russian_tokenizer # Add custom tokenizer
121
 
122
  )
123
  hybrid_retriever = QueryFusionRetriever(
124
  [vector_retriever, bm25_retriever],
125
+ similarity_top_k=80,
126
  num_queries=1
127
  )
128