Spaces:
Sleeping
Sleeping
Commit
·
4c7b0a2
1
Parent(s):
2eb8b63
max rows = 20, 150 + 150 bm25
Browse files
- documents_prep.py +2 -6
- index_retriever.py +4 -4
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -38,11 +38,7 @@ def chunk_text_documents(documents):
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
-
def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=
|
| 42 |
-
"""
|
| 43 |
-
Chunk tables by rows with fallback to character limit.
|
| 44 |
-
Keeps 3-4 rows together, but splits individual rows if they're too large.
|
| 45 |
-
"""
|
| 46 |
headers = table_data.get('headers', [])
|
| 47 |
rows = table_data.get('data', [])
|
| 48 |
table_num = str(table_data.get('table_number', 'unknown')).strip()
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1500
|
| 11 |
CHUNK_OVERLAP = 128
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 38 |
return chunked
|
| 39 |
|
| 40 |
|
| 41 |
+
def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
headers = table_data.get('headers', [])
|
| 43 |
rows = table_data.get('data', [])
|
| 44 |
table_num = str(table_data.get('table_number', 'unknown')).strip()
|
index_retriever.py
CHANGED
|
@@ -43,7 +43,7 @@ def base_number(doc_id: str) -> str:
|
|
| 43 |
m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
|
| 44 |
return m.group(1) if m else ""
|
| 45 |
|
| 46 |
-
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.
|
| 47 |
"""Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
|
| 48 |
if not doc_ids:
|
| 49 |
return nodes
|
|
@@ -112,17 +112,17 @@ def create_query_engine(vector_index):
|
|
| 112 |
|
| 113 |
vector_retriever = VectorIndexRetriever(
|
| 114 |
index=vector_index,
|
| 115 |
-
similarity_top_k=
|
| 116 |
)
|
| 117 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 118 |
docstore=vector_index.docstore,
|
| 119 |
-
similarity_top_k=
|
| 120 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 121 |
|
| 122 |
)
|
| 123 |
hybrid_retriever = QueryFusionRetriever(
|
| 124 |
[vector_retriever, bm25_retriever],
|
| 125 |
-
similarity_top_k=
|
| 126 |
num_queries=1
|
| 127 |
)
|
| 128 |
|
|
|
|
| 43 |
m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
|
| 44 |
return m.group(1) if m else ""
|
| 45 |
|
| 46 |
+
def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.5):
|
| 47 |
"""Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
|
| 48 |
if not doc_ids:
|
| 49 |
return nodes
|
|
|
|
| 112 |
|
| 113 |
vector_retriever = VectorIndexRetriever(
|
| 114 |
index=vector_index,
|
| 115 |
+
similarity_top_k=150
|
| 116 |
)
|
| 117 |
bm25_retriever = BM25Retriever.from_defaults(
|
| 118 |
docstore=vector_index.docstore,
|
| 119 |
+
similarity_top_k=150,
|
| 120 |
tokenizer=russian_tokenizer # Add custom tokenizer
|
| 121 |
|
| 122 |
)
|
| 123 |
hybrid_retriever = QueryFusionRetriever(
|
| 124 |
[vector_retriever, bm25_retriever],
|
| 125 |
+
similarity_top_k=80,
|
| 126 |
num_queries=1
|
| 127 |
)
|
| 128 |
|