Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

MrSimple07 committed on Oct 5, 2025

Commit

09fe356

1 Parent(s): 0b6ee4f

simplest version

Files changed (2) hide show

documents_prep.py CHANGED Viewed

@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 # Configuration
-CHUNK_SIZE = 1024
 CHUNK_OVERLAP = 256
 def chunk_text_documents(documents):
@@ -39,7 +39,6 @@ def chunk_text_documents(documents):
 def normalize_doc_id(doc_id):
-    """Normalize document ID for consistent matching"""
     if not doc_id or doc_id == 'unknown':
         return doc_id
@@ -53,7 +52,7 @@ def normalize_doc_id(doc_id):
     return doc_id
-def chunk_table_by_rows(table_data, doc_id, max_rows=30):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
@@ -103,7 +102,7 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
         return [Document(text=content, metadata=metadata)]
     chunks = []
-    overlap = 3
     for i in range(0, len(rows), max_rows - overlap):
         chunk_rows = rows[i:min(i+max_rows, len(rows))]

 from my_logging import log_message
 # Configuration
+CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 256
 def chunk_text_documents(documents):
 def normalize_doc_id(doc_id):
     if not doc_id or doc_id == 'unknown':
         return doc_id
     return doc_id
+def chunk_table_by_rows(table_data, doc_id, max_rows=5):
     headers = table_data.get('headers', [])
     rows = table_data.get('data', [])
     table_num = table_data.get('table_number', 'unknown')
         return [Document(text=content, metadata=metadata)]
     chunks = []
+    overlap = 2
     for i in range(0, len(rows), max_rows - overlap):
         chunk_rows = rows[i:min(i+max_rows, len(rows))]

utils.py CHANGED Viewed

@@ -52,8 +52,6 @@ def preprocess_query(question):
         # Add normalized versions
         enhanced_query += f" {doc_type} Р {doc_num}"
-        enhanced_query += f" {doc_type}Р {doc_num}"
-        enhanced_query += f" {doc_type} {doc_num}"
     return enhanced_query

         # Add normalized versions
         enhanced_query += f" {doc_type} Р {doc_num}"
     return enhanced_query