Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 8, 2025

Commit

9f55dc6

1 Parent(s): b867de8

top k = 80 + max chunk size is 3000

Browse files

Files changed (3) hide show

config.py +1 -1
documents_prep.py +41 -0
index_retriever.py +3 -3

config.py CHANGED Viewed

@@ -52,7 +52,7 @@ DEFAULT_MODEL = "Gemini 2.5 Flash"
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
-MAX_CHARS_TABLE = 2500
 MAX_ROWS_TABLE = 10
 CUSTOM_PROMPT = """

 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
+MAX_CHARS_TABLE = 3000
 MAX_ROWS_TABLE = 10
 CUSTOM_PROMPT = """

documents_prep.py CHANGED Viewed

@@ -196,8 +196,43 @@ def format_table_rows(rows):
 def format_table_footer(table_identifier, doc_id):
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
@@ -327,6 +362,7 @@ def load_json_documents(repo_id, hf_token, json_dir):
     return documents
 def extract_sections_from_json(json_path):
     documents = []
     try:
@@ -378,6 +414,7 @@ def extract_sections_from_json(json_path):
 def load_table_documents(repo_id, hf_token, table_dir):
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
@@ -395,11 +432,15 @@ def load_table_documents(repo_id, hf_token, table_dir):
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
             file_doc_id = data.get('document_id', data.get('document', 'unknown'))
             for sheet in data.get('sheets', []):
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
                 chunks = chunk_table_by_content(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)

 def format_table_footer(table_identifier, doc_id):
+    """Format table footer"""
     return f"\n{'='*70}\nКОНЕЦ ТАБЛИЦЫ {table_identifier} ИЗ {doc_id}\n"
+def load_table_documents(repo_id, hf_token, table_dir):
+    log_message("Loading tables...")
+    files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+    table_files = [f for f in files if f.startswith(table_dir) and f.endswith('.json')]
+    all_chunks = []
+    for file_path in table_files:
+        try:
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=file_path,
+                repo_type="dataset",
+                token=hf_token
+            )
+            with open(local_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            file_doc_id = data.get('document_id', data.get('document', 'unknown'))
+            for sheet in data.get('sheets', []):
+                sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                chunks = chunk_table_by_content(sheet, sheet_doc_id, max_chars=1000)
+                all_chunks.extend(chunks)
+        except Exception as e:
+            log_message(f"Error loading {file_path}: {e}")
+    log_message(f"✓ Loaded {len(all_chunks)} table chunks")
+    return all_chunks
 def load_json_documents(repo_id, hf_token, json_dir):
     import zipfile
     import tempfile
     return documents
 def extract_sections_from_json(json_path):
+    """Extract sections from a single JSON file"""
     documents = []
     try:
 def load_table_documents(repo_id, hf_token, table_dir):
+    """Load and chunk tables"""
     log_message("Loading tables...")
     files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
             with open(local_path, 'r', encoding='utf-8') as f:
                 data = json.load(f)
+            # Extract file-level document_id
             file_doc_id = data.get('document_id', data.get('document', 'unknown'))
             for sheet in data.get('sheets', []):
+                # Use sheet-level document_id if available, otherwise use file-level
                 sheet_doc_id = sheet.get('document_id', sheet.get('document', file_doc_id))
+                # CRITICAL: Pass document_id to chunk function
                 chunks = chunk_table_by_content(sheet, sheet_doc_id)
                 all_chunks.extend(chunks)

index_retriever.py CHANGED Viewed

@@ -46,18 +46,18 @@ def create_query_engine(vector_index):
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=70
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
-            similarity_top_k=70,
             similarity_cutoff=0.55
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
-            similarity_top_k=70,
             num_queries=1
         )

         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
+            similarity_top_k=80
         )
         vector_retriever = VectorIndexRetriever(
             index=vector_index,
+            similarity_top_k=80,
             similarity_cutoff=0.55
         )
         hybrid_retriever = QueryFusionRetriever(
             [vector_retriever, bm25_retriever],
+            similarity_top_k=80,
             num_queries=1
         )