Spaces:
Sleeping
Sleeping
Commit
·
09fe356
1
Parent(s):
0b6ee4f
simplest version
Browse files
- documents_prep.py +3 -4
- utils.py +0 -2
documents_prep.py
CHANGED
|
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
-
CHUNK_SIZE =
|
| 11 |
CHUNK_OVERLAP = 256
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
@@ -39,7 +39,6 @@ def chunk_text_documents(documents):
|
|
| 39 |
|
| 40 |
|
| 41 |
def normalize_doc_id(doc_id):
|
| 42 |
-
"""Normalize document ID for consistent matching"""
|
| 43 |
if not doc_id or doc_id == 'unknown':
|
| 44 |
return doc_id
|
| 45 |
|
|
@@ -53,7 +52,7 @@ def normalize_doc_id(doc_id):
|
|
| 53 |
return doc_id
|
| 54 |
|
| 55 |
|
| 56 |
-
def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
| 57 |
headers = table_data.get('headers', [])
|
| 58 |
rows = table_data.get('data', [])
|
| 59 |
table_num = table_data.get('table_number', 'unknown')
|
|
@@ -103,7 +102,7 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
|
|
| 103 |
return [Document(text=content, metadata=metadata)]
|
| 104 |
|
| 105 |
chunks = []
|
| 106 |
-
overlap =
|
| 107 |
|
| 108 |
for i in range(0, len(rows), max_rows - overlap):
|
| 109 |
chunk_rows = rows[i:min(i+max_rows, len(rows))]
|
|
|
|
| 7 |
from my_logging import log_message
|
| 8 |
|
| 9 |
# Configuration
|
| 10 |
+
CHUNK_SIZE = 1500
|
| 11 |
CHUNK_OVERLAP = 256
|
| 12 |
|
| 13 |
def chunk_text_documents(documents):
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
def normalize_doc_id(doc_id):
|
|
|
|
| 42 |
if not doc_id or doc_id == 'unknown':
|
| 43 |
return doc_id
|
| 44 |
|
|
|
|
| 52 |
return doc_id
|
| 53 |
|
| 54 |
|
| 55 |
+
def chunk_table_by_rows(table_data, doc_id, max_rows=5):
|
| 56 |
headers = table_data.get('headers', [])
|
| 57 |
rows = table_data.get('data', [])
|
| 58 |
table_num = table_data.get('table_number', 'unknown')
|
|
|
|
| 102 |
return [Document(text=content, metadata=metadata)]
|
| 103 |
|
| 104 |
chunks = []
|
| 105 |
+
overlap = 2
|
| 106 |
|
| 107 |
for i in range(0, len(rows), max_rows - overlap):
|
| 108 |
chunk_rows = rows[i:min(i+max_rows, len(rows))]
|
utils.py
CHANGED
|
@@ -52,8 +52,6 @@ def preprocess_query(question):
|
|
| 52 |
|
| 53 |
# Add normalized versions
|
| 54 |
enhanced_query += f" {doc_type} Р {doc_num}"
|
| 55 |
-
enhanced_query += f" {doc_type}Р {doc_num}"
|
| 56 |
-
enhanced_query += f" {doc_type} {doc_num}"
|
| 57 |
|
| 58 |
return enhanced_query
|
| 59 |
|
|
|
|
| 52 |
|
| 53 |
# Add normalized versions
|
| 54 |
enhanced_query += f" {doc_type} Р {doc_num}"
|
|
|
|
|
|
|
| 55 |
|
| 56 |
return enhanced_query
|
| 57 |
|