MrSimple07 committed on
Commit
09fe356
·
1 Parent(s): 0b6ee4f

simplest version

Browse files
Files changed (2) hide show
  1. documents_prep.py +3 -4
  2. utils.py +0 -2
documents_prep.py CHANGED
@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
7
  from my_logging import log_message
8
 
9
  # Configuration
10
- CHUNK_SIZE = 1024
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
@@ -39,7 +39,6 @@ def chunk_text_documents(documents):
39
 
40
 
41
  def normalize_doc_id(doc_id):
42
- """Normalize document ID for consistent matching"""
43
  if not doc_id or doc_id == 'unknown':
44
  return doc_id
45
 
@@ -53,7 +52,7 @@ def normalize_doc_id(doc_id):
53
  return doc_id
54
 
55
 
56
- def chunk_table_by_rows(table_data, doc_id, max_rows=30):
57
  headers = table_data.get('headers', [])
58
  rows = table_data.get('data', [])
59
  table_num = table_data.get('table_number', 'unknown')
@@ -103,7 +102,7 @@ def chunk_table_by_rows(table_data, doc_id, max_rows=30):
103
  return [Document(text=content, metadata=metadata)]
104
 
105
  chunks = []
106
- overlap = 3
107
 
108
  for i in range(0, len(rows), max_rows - overlap):
109
  chunk_rows = rows[i:min(i+max_rows, len(rows))]
 
7
  from my_logging import log_message
8
 
9
  # Configuration
10
+ CHUNK_SIZE = 1500
11
  CHUNK_OVERLAP = 256
12
 
13
  def chunk_text_documents(documents):
 
39
 
40
 
41
  def normalize_doc_id(doc_id):
 
42
  if not doc_id or doc_id == 'unknown':
43
  return doc_id
44
 
 
52
  return doc_id
53
 
54
 
55
+ def chunk_table_by_rows(table_data, doc_id, max_rows=5):
56
  headers = table_data.get('headers', [])
57
  rows = table_data.get('data', [])
58
  table_num = table_data.get('table_number', 'unknown')
 
102
  return [Document(text=content, metadata=metadata)]
103
 
104
  chunks = []
105
+ overlap = 2
106
 
107
  for i in range(0, len(rows), max_rows - overlap):
108
  chunk_rows = rows[i:min(i+max_rows, len(rows))]
utils.py CHANGED
@@ -52,8 +52,6 @@ def preprocess_query(question):
52
 
53
  # Add normalized versions
54
  enhanced_query += f" {doc_type} Р {doc_num}"
55
- enhanced_query += f" {doc_type}Р {doc_num}"
56
- enhanced_query += f" {doc_type} {doc_num}"
57
 
58
  return enhanced_query
59
 
 
52
 
53
  # Add normalized versions
54
  enhanced_query += f" {doc_type} Р {doc_num}"
 
 
55
 
56
  return enhanced_query
57