MrSimple07 committed on
Commit
6c977f5
·
1 Parent(s): 868adb6

improved chunk size to 2048

Browse files
Files changed (3) hide show
  1. config.py +1 -1
  2. documents_prep.py +67 -84
  3. utils.py +6 -3
config.py CHANGED
@@ -52,7 +52,7 @@ AVAILABLE_MODELS = {
52
 
53
  DEFAULT_MODEL = "Gemini 2.5 Flash"
54
 
55
- CHUNK_SIZE = 1500
56
  CHUNK_OVERLAP = 256
57
 
58
  CUSTOM_PROMPT = """
 
52
 
53
  DEFAULT_MODEL = "Gemini 2.5 Flash"
54
 
55
+ CHUNK_SIZE = 2048
56
  CHUNK_OVERLAP = 256
57
 
58
  CUSTOM_PROMPT = """
documents_prep.py CHANGED
@@ -14,7 +14,6 @@ def extract_text_from_json(data, document_id, document_name):
14
  section_id = section.get('section_id', 'Unknown')
15
  section_text = section.get('section_text', '')
16
 
17
- # Create hierarchical path for better context
18
  section_path = f"{section_id}"
19
  section_title = extract_section_title(section_text)
20
 
@@ -26,15 +25,13 @@ def extract_text_from_json(data, document_id, document_name):
26
  "document_id": document_id,
27
  "document_name": document_name,
28
  "section_id": section_id,
29
- "section_text": section_title, # Store section title
30
  "section_path": section_path,
31
- "level": "section",
32
- "parent_sections": [] # Empty for top level
33
  }
34
  )
35
  documents.append(doc)
36
 
37
- # Process subsections with inherited context
38
  if 'subsections' in section:
39
  for subsection in section['subsections']:
40
  subsection_id = subsection.get('subsection_id', 'Unknown')
@@ -43,25 +40,22 @@ def extract_text_from_json(data, document_id, document_name):
43
  subsection_path = f"{section_path}.{subsection_id}"
44
 
45
  if subsection_text.strip():
46
- # Include parent context in the text
47
- enhanced_text = f"[Раздел {section_id} {section_title}]\n{subsection_text}"
48
-
49
  doc = Document(
50
- text=enhanced_text,
51
  metadata={
52
  "type": "text",
53
  "document_id": document_id,
54
  "document_name": document_name,
55
  "section_id": subsection_id,
56
- "section_text": subsection_title,
57
  "section_path": subsection_path,
58
  "level": "subsection",
59
- "parent_sections": [{"id": section_id, "title": section_title}]
 
60
  }
61
  )
62
  documents.append(doc)
63
 
64
- # Process sub_subsections
65
  if 'sub_subsections' in subsection:
66
  for sub_subsection in subsection['sub_subsections']:
67
  sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
@@ -70,28 +64,22 @@ def extract_text_from_json(data, document_id, document_name):
70
  sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
71
 
72
  if sub_subsection_text.strip():
73
- # Include full hierarchical context
74
- enhanced_text = f"[Раздел {section_id} {section_title}]\n[Подраздел {subsection_id} {subsection_title}]\n{sub_subsection_text}"
75
-
76
  doc = Document(
77
- text=enhanced_text,
78
  metadata={
79
  "type": "text",
80
  "document_id": document_id,
81
  "document_name": document_name,
82
  "section_id": sub_subsection_id,
83
- "section_text": sub_subsection_title,
84
  "section_path": sub_subsection_path,
85
  "level": "sub_subsection",
86
- "parent_sections": [
87
- {"id": section_id, "title": section_title},
88
- {"id": subsection_id, "title": subsection_title}
89
- ]
90
  }
91
  )
92
  documents.append(doc)
93
 
94
- # Process sub_sub_subsections
95
  if 'sub_sub_subsections' in sub_subsection:
96
  for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
97
  sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
@@ -99,82 +87,24 @@ def extract_text_from_json(data, document_id, document_name):
99
  sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
100
 
101
  if sub_sub_subsection_text.strip():
102
- # Full context chain
103
- enhanced_text = f"[Раздел {section_id} {section_title}]\n[Подраздел {subsection_id} {subsection_title}]\n[Подподраздел {sub_subsection_id} {sub_subsection_title}]\n{sub_sub_subsection_text}"
104
-
105
  doc = Document(
106
- text=enhanced_text,
107
  metadata={
108
  "type": "text",
109
  "document_id": document_id,
110
  "document_name": document_name,
111
  "section_id": sub_sub_subsection_id,
112
- "section_text": sub_sub_subsection_title,
113
  "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
114
  "level": "sub_sub_subsection",
115
- "parent_sections": [
116
- {"id": section_id, "title": section_title},
117
- {"id": subsection_id, "title": subsection_title},
118
- {"id": sub_subsection_id, "title": sub_subsection_title}
119
- ]
120
  }
121
  )
122
  documents.append(doc)
123
 
124
  return documents
125
 
126
- def extract_section_title(section_text):
127
- if not section_text.strip():
128
- return ""
129
-
130
- lines = section_text.strip().split('\n')
131
- first_line = lines[0].strip()
132
-
133
- if len(first_line) < 200 and not first_line.endswith('.'):
134
- return first_line
135
-
136
- # Otherwise, extract first sentence
137
- sentences = first_line.split('.')
138
- if len(sentences) > 1:
139
- return sentences[0].strip()
140
-
141
- return first_line[:100] + "..." if len(first_line) > 100 else first_line
142
-
143
- def extract_zip_and_process_json(zip_path):
144
- documents = []
145
-
146
- try:
147
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
148
- zip_files = zip_ref.namelist()
149
- json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
150
-
151
- log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
152
-
153
- for json_file in json_files:
154
- try:
155
- log_message(f"Обрабатываю файл из архива: {json_file}")
156
-
157
- with zip_ref.open(json_file) as f:
158
- json_data = json.load(f)
159
-
160
- document_metadata = json_data.get('document_metadata', {})
161
- document_id = document_metadata.get('document_id', 'unknown')
162
- document_name = document_metadata.get('document_name', 'unknown')
163
-
164
- docs = extract_text_from_json(json_data, document_id, document_name)
165
- documents.extend(docs)
166
-
167
- log_message(f"Извлечено {len(docs)} документов из {json_file}")
168
-
169
- except Exception as e:
170
- log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
171
- continue
172
-
173
- except Exception as e:
174
- log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
175
-
176
- return documents
177
-
178
  def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
179
  log_message("Начинаю загрузку JSON документов")
180
 
@@ -238,6 +168,59 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
238
  except Exception as e:
239
  log_message(f"Ошибка загрузки JSON документов: {str(e)}")
240
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  def table_to_document(table_data, document_id=None):
243
  content = ""
 
14
  section_id = section.get('section_id', 'Unknown')
15
  section_text = section.get('section_text', '')
16
 
 
17
  section_path = f"{section_id}"
18
  section_title = extract_section_title(section_text)
19
 
 
25
  "document_id": document_id,
26
  "document_name": document_name,
27
  "section_id": section_id,
28
+ "section_text": section_title[:200],
29
  "section_path": section_path,
30
+ "level": "section"
 
31
  }
32
  )
33
  documents.append(doc)
34
 
 
35
  if 'subsections' in section:
36
  for subsection in section['subsections']:
37
  subsection_id = subsection.get('subsection_id', 'Unknown')
 
40
  subsection_path = f"{section_path}.{subsection_id}"
41
 
42
  if subsection_text.strip():
 
 
 
43
  doc = Document(
44
+ text=subsection_text,
45
  metadata={
46
  "type": "text",
47
  "document_id": document_id,
48
  "document_name": document_name,
49
  "section_id": subsection_id,
50
+ "section_text": subsection_title[:200],
51
  "section_path": subsection_path,
52
  "level": "subsection",
53
+ "parent_section": section_id,
54
+ "parent_title": section_title[:100]
55
  }
56
  )
57
  documents.append(doc)
58
 
 
59
  if 'sub_subsections' in subsection:
60
  for sub_subsection in subsection['sub_subsections']:
61
  sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
 
64
  sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
65
 
66
  if sub_subsection_text.strip():
 
 
 
67
  doc = Document(
68
+ text=sub_subsection_text,
69
  metadata={
70
  "type": "text",
71
  "document_id": document_id,
72
  "document_name": document_name,
73
  "section_id": sub_subsection_id,
74
+ "section_text": sub_subsection_title[:200],
75
  "section_path": sub_subsection_path,
76
  "level": "sub_subsection",
77
+ "parent_section": subsection_id,
78
+ "parent_title": subsection_title[:100]
 
 
79
  }
80
  )
81
  documents.append(doc)
82
 
 
83
  if 'sub_sub_subsections' in sub_subsection:
84
  for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
85
  sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
 
87
  sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
88
 
89
  if sub_sub_subsection_text.strip():
 
 
 
90
  doc = Document(
91
+ text=sub_sub_subsection_text,
92
  metadata={
93
  "type": "text",
94
  "document_id": document_id,
95
  "document_name": document_name,
96
  "section_id": sub_sub_subsection_id,
97
+ "section_text": sub_sub_subsection_title[:200],
98
  "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
99
  "level": "sub_sub_subsection",
100
+ "parent_section": sub_subsection_id,
101
+ "parent_title": sub_subsection_title[:100]
 
 
 
102
  }
103
  )
104
  documents.append(doc)
105
 
106
  return documents
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
109
  log_message("Начинаю загрузку JSON документов")
110
 
 
168
  except Exception as e:
169
  log_message(f"Ошибка загрузки JSON документов: {str(e)}")
170
  return []
171
+
172
+
173
+ def extract_section_title(section_text):
174
+ if not section_text.strip():
175
+ return ""
176
+
177
+ lines = section_text.strip().split('\n')
178
+ first_line = lines[0].strip()
179
+
180
+ if len(first_line) < 200 and not first_line.endswith('.'):
181
+ return first_line
182
+
183
+ # Otherwise, extract first sentence
184
+ sentences = first_line.split('.')
185
+ if len(sentences) > 1:
186
+ return sentences[0].strip()
187
+
188
+ return first_line[:100] + "..." if len(first_line) > 100 else first_line
189
+
190
+ def extract_zip_and_process_json(zip_path):
191
+ documents = []
192
+
193
+ try:
194
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
195
+ zip_files = zip_ref.namelist()
196
+ json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
197
+
198
+ log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
199
+
200
+ for json_file in json_files:
201
+ try:
202
+ log_message(f"Обрабатываю файл из архива: {json_file}")
203
+
204
+ with zip_ref.open(json_file) as f:
205
+ json_data = json.load(f)
206
+
207
+ document_metadata = json_data.get('document_metadata', {})
208
+ document_id = document_metadata.get('document_id', 'unknown')
209
+ document_name = document_metadata.get('document_name', 'unknown')
210
+
211
+ docs = extract_text_from_json(json_data, document_id, document_name)
212
+ documents.extend(docs)
213
+
214
+ log_message(f"Извлечено {len(docs)} документов из {json_file}")
215
+
216
+ except Exception as e:
217
+ log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
218
+ continue
219
+
220
+ except Exception as e:
221
+ log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
222
+
223
+ return documents
224
 
225
  def table_to_document(table_data, document_id=None):
226
  content = ""
utils.py CHANGED
@@ -52,11 +52,15 @@ def format_context_for_llm(nodes):
52
 
53
  section_info = ""
54
 
55
- # Handle hierarchical section information
56
  if metadata.get('section_path'):
57
  section_path = metadata['section_path']
58
  section_text = metadata.get('section_text', '')
59
- if section_text:
 
 
 
 
 
60
  section_info = f"пункт {section_path} ({section_text})"
61
  else:
62
  section_info = f"пункт {section_path}"
@@ -68,7 +72,6 @@ def format_context_for_llm(nodes):
68
  else:
69
  section_info = f"пункт {section_id}"
70
 
71
- # Handle tables and images as before
72
  if metadata.get('type') == 'table' and metadata.get('table_number'):
73
  table_num = metadata['table_number']
74
  if not str(table_num).startswith('№'):
 
52
 
53
  section_info = ""
54
 
 
55
  if metadata.get('section_path'):
56
  section_path = metadata['section_path']
57
  section_text = metadata.get('section_text', '')
58
+ parent_section = metadata.get('parent_section', '')
59
+ parent_title = metadata.get('parent_title', '')
60
+
61
+ if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
62
+ section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
63
+ elif section_text:
64
  section_info = f"пункт {section_path} ({section_text})"
65
  else:
66
  section_info = f"пункт {section_path}"
 
72
  else:
73
  section_info = f"пункт {section_id}"
74
 
 
75
  if metadata.get('type') == 'table' and metadata.get('table_number'):
76
  table_num = metadata['table_number']
77
  if not str(table_num).startswith('№'):