MrSimple07 committed on
Commit
f7d949d
·
1 Parent(s): 2370c98

improved the docs prep json txt

Browse files
Files changed (2) hide show
  1. documents_prep.py +67 -16
  2. utils.py +18 -4
documents_prep.py CHANGED
@@ -14,6 +14,10 @@ def extract_text_from_json(data, document_id, document_name):
14
  section_id = section.get('section_id', 'Unknown')
15
  section_text = section.get('section_text', '')
16
 
 
 
 
 
17
  if section_text.strip():
18
  doc = Document(
19
  text=section_text,
@@ -22,73 +26,120 @@ def extract_text_from_json(data, document_id, document_name):
22
  "document_id": document_id,
23
  "document_name": document_name,
24
  "section_id": section_id,
25
- "level": "section"
 
 
 
26
  }
27
  )
28
  documents.append(doc)
29
 
 
30
  if 'subsections' in section:
31
  for subsection in section['subsections']:
32
  subsection_id = subsection.get('subsection_id', 'Unknown')
33
  subsection_text = subsection.get('subsection_text', '')
 
 
34
 
35
  if subsection_text.strip():
 
 
 
36
  doc = Document(
37
- text=subsection_text,
38
  metadata={
39
  "type": "text",
40
  "document_id": document_id,
41
  "document_name": document_name,
42
- "section_id": section_id,
43
- "subsection_id": subsection_id,
44
- "level": "subsection"
 
 
45
  }
46
  )
47
  documents.append(doc)
48
 
 
49
  if 'sub_subsections' in subsection:
50
  for sub_subsection in subsection['sub_subsections']:
51
  sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
52
  sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
 
 
53
 
54
  if sub_subsection_text.strip():
 
 
 
55
  doc = Document(
56
- text=sub_subsection_text,
57
  metadata={
58
  "type": "text",
59
  "document_id": document_id,
60
  "document_name": document_name,
61
- "section_id": section_id,
62
- "subsection_id": subsection_id,
63
- "sub_subsection_id": sub_subsection_id,
64
- "level": "sub_subsection"
 
 
 
 
65
  }
66
  )
67
  documents.append(doc)
68
 
 
69
  if 'sub_sub_subsections' in sub_subsection:
70
  for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
71
  sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
72
  sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
 
73
 
74
  if sub_sub_subsection_text.strip():
 
 
 
75
  doc = Document(
76
- text=sub_sub_subsection_text,
77
  metadata={
78
  "type": "text",
79
  "document_id": document_id,
80
  "document_name": document_name,
81
- "section_id": section_id,
82
- "subsection_id": subsection_id,
83
- "sub_subsection_id": sub_subsection_id,
84
- "sub_sub_subsection_id": sub_sub_subsection_id,
85
- "level": "sub_sub_subsection"
 
 
 
 
86
  }
87
  )
88
  documents.append(doc)
89
 
90
  return documents
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def extract_zip_and_process_json(zip_path):
93
  documents = []
94
 
 
14
  section_id = section.get('section_id', 'Unknown')
15
  section_text = section.get('section_text', '')
16
 
17
+ # Create hierarchical path for better context
18
+ section_path = f"{section_id}"
19
+ section_title = extract_section_title(section_text)
20
+
21
  if section_text.strip():
22
  doc = Document(
23
  text=section_text,
 
26
  "document_id": document_id,
27
  "document_name": document_name,
28
  "section_id": section_id,
29
+ "section_text": section_title, # Store section title
30
+ "section_path": section_path,
31
+ "level": "section",
32
+ "parent_sections": [] # Empty for top level
33
  }
34
  )
35
  documents.append(doc)
36
 
37
+ # Process subsections with inherited context
38
  if 'subsections' in section:
39
  for subsection in section['subsections']:
40
  subsection_id = subsection.get('subsection_id', 'Unknown')
41
  subsection_text = subsection.get('subsection_text', '')
42
+ subsection_title = extract_section_title(subsection_text)
43
+ subsection_path = f"{section_path}.{subsection_id}"
44
 
45
  if subsection_text.strip():
46
+ # Include parent context in the text
47
+ enhanced_text = f"[Раздел {section_id} {section_title}]\n{subsection_text}"
48
+
49
  doc = Document(
50
+ text=enhanced_text,
51
  metadata={
52
  "type": "text",
53
  "document_id": document_id,
54
  "document_name": document_name,
55
+ "section_id": subsection_id,
56
+ "section_text": subsection_title,
57
+ "section_path": subsection_path,
58
+ "level": "subsection",
59
+ "parent_sections": [{"id": section_id, "title": section_title}]
60
  }
61
  )
62
  documents.append(doc)
63
 
64
+ # Process sub_subsections
65
  if 'sub_subsections' in subsection:
66
  for sub_subsection in subsection['sub_subsections']:
67
  sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
68
  sub_subsection_text = sub_subsection.get('sub_subsection_text', '')
69
+ sub_subsection_title = extract_section_title(sub_subsection_text)
70
+ sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
71
 
72
  if sub_subsection_text.strip():
73
+ # Include full hierarchical context
74
+ enhanced_text = f"[Раздел {section_id} {section_title}]\n[Подраздел {subsection_id} {subsection_title}]\n{sub_subsection_text}"
75
+
76
  doc = Document(
77
+ text=enhanced_text,
78
  metadata={
79
  "type": "text",
80
  "document_id": document_id,
81
  "document_name": document_name,
82
+ "section_id": sub_subsection_id,
83
+ "section_text": sub_subsection_title,
84
+ "section_path": sub_subsection_path,
85
+ "level": "sub_subsection",
86
+ "parent_sections": [
87
+ {"id": section_id, "title": section_title},
88
+ {"id": subsection_id, "title": subsection_title}
89
+ ]
90
  }
91
  )
92
  documents.append(doc)
93
 
94
+ # Process sub_sub_subsections
95
  if 'sub_sub_subsections' in sub_subsection:
96
  for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
97
  sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
98
  sub_sub_subsection_text = sub_sub_subsection.get('sub_sub_subsection_text', '')
99
+ sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
100
 
101
  if sub_sub_subsection_text.strip():
102
+ # Full context chain
103
+ enhanced_text = f"[Раздел {section_id} {section_title}]\n[Подраздел {subsection_id} {subsection_title}]\n[Подподраздел {sub_subsection_id} {sub_subsection_title}]\n{sub_sub_subsection_text}"
104
+
105
  doc = Document(
106
+ text=enhanced_text,
107
  metadata={
108
  "type": "text",
109
  "document_id": document_id,
110
  "document_name": document_name,
111
+ "section_id": sub_sub_subsection_id,
112
+ "section_text": sub_sub_subsection_title,
113
+ "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
114
+ "level": "sub_sub_subsection",
115
+ "parent_sections": [
116
+ {"id": section_id, "title": section_title},
117
+ {"id": subsection_id, "title": subsection_title},
118
+ {"id": sub_subsection_id, "title": sub_subsection_title}
119
+ ]
120
  }
121
  )
122
  documents.append(doc)
123
 
124
  return documents
125
 
126
def extract_section_title(section_text):
    """Derive a short title from the first line of a section's text.

    Rules, applied to the first non-empty line of ``section_text``:

    1. A line shorter than 200 characters that does not end with a period
       is treated as a heading and returned unchanged.
    2. Otherwise the first sentence of the line is returned. A sentence ends
       at a period that is followed by whitespace or by the end of the line,
       so dots inside section numbers ("4.1") or decimals ("3.5") do not cut
       the title short. A leading numbering prefix such as "1. " or "4.1.2 "
       is kept as part of the title and never counted as a sentence end.
    3. If no sentence end is found, the line is returned, truncated to
       100 characters plus "..." when it is longer than that.

    Args:
        section_text: Raw text of the section; may be empty or ``None``.

    Returns:
        The extracted title, or "" when there is no usable text.
    """
    # Local import: the file's top-level import block is not visible here.
    import re

    # A JSON null reaches this function as None even when the caller uses
    # dict.get(..., ''), so treat it the same as empty text.
    if not section_text or not section_text.strip():
        return ""

    first_line = section_text.strip().split('\n')[0].strip()

    if len(first_line) < 200 and not first_line.endswith('.'):
        return first_line

    # Skip a leading enumerator ("1. ", "4.1 ", "2.3.1. ") so its dots are
    # not mistaken for the end of the first sentence.
    numbering = re.match(r'\d+(?:\.\d+)*\.?\s+', first_line)
    search_from = numbering.end() if numbering else 0

    # A sentence-ending period is one followed by whitespace or end of line.
    sentence_end = re.compile(r'\.(?:\s|$)').search(first_line, search_from)
    if sentence_end:
        return first_line[:sentence_end.start()].strip()

    return first_line[:100] + "..." if len(first_line) > 100 else first_line
142
+
143
  def extract_zip_and_process_json(zip_path):
144
  documents = []
145
 
utils.py CHANGED
@@ -51,10 +51,24 @@ def format_context_for_llm(nodes):
51
  doc_id = metadata.get('document_id', 'Неизвестный документ')
52
 
53
  section_info = ""
54
- if metadata.get('section_id') and metadata.get('section_text'):
55
- section_info = f"пункт {metadata['section_id']} {metadata['section_text']}"
56
-
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  if metadata.get('type') == 'table' and metadata.get('table_number'):
59
  table_num = metadata['table_number']
60
  if not str(table_num).startswith('№'):
@@ -139,7 +153,7 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=N
139
  log_message(f"Пример узла {i+1}: {retrieved_nodes[i].text[:200]}...")
140
 
141
  log_message("Применяю переранжировку")
142
- reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=15)
143
 
144
  formatted_context = format_context_for_llm(reranked_nodes)
145
  log_message(f"fорматированный контекст для LLM:\n{formatted_context[:500]}...")
 
51
  doc_id = metadata.get('document_id', 'Неизвестный документ')
52
 
53
  section_info = ""
 
 
 
54
 
55
+ # Handle hierarchical section information
56
+ if metadata.get('section_path'):
57
+ section_path = metadata['section_path']
58
+ section_text = metadata.get('section_text', '')
59
+ if section_text:
60
+ section_info = f"пункт {section_path} ({section_text})"
61
+ else:
62
+ section_info = f"пункт {section_path}"
63
+ elif metadata.get('section_id'):
64
+ section_id = metadata['section_id']
65
+ section_text = metadata.get('section_text', '')
66
+ if section_text:
67
+ section_info = f"пункт {section_id} ({section_text})"
68
+ else:
69
+ section_info = f"пункт {section_id}"
70
+
71
+ # Handle tables and images as before
72
  if metadata.get('type') == 'table' and metadata.get('table_number'):
73
  table_num = metadata['table_number']
74
  if not str(table_num).startswith('№'):
 
153
  log_message(f"Пример узла {i+1}: {retrieved_nodes[i].text[:200]}...")
154
 
155
  log_message("Применяю переранжировку")
156
+ reranked_nodes = rerank_nodes(question, retrieved_nodes, reranker, top_k=10)
157
 
158
  formatted_context = format_context_for_llm(reranked_nodes)
159
  log_message(f"fорматированный контекст для LLM:\n{formatted_context[:500]}...")