Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Sep 13, 2025

Commit

6c977f5

1 Parent(s): 868adb6

improved chunk size to 2048

Browse files

Files changed (3) hide show

config.py +1 -1
documents_prep.py +67 -84
utils.py +6 -3

config.py CHANGED Viewed

@@ -52,7 +52,7 @@ AVAILABLE_MODELS = {
 DEFAULT_MODEL = "Gemini 2.5 Flash"
-CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 256
 CUSTOM_PROMPT = """

 DEFAULT_MODEL = "Gemini 2.5 Flash"
+CHUNK_SIZE = 2048
 CHUNK_OVERLAP = 256
 CUSTOM_PROMPT = """

documents_prep.py CHANGED Viewed

@@ -14,7 +14,6 @@ def extract_text_from_json(data, document_id, document_name):
             section_id = section.get('section_id', 'Unknown')
             section_text = section.get('section_text', '')
-            # Create hierarchical path for better context
             section_path = f"{section_id}"
             section_title = extract_section_title(section_text)
@@ -26,15 +25,13 @@ def extract_text_from_json(data, document_id, document_name):
                         "document_id": document_id,
                         "document_name": document_name,
                         "section_id": section_id,
-                        "section_text": section_title,  # Store section title
                         "section_path": section_path,
-                        "level": "section",
-                        "parent_sections": []  # Empty for top level
                     }
                 )
                 documents.append(doc)
-            # Process subsections with inherited context
             if 'subsections' in section:
                 for subsection in section['subsections']:
                     subsection_id = subsection.get('subsection_id', 'Unknown')
@@ -43,25 +40,22 @@ def extract_text_from_json(data, document_id, document_name):
                     subsection_path = f"{section_path}.{subsection_id}"
                     if subsection_text.strip():
-                        # Include parent context in the text
-                        enhanced_text = f"[Раздел {section_id} {section_title}]\n{subsection_text}"
                         doc = Document(
-                            text=enhanced_text,
                             metadata={
                                 "type": "text",
                                 "document_id": document_id,
                                 "document_name": document_name,
                                 "section_id": subsection_id,
-                                "section_text": subsection_title,
                                 "section_path": subsection_path,
                                 "level": "subsection",
-                                "parent_sections": [{"id": section_id, "title": section_title}]
                             }
                         )
                         documents.append(doc)
-                    # Process sub_subsections
                     if 'sub_subsections' in subsection:
                         for sub_subsection in subsection['sub_subsections']:
                             sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
@@ -70,28 +64,22 @@ def extract_text_from_json(data, document_id, document_name):
                             sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
                             if sub_subsection_text.strip():
-                                # Include full hierarchical context
-                                enhanced_text = f"[Раздел {section_id} {section_title}]\n[Подраздел {subsection_id} {subsection_title}]\n{sub_subsection_text}"
                                 doc = Document(
-                                    text=enhanced_text,
                                     metadata={
                                         "type": "text",
                                         "document_id": document_id,
                                         "document_name": document_name,
                                         "section_id": sub_subsection_id,
-                                        "section_text": sub_subsection_title,
                                         "section_path": sub_subsection_path,
                                         "level": "sub_subsection",
-                                        "parent_sections": [
-                                            {"id": section_id, "title": section_title},
-                                            {"id": subsection_id, "title": subsection_title}
-                                        ]
                                     }
                                 )
                                 documents.append(doc)
-                            # Process sub_sub_subsections
                             if 'sub_sub_subsections' in sub_subsection:
                                 for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
                                     sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
@@ -99,82 +87,24 @@ def extract_text_from_json(data, document_id, document_name):
                                     sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
                                     if sub_sub_subsection_text.strip():
-                                        # Full context chain
-                                        enhanced_text = f"[Раздел {section_id} {section_title}]\n[Подраздел {subsection_id} {subsection_title}]\n[Подподраздел {sub_subsection_id} {sub_subsection_title}]\n{sub_sub_subsection_text}"
                                         doc = Document(
-                                            text=enhanced_text,
                                             metadata={
                                                 "type": "text",
                                                 "document_id": document_id,
                                                 "document_name": document_name,
                                                 "section_id": sub_sub_subsection_id,
-                                                "section_text": sub_sub_subsection_title,
                                                 "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
                                                 "level": "sub_sub_subsection",
-                                                "parent_sections": [
-                                                    {"id": section_id, "title": section_title},
-                                                    {"id": subsection_id, "title": subsection_title},
-                                                    {"id": sub_subsection_id, "title": sub_subsection_title}
-                                                ]
                                             }
                                         )
                                         documents.append(doc)
     return documents
-def extract_section_title(section_text):
-    if not section_text.strip():
-        return ""
-    lines = section_text.strip().split('\n')
-    first_line = lines[0].strip()
-    if len(first_line) < 200 and not first_line.endswith('.'):
-        return first_line
-    # Otherwise, extract first sentence
-    sentences = first_line.split('.')
-    if len(sentences) > 1:
-        return sentences[0].strip()
-    return first_line[:100] + "..." if len(first_line) > 100 else first_line
-def extract_zip_and_process_json(zip_path):
-    documents = []
-    try:
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            zip_files = zip_ref.namelist()
-            json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
-            log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
-            for json_file in json_files:
-                try:
-                    log_message(f"Обрабатываю файл из архива: {json_file}")
-                    with zip_ref.open(json_file) as f:
-                        json_data = json.load(f)
-                    document_metadata = json_data.get('document_metadata', {})
-                    document_id = document_metadata.get('document_id', 'unknown')
-                    document_name = document_metadata.get('document_name', 'unknown')
-                    docs = extract_text_from_json(json_data, document_id, document_name)
-                    documents.extend(docs)
-                    log_message(f"Извлечено {len(docs)} документов из {json_file}")
-                except Exception as e:
-                    log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
-                    continue
-    except Exception as e:
-        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
-    return documents
 def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
     log_message("Начинаю загрузку JSON документов")
@@ -238,6 +168,59 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return []
 def table_to_document(table_data, document_id=None):
     content = ""

             section_id = section.get('section_id', 'Unknown')
             section_text = section.get('section_text', '')
             section_path = f"{section_id}"
             section_title = extract_section_title(section_text)
                         "document_id": document_id,
                         "document_name": document_name,
                         "section_id": section_id,
+                        "section_text": section_title[:200],
                         "section_path": section_path,
+                        "level": "section"
                     }
                 )
                 documents.append(doc)
             if 'subsections' in section:
                 for subsection in section['subsections']:
                     subsection_id = subsection.get('subsection_id', 'Unknown')
                     subsection_path = f"{section_path}.{subsection_id}"
                     if subsection_text.strip():
                         doc = Document(
+                            text=subsection_text,
                             metadata={
                                 "type": "text",
                                 "document_id": document_id,
                                 "document_name": document_name,
                                 "section_id": subsection_id,
+                                "section_text": subsection_title[:200],
                                 "section_path": subsection_path,
                                 "level": "subsection",
+                                "parent_section": section_id,
+                                "parent_title": section_title[:100]
                             }
                         )
                         documents.append(doc)
                     if 'sub_subsections' in subsection:
                         for sub_subsection in subsection['sub_subsections']:
                             sub_subsection_id = sub_subsection.get('sub_subsection_id', 'Unknown')
                             sub_subsection_path = f"{subsection_path}.{sub_subsection_id}"
                             if sub_subsection_text.strip():
                                 doc = Document(
+                                    text=sub_subsection_text,
                                     metadata={
                                         "type": "text",
                                         "document_id": document_id,
                                         "document_name": document_name,
                                         "section_id": sub_subsection_id,
+                                        "section_text": sub_subsection_title[:200],
                                         "section_path": sub_subsection_path,
                                         "level": "sub_subsection",
+                                        "parent_section": subsection_id,
+                                        "parent_title": subsection_title[:100]
                                     }
                                 )
                                 documents.append(doc)
                             if 'sub_sub_subsections' in sub_subsection:
                                 for sub_sub_subsection in sub_subsection['sub_sub_subsections']:
                                     sub_sub_subsection_id = sub_sub_subsection.get('sub_sub_subsection_id', 'Unknown')
                                     sub_sub_subsection_title = extract_section_title(sub_sub_subsection_text)
                                     if sub_sub_subsection_text.strip():
                                         doc = Document(
+                                            text=sub_sub_subsection_text,
                                             metadata={
                                                 "type": "text",
                                                 "document_id": document_id,
                                                 "document_name": document_name,
                                                 "section_id": sub_sub_subsection_id,
+                                                "section_text": sub_sub_subsection_title[:200],
                                                 "section_path": f"{sub_subsection_path}.{sub_sub_subsection_id}",
                                                 "level": "sub_sub_subsection",
+                                                "parent_section": sub_subsection_id,
+                                                "parent_title": sub_subsection_title[:100]
                                             }
                                         )
                                         documents.append(doc)
     return documents
 def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
     log_message("Начинаю загрузку JSON документов")
     except Exception as e:
         log_message(f"Ошибка загрузки JSON документов: {str(e)}")
         return []
+def extract_section_title(section_text):
+    if not section_text.strip():
+        return ""
+    lines = section_text.strip().split('\n')
+    first_line = lines[0].strip()
+    if len(first_line) < 200 and not first_line.endswith('.'):
+        return first_line
+    # Otherwise, extract first sentence
+    sentences = first_line.split('.')
+    if len(sentences) > 1:
+        return sentences[0].strip()
+    return first_line[:100] + "..." if len(first_line) > 100 else first_line
+def extract_zip_and_process_json(zip_path):
+    documents = []
+    try:
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_files = zip_ref.namelist()
+            json_files = [f for f in zip_files if f.endswith('.json') and not f.startswith('__MACOSX')]
+            log_message(f"Найдено {len(json_files)} JSON файлов в архиве")
+            for json_file in json_files:
+                try:
+                    log_message(f"Обрабатываю файл из архива: {json_file}")
+                    with zip_ref.open(json_file) as f:
+                        json_data = json.load(f)
+                    document_metadata = json_data.get('document_metadata', {})
+                    document_id = document_metadata.get('document_id', 'unknown')
+                    document_name = document_metadata.get('document_name', 'unknown')
+                    docs = extract_text_from_json(json_data, document_id, document_name)
+                    documents.extend(docs)
+                    log_message(f"Извлечено {len(docs)} документов из {json_file}")
+                except Exception as e:
+                    log_message(f"Ошибка обработки файла {json_file}: {str(e)}")
+                    continue
+    except Exception as e:
+        log_message(f"Ошибка извлечения ZIP архива {zip_path}: {str(e)}")
+    return documents
 def table_to_document(table_data, document_id=None):
     content = ""

utils.py CHANGED Viewed

@@ -52,11 +52,15 @@ def format_context_for_llm(nodes):
         section_info = ""
-        # Handle hierarchical section information
         if metadata.get('section_path'):
             section_path = metadata['section_path']
             section_text = metadata.get('section_text', '')
-            if section_text:
                 section_info = f"пункт {section_path} ({section_text})"
             else:
                 section_info = f"пункт {section_path}"
@@ -68,7 +72,6 @@ def format_context_for_llm(nodes):
             else:
                 section_info = f"пункт {section_id}"
-        # Handle tables and images as before
         if metadata.get('type') == 'table' and metadata.get('table_number'):
             table_num = metadata['table_number']
             if not str(table_num).startswith('№'):

         section_info = ""
         if metadata.get('section_path'):
             section_path = metadata['section_path']
             section_text = metadata.get('section_text', '')
+            parent_section = metadata.get('parent_section', '')
+            parent_title = metadata.get('parent_title', '')
+            if metadata.get('level') in ['subsection', 'sub_subsection', 'sub_sub_subsection'] and parent_section and parent_title:
+                section_info = f"пункт {section_path} ({section_text}) в разделе {parent_section} ({parent_title})"
+            elif section_text:
                 section_info = f"пункт {section_path} ({section_text})"
             else:
                 section_info = f"пункт {section_path}"
             else:
                 section_info = f"пункт {section_id}"
         if metadata.get('type') == 'table' and metadata.get('table_number'):
             table_num = metadata['table_number']
             if not str(table_num).startswith('№'):