Commit · 5fc122f
Parent(s): f0cb4f3
new documents_prep

documents_prep.py CHANGED (+173 -287)
@@ -1,12 +1,50 @@
 import json
 import zipfile
+import os
 import pandas as pd
 from huggingface_hub import hf_hub_download, list_repo_files
 from llama_index.core import Document
 from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 from config import CHUNK_SIZE, CHUNK_OVERLAP
-
+
+def process_documents_with_chunking(documents):
+    if not documents:
+        return [], []
+
+    log_message(f"Чанкинг {len(documents)} документов")
+
+    text_splitter = SentenceSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP
+    )
+
+    chunked_docs = []
+    chunk_info = []
+
+    for doc in documents:
+        chunks = text_splitter.get_nodes_from_documents([doc])
+
+        for chunk in chunks:
+            chunked_docs.append(chunk)
+
+            metadata = doc.metadata.copy()
+            chunk_info.append({
+                'document_id': metadata.get('document_id', 'unknown'),
+                'section_id': metadata.get('section_id', 'unknown'),
+                'section_path': metadata.get('section_path', ''),
+                'section_text': metadata.get('section_text', ''),
+                'parent_section': metadata.get('parent_section', ''),
+                'parent_title': metadata.get('parent_title', ''),
+                'level': metadata.get('level', ''),
+                'chunk_text': chunk.text,
+                'type': metadata.get('type', 'text'),
+                'table_number': metadata.get('table_number', ''),
+                'image_number': metadata.get('image_number', '')
+            })
+
+    log_message(f"Создано {len(chunked_docs)} чанков")
+    return chunked_docs, chunk_info
 
 def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
     log_message(f"Загрузка JSON документов из {json_files_dir}")
@@ -15,27 +53,27 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
     chunk_info = []
 
     try:
-        files = list_repo_files(repo_id, token=hf_token)
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
         zip_files = [f for f in files if f.startswith(json_files_dir) and f.endswith('.zip')]
 
-        log_message(f"Найдено {len(zip_files)} ZIP
+        log_message(f"Найдено {len(zip_files)} ZIP архивов")
 
         for zip_file in zip_files:
+            log_message(f"Загружаю архив: {zip_file}")
+
             zip_path = hf_hub_download(
                 repo_id=repo_id,
                 filename=zip_file,
-
+                local_dir=download_dir,
                 repo_type="dataset",
-
+                token=hf_token
             )
 
-            log_message(f"Обрабатываю архив: {zip_file}")
-
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                 json_files = [f for f in zip_ref.namelist()
                               if f.endswith('.json') and not f.startswith('__MACOSX')]
 
-                log_message(f"Найдено {len(json_files)} JSON файлов в
+                log_message(f"Найдено {len(json_files)} JSON файлов в {zip_file}")
 
                 for json_file in json_files:
                     try:
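Both hub calls in this hunk now pass repo_type="dataset" and the token explicitly, and hf_hub_download gains local_dir=download_dir. The same pattern in isolation; the repo id, token, and paths below are placeholders:

from huggingface_hub import hf_hub_download, list_repo_files

files = list_repo_files(repo_id="org/dataset-name", repo_type="dataset", token="hf_xxx")
zip_files = [f for f in files if f.endswith('.zip')]

zip_path = hf_hub_download(
    repo_id="org/dataset-name",
    filename=zip_files[0],
    local_dir="./downloads",  # files land here instead of only in the HF cache
    repo_type="dataset",
    token="hf_xxx",
)
print(zip_path)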
@@ -45,68 +83,60 @@ def load_json_documents(repo_id, hf_token, json_files_dir, download_dir):
                         doc_id = json_data.get('document_id', os.path.basename(json_file))
                         sections = json_data.get('sections', [])
 
-                        log_message(f"Обработка документа {doc_id}: {len(sections)} разделов")
-
                         for section in sections:
-
-                            if
-
-
+                            text = section.get('text', '').strip()
+                            if not text:
+                                continue
+
+                            metadata = {
+                                'document_id': doc_id,
+                                'section_id': section.get('section_id', ''),
+                                'section_path': section.get('section_path', ''),
+                                'section_text': section.get('section_text', ''),
+                                'parent_section': section.get('parent_section', ''),
+                                'parent_title': section.get('parent_title', ''),
+                                'level': section.get('level', ''),
+                                'type': 'text'
+                            }
+
+                            doc = Document(text=text, metadata=metadata)
+                            documents.append(doc)
 
                     except Exception as e:
-                        log_message(f"Ошибка
+                        log_message(f"Ошибка обработки {json_file}: {str(e)}")
+
+        log_message(f"Всего загружено {len(documents)} текстовых документов")
+
+        if documents:
+            chunked_docs, chunk_info = process_documents_with_chunking(documents)
+            return chunked_docs, chunk_info
+
+        return [], []
 
-        log_message(f"Загружено {len(documents)} текстовых документов")
-        return documents, chunk_info
-
     except Exception as e:
         log_message(f"Ошибка загрузки JSON: {str(e)}")
         return [], []
 
-def
-
-    section_path = section.get('section_path', '')
-    section_text = section.get('section_text', '')
-    section_content = section.get('section_content', '')
-    parent_section = section.get('parent_section', '')
-    parent_title = section.get('parent_title', '')
-    level = section.get('level', 'section')
-
-    full_text = f"{section_text}\n{section_content}".strip()
+def chunk_large_table(table_text, table_number, table_title, doc_id, max_tokens=1500):
+    chunks = []
 
-
-
+    lines = table_text.split('\n')
+    header_lines = [l for l in lines[:5] if l.strip()]
+    data_lines = [l for l in lines if l.strip() and l not in header_lines]
 
-
-
-        'section_id': section_id,
-        'section_path': section_path,
-        'section_text': section_text,
-        'parent_section': parent_section,
-        'parent_title': parent_title,
-        'level': level,
-        'type': 'text',
-        'chunk_text': full_text
-    }
+    if len(table_text) < max_tokens:
+        return [table_text]
 
-
-        text=full_text,
-        metadata=metadata
-    )
+    chunk_size = max(30, len(data_lines) // ((len(table_text) // max_tokens) + 1))
 
-
-
-
-        '
-        '
-
-        'parent_title': parent_title,
-        'level': level,
-        'type': 'text',
-        'chunk_text': full_text
-    }
+    for i in range(0, len(data_lines), chunk_size):
+        chunk_data = data_lines[i:i+chunk_size]
+        chunk_text = f"Таблица {table_number} - {table_title}\n"
+        chunk_text += '\n'.join(header_lines) + '\n'
+        chunk_text += '\n'.join(chunk_data)
+        chunks.append(chunk_text)
 
-    return
+    return chunks
 
 def load_table_data(repo_id, hf_token, table_data_dir):
     log_message(f"Загрузка табличных данных из {table_data_dir}")
@@ -114,299 +144,155 @@ def load_table_data(repo_id, hf_token, table_data_dir):
     documents = []
 
     try:
-        files = list_repo_files(repo_id, token=hf_token)
-
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
 
-        log_message(f"Найдено {len(
+        log_message(f"Найдено {len(table_files)} файлов таблиц")
 
-        for
+        for table_file in table_files:
            try:
                file_path = hf_hub_download(
                    repo_id=repo_id,
-                    filename=
-
-
+                    filename=table_file,
+                    repo_type="dataset",
+                    token=hf_token
                )
 
                with open(file_path, 'r', encoding='utf-8') as f:
                    table_data = json.load(f)
 
-
-
+                doc_id = table_data.get('document_id', '')
+                table_number = table_data.get('table_number', '')
+                table_title = table_data.get('table_title', '')
+
+                table_text = f"Таблица {table_number} - {table_title}\n"
+
+                if 'headers' in table_data:
+                    table_text += "Заголовки: " + " | ".join(table_data['headers']) + "\n"
+
+                if 'data' in table_data:
+                    for row in table_data['data']:
+                        if isinstance(row, list):
+                            table_text += " | ".join(str(cell) for cell in row) + "\n"
+                        elif isinstance(row, dict):
+                            table_text += " | ".join(f"{k}: {v}" for k, v in row.items()) + "\n"
+
+                chunks = chunk_large_table(table_text, table_number, table_title, doc_id)
+
+                for idx, chunk_text in enumerate(chunks):
+                    metadata = {
+                        'document_id': doc_id,
+                        'table_number': table_number,
+                        'table_title': table_title,
+                        'type': 'table',
+                        'chunk_index': idx,
+                        'section_id': f"table_{table_number}",
+                        'section_path': f"Таблица {table_number}"
+                    }
+
+                    doc = Document(text=chunk_text, metadata=metadata)
                    documents.append(doc)
 
            except Exception as e:
-                log_message(f"Ошибка
+                log_message(f"Ошибка обработки таблицы {table_file}: {str(e)}")
 
        log_message(f"Загружено {len(documents)} табличных документов")
        return documents
-
+
    except Exception as e:
        log_message(f"Ошибка загрузки таблиц: {str(e)}")
        return []
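load_table_data now flattens each table JSON into pipe-separated text and defers splitting to the new chunk_large_table above; a table stays whole when its flattened text is shorter than max_tokens characters (1500 by default). A small sketch of that path, with invented table contents and documents_prep assumed importable:

from documents_prep import chunk_large_table

headers = ["Параметр", "Значение"]
rows = [["Длина", "10 м"], ["Масса", "2 т"]]

# Flatten the table the same way load_table_data does.
table_text = "Таблица 3 - Параметры\n"
table_text += "Заголовки: " + " | ".join(headers) + "\n"
for row in rows:
    table_text += " | ".join(str(cell) for cell in row) + "\n"

chunks = chunk_large_table(table_text, "3", "Параметры", "doc-1")
print(len(chunks))  # 1: short tables come back as a single chunk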
 
-def create_table_document(table_data):
-    doc_id = table_data.get('document_id', 'unknown')
-    table_number = table_data.get('table_number', 'unknown')
-    table_title = table_data.get('table_title', '')
-    section = table_data.get('section', '')
-    headers = table_data.get('headers', [])
-    data = table_data.get('data', [])
-
-    if not data:
-        return None
-
-    token_count = estimate_tokens(str(table_data))
-
-    if token_count < 2000:
-        text = format_table_as_text(table_number, table_title, section, headers, data)
-
-        metadata = {
-            'document_id': doc_id,
-            'table_number': table_number,
-            'table_title': table_title,
-            'section': section,
-            'type': 'table',
-            'headers': str(headers),
-            'row_count': len(data)
-        }
-
-        return Document(text=text, metadata=metadata)
-    else:
-        return create_chunked_table_document(
-            doc_id, table_number, table_title, section, headers, data
-        )
-
-def create_chunked_table_document(doc_id, table_number, table_title, section, headers, data, rows_per_chunk=30):
-    chunks = []
-
-    for i in range(0, len(data), rows_per_chunk):
-        chunk_rows = data[i:i+rows_per_chunk]
-
-        text = format_table_as_text(
-            table_number,
-            table_title,
-            section,
-            headers,
-            chunk_rows,
-            chunk_info=f"строки {i+1}-{i+len(chunk_rows)}"
-        )
-
-        metadata = {
-            'document_id': doc_id,
-            'table_number': table_number,
-            'table_title': table_title,
-            'section': section,
-            'type': 'table',
-            'headers': str(headers),
-            'chunk_index': i // rows_per_chunk,
-            'row_start': i,
-            'row_end': i + len(chunk_rows),
-            'row_count': len(chunk_rows)
-        }
-
-        chunks.append(Document(text=text, metadata=metadata))
-
-    return chunks[0] if len(chunks) == 1 else chunks
-
-def format_table_as_text(table_number, table_title, section, headers, data, chunk_info=""):
-    text_parts = []
-
-    text_parts.append(f"Таблица {table_number}")
-    if table_title:
-        text_parts.append(f"Название: {table_title}")
-    if section:
-        text_parts.append(f"Раздел: {section}")
-    if chunk_info:
-        text_parts.append(f"({chunk_info})")
-
-    text_parts.append(f"\nЗаголовки: {', '.join(headers)}")
-    text_parts.append("\nДанные:")
-
-    for row in data[:100]:
-        row_text = " | ".join([str(cell) for cell in row])
-        text_parts.append(row_text)
-
-    return "\n".join(text_parts)
-
 def load_image_data(repo_id, hf_token, image_data_dir):
     log_message(f"Загрузка данных изображений из {image_data_dir}")
 
     documents = []
 
     try:
-        files = list_repo_files(repo_id, token=hf_token)
-
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        image_files = [f for f in files if f.startswith(image_data_dir) and f.endswith('.json')]
 
-        log_message(f"Найдено {len(
+        log_message(f"Найдено {len(image_files)} файлов изображений")
 
-        for
+        for image_file in image_files:
            try:
                file_path = hf_hub_download(
                    repo_id=repo_id,
-                    filename=
-
-
+                    filename=image_file,
+                    repo_type="dataset",
+                    token=hf_token
                )
 
                with open(file_path, 'r', encoding='utf-8') as f:
                    image_data = json.load(f)
 
-
-
-
+                doc_id = image_data.get('document_id', '')
+                image_number = image_data.get('image_number', '')
+                image_title = image_data.get('image_title', '')
+                image_description = image_data.get('image_description', '')
+
+                text = f"Рисунок {image_number} - {image_title}\n"
+                if image_description:
+                    text += f"Описание: {image_description}"
+
+                metadata = {
+                    'document_id': doc_id,
+                    'image_number': image_number,
+                    'image_title': image_title,
+                    'type': 'image',
+                    'section_id': f"image_{image_number}",
+                    'section_path': f"Рисунок {image_number}"
+                }
+
+                doc = Document(text=text, metadata=metadata)
+                documents.append(doc)
 
            except Exception as e:
-                log_message(f"Ошибка
+                log_message(f"Ошибка обработки изображения {image_file}: {str(e)}")
 
        log_message(f"Загружено {len(documents)} документов изображений")
        return documents
-
+
    except Exception as e:
        log_message(f"Ошибка загрузки изображений: {str(e)}")
        return []
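Image records keep the one-Document-per-file shape, but the text is now a two-line caption and the metadata gains section_id/section_path aliases so images can be referenced like sections. A sketch of the transformation with invented values:

image_data = {
    "document_id": "doc-1",
    "image_number": "5",
    "image_title": "Схема установки",
    "image_description": "Общий вид экспериментальной установки.",
}

# Same template as the new load_image_data body.
text = f"Рисунок {image_data['image_number']} - {image_data['image_title']}\n"
if image_data["image_description"]:
    text += f"Описание: {image_data['image_description']}"

print(text)
# Рисунок 5 - Схема установки
# Описание: Общий вид экспериментальной установки.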
 
-def create_image_document(image_data):
-    doc_id = image_data.get('document_id', 'unknown')
-    image_number = image_data.get('image_number', 'unknown')
-    image_title = image_data.get('image_title', '')
-    image_description = image_data.get('image_description', '')
-    section = image_data.get('section', '')
-
-    text_parts = []
-    text_parts.append(f"Рисунок {image_number}")
-    if image_title:
-        text_parts.append(f"Название: {image_title}")
-    if section:
-        text_parts.append(f"Раздел: {section}")
-    if image_description:
-        text_parts.append(f"Описание: {image_description}")
-
-    text = "\n".join(text_parts)
-
-    metadata = {
-        'document_id': doc_id,
-        'image_number': image_number,
-        'image_title': image_title,
-        'section': section,
-        'type': 'image'
-    }
-
-    return Document(text=text, metadata=metadata)
-
 def load_csv_chunks(repo_id, hf_token, chunks_filename, download_dir):
     log_message(f"Загрузка CSV чанков из {chunks_filename}")
 
+    documents = []
+    chunks_df = None
+
     try:
         csv_path = hf_hub_download(
             repo_id=repo_id,
             filename=chunks_filename,
-
+            local_dir=download_dir,
             repo_type="dataset",
-
+            token=hf_token
         )
 
-
-        log_message(f"Загружено {len(
+        chunks_df = pd.read_csv(csv_path)
+        log_message(f"Загружено {len(chunks_df)} строк из CSV")
 
-
-
+        for _, row in chunks_df.iterrows():
+            text = row.get('chunk_text', '')
+            if not text:
+                continue
+
            metadata = {
                'document_id': row.get('document_id', 'unknown'),
-                'section_id': row.get('section_id', '
+                'section_id': row.get('section_id', ''),
                'section_path': row.get('section_path', ''),
                'type': 'text'
            }
 
-
-
-            doc = Document(text=text, metadata=metadata)
-            documents.append(doc)
+            doc = Document(text=text, metadata=metadata)
+            documents.append(doc)
 
        log_message(f"Создано {len(documents)} документов из CSV")
-        return documents,
-
+        return documents, chunks_df
+
    except Exception as e:
        log_message(f"Ошибка загрузки CSV: {str(e)}")
-        return [], None
+        return [], None
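load_csv_chunks now returns the DataFrame alongside the documents and skips rows whose chunk_text is empty. A matching input CSV can be produced like this; column names come from the diff, the contents are invented:

import pandas as pd

pd.DataFrame([
    {"document_id": "doc-1", "section_id": "2.1",
     "section_path": "2 > 2.1", "chunk_text": "Готовый текст чанка."},
]).to_csv("chunks.csv", index=False)

# load_csv_chunks(repo_id, hf_token, "chunks.csv", "./downloads") would then
# yield one Document per non-empty chunk_text row, plus the raw DataFrame.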
-
-def process_documents_with_chunking(documents):
-    log_message(f"Чанкинг {len(documents)} документов")
-
-    text_splitter = SentenceSplitter(
-        chunk_size=CHUNK_SIZE,
-        chunk_overlap=CHUNK_OVERLAP,
-        separator=" ",
-        backup_separators=["\n", ".", "!", "?"]
-    )
-
-    chunked_documents = []
-    chunk_info = []
-
-    for doc in documents:
-        doc_type = doc.metadata.get('type', 'text')
-
-        if doc_type == 'table':
-            if isinstance(doc, list):
-                chunked_documents.extend(doc)
-                for d in doc:
-                    chunk_info.append(create_chunk_info(d))
-            else:
-                chunked_documents.append(doc)
-                chunk_info.append(create_chunk_info(doc))
-
-        elif doc_type == 'image':
-            chunked_documents.append(doc)
-            chunk_info.append(create_chunk_info(doc))
-
-        else:
-            token_count = estimate_tokens(doc.text)
-
-            if token_count <= CHUNK_SIZE:
-                chunked_documents.append(doc)
-                chunk_info.append(create_chunk_info(doc))
-            else:
-                nodes = text_splitter.get_nodes_from_documents([doc])
-
-                for node in nodes:
-                    new_doc = Document(
-                        text=node.text,
-                        metadata=doc.metadata
-                    )
-                    chunked_documents.append(new_doc)
-                    chunk_info.append(create_chunk_info(new_doc))
-
-    log_message(f"Получено {len(chunked_documents)} чанков после обработки")
-    return chunked_documents, chunk_info
-
-def create_chunk_info(doc):
-    metadata = doc.metadata
-
-    info = {
-        'document_id': metadata.get('document_id', 'unknown'),
-        'type': metadata.get('type', 'text'),
-        'chunk_text': doc.text[:500]
-    }
-
-    if metadata.get('type') == 'table':
-        info['table_number'] = metadata.get('table_number', 'unknown')
-        info['table_title'] = metadata.get('table_title', '')
-        info['section'] = metadata.get('section', '')
-
-    elif metadata.get('type') == 'image':
-        info['image_number'] = metadata.get('image_number', 'unknown')
-        info['image_title'] = metadata.get('image_title', '')
-        info['section'] = metadata.get('section', '')
-
-    else:
-        info['section_id'] = metadata.get('section_id', 'unknown')
-        info['section_path'] = metadata.get('section_path', '')
-        info['section_text'] = metadata.get('section_text', '')
-        info['parent_section'] = metadata.get('parent_section', '')
-        info['parent_title'] = metadata.get('parent_title', '')
-        info['level'] = metadata.get('level', 'section')
-
-    return info
-
-def estimate_tokens(text):
-    return len(text.split()) * 1.3
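The block deleted above was the old chunking pipeline: it routed tables and images around the splitter and only split text documents whose estimated token count exceeded CHUNK_SIZE. Its token estimate, reproduced here for reference:

# The removed heuristic: roughly 1.3 tokens per whitespace-separated word.
# The new pipeline drops the estimate and runs every document through
# SentenceSplitter instead.
def estimate_tokens(text):
    return len(text.split()) * 1.3

print(estimate_tokens("четыре слова в строке"))  # 4 * 1.3 = 5.2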