Commit 7504d82 · Parent: ec64429

max rows = 20, 100 + 100 bm25

Files changed:
- documents_prep.py  +62 -46
- index_retriever.py  +3 -3
documents_prep.py  CHANGED
@@ -38,7 +38,7 @@ def chunk_text_documents(documents):
     return chunked


-def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=…
+def chunk_table_by_rows(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
    """
    Chunk tables by rows with fallback to character limit.
    Keeps 3-4 rows together, but splits individual rows if they're too large.
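The function body is outside the diff context, so only the signature change is visible. A minimal sketch of the behavior the docstring describes, assuming table_data is a list of pre-rendered row strings (the sketch's name, chunk-ID scheme, and row handling are illustrative, not the commit's implementation):

```python
# Hedged sketch of row-based table chunking with a character-limit fallback.
# Assumes table_data is a list of row strings; the real function may differ.
def chunk_table_by_rows_sketch(table_data, doc_id, rows_per_chunk=20, max_chars=2000):
    chunks, current, current_len = [], [], 0
    for row in table_data:
        if len(row) > max_chars:
            # Oversized single row: flush the open chunk, then split the row itself.
            if current:
                chunks.append("\n".join(current))
                current, current_len = [], 0
            chunks.extend(row[i:i + max_chars] for i in range(0, len(row), max_chars))
            continue
        # Start a new chunk once the row budget or the character budget is hit.
        if len(current) >= rows_per_chunk or current_len + len(row) > max_chars:
            chunks.append("\n".join(current))
            current, current_len = [], 0
        current.append(row)
        current_len += len(row) + 1  # account for the joining newline
    if current:
        chunks.append("\n".join(current))
    return [(f"{doc_id}-{n}", text) for n, text in enumerate(chunks)]
```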
@@ -501,53 +501,69 @@ def extract_sections_from_json(json_path):
     return documents


-def …
-    "…
-    log_message("Loading images...")
-    […]
-                    content += f"Раздел: {row.get('Раздел документа', '')}\n"
-    […]
-                        }
-    […]
+def load_image_data(repo_id, hf_token, image_data_dir):
+    log_message("Starting image-data load")
+
+    image_files = []
+    try:
+        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
+        for file in files:
+            if file.startswith(image_data_dir) and file.endswith('.csv'):
+                image_files.append(file)
+
+        log_message(f"Found {len(image_files)} image CSV files")
+
+        image_documents = []
+        for file_path in image_files:
+            try:
+                log_message(f"Processing image file: {file_path}")
+                local_path = hf_hub_download(
+                    repo_id=repo_id,
+                    filename=file_path,
+                    local_dir='',
+                    repo_type="dataset",
+                    token=hf_token
+                )
+
+                df = pd.read_csv(local_path)
+                log_message(f"Loaded {len(df)} image records from {file_path}")
+
+                # Process rows using the correct (Russian) column names;
+                # they must match the CSV headers exactly.
+                for _, row in df.iterrows():
+                    section_value = row.get('Раздел документа', 'Неизвестно')
+
+                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
+                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
+                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"  # the typo is in the source column header
+                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
+                    content += f"Раздел: {section_value}\n"
+                    content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
+
+                    doc = Document(
+                        text=content,
+                        metadata={
+                            "type": "image",
+                            "image_number": str(row.get('№ Изображения', 'unknown')),
+                            "image_title": str(row.get('Название изображения', 'unknown')),
+                            "image_description": str(row.get('Описание изображение', 'unknown')),
+                            "document_id": str(row.get('Обозначение документа', 'unknown')),
+                            "file_path": str(row.get('Файл изображения', 'unknown')),
+                            "section": str(section_value),
+                            "section_id": str(section_value)
+                        }
+                    )
+                    image_documents.append(doc)
+
+            except Exception as e:
+                log_message(f"Error processing file {file_path}: {str(e)}")
+                continue
+
+        log_message(f"Created {len(image_documents)} image documents")
+        return image_documents
+
+    except Exception as e:
+        log_message(f"Error loading image data: {str(e)}")
+        return []
 
 
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     """Main loader - combines all document types"""
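The loader assumes one CSV schema across the image files. A hypothetical one-row file matching the headers the code reads (the header spellings, including the 'Описание изображение' typo, come from the diff; the row values are invented):

```python
import io

import pandas as pd

# Hypothetical sample; only the column headers are taken from the diff.
sample = io.StringIO(
    "№ Изображения,Название изображения,Описание изображение,"
    "Обозначение документа,Раздел документа,Файл изображения\n"
    "1,Diagram,General view,DOC-001,Section 1,img_001.png\n"
)
df = pd.read_csv(sample)
print(df.iloc[0]["Название изображения"])  # -> Diagram
```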
@@ -563,7 +579,7 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
 
     # Load images (no chunking needed)
-    image_docs = …
+    image_docs = load_image_data(repo_id, hf_token, image_dir)
 
     all_docs = text_chunks + table_chunks + image_docs
 
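A hypothetical call to the combined loader; the repo id, token source, and directory names are placeholders, not values from this Space:

```python
import os

# All arguments below are illustrative placeholders.
docs = load_all_documents(
    repo_id="user/dataset",
    hf_token=os.environ.get("HF_TOKEN"),
    json_dir="json",
    table_dir="tables",
    image_dir="images",
)
print(f"Loaded {len(docs)} documents")
```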
index_retriever.py  CHANGED
@@ -108,17 +108,17 @@ def create_query_engine(vector_index):
 
     vector_retriever = VectorIndexRetriever(
         index=vector_index,
-        similarity_top_k=…
+        similarity_top_k=200
     )
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
-        similarity_top_k=…
+        similarity_top_k=200,
         tokenizer=russian_tokenizer  # Add custom tokenizer
 
     )
     hybrid_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
-        similarity_top_k=…
+        similarity_top_k=100,
         num_queries=1
     )
 
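russian_tokenizer is defined elsewhere in index_retriever.py and is not shown in this diff. A minimal stand-in, assuming it only lowercases and extracts Cyrillic/Latin word tokens (the real one may stem or filter stopwords):

```python
import re

# Hypothetical stand-in for russian_tokenizer; not the Space's implementation.
def russian_tokenizer(text: str) -> list[str]:
    # Lowercase, then keep runs of Cyrillic/Latin letters and digits.
    return re.findall(r"[а-яёa-z0-9]+", text.lower())
```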
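With these settings each retriever hands up to 200 candidates to the fusion step, which keeps the top 100. QueryFusionRetriever supports several fusion modes; reciprocal-rank fusion, one common choice, can be sketched as a standalone function (illustrative of the technique, not the library's internals):

```python
# Reciprocal-rank fusion over ranked ID lists: documents ranked highly by
# several retrievers accumulate the largest scores.
def reciprocal_rank_fusion(ranked_lists, k=60, top_n=100):
    scores = {}
    for ranking in ranked_lists:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)[:top_n]

# "b" appears in both lists, so it outranks either list's unique top pick.
print(reciprocal_rank_fusion([["a", "b"], ["b", "c"]], top_n=3))  # ['b', 'a', 'c']
```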