Spaces:

MrSimple01
/

RAG_AIEXP_01

Sleeping

App Files Files Community

MrSimple07 committed on Oct 6, 2025

Commit

2eb8b63

1 Parent(s): 2b217eb

chunk size = 2048 + rows=15

Browse files

Files changed (2) hide show

documents_prep.py +46 -62
index_retriever.py +19 -15

documents_prep.py CHANGED Viewed

@@ -7,7 +7,7 @@ from llama_index.core.text_splitter import SentenceSplitter
 from my_logging import log_message
 # Configuration
-CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 128
 def chunk_text_documents(documents):
@@ -501,69 +501,53 @@ def extract_sections_from_json(json_path):
     return documents
-def load_image_data(repo_id, hf_token, image_data_dir):
-    log_message("Начинаю загрузку данных изображений")
-    image_files = []
-    try:
-        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
-        for file in files:
-            if file.startswith(image_data_dir) and file.endswith('.csv'):
-                image_files.append(file)
-        log_message(f"Найдено {len(image_files)} CSV файлов с изображениями")
-        image_documents = []
-        for file_path in image_files:
-            try:
-                log_message(f"Обрабатываю файл изображений: {file_path}")
-                local_path = hf_hub_download(
-                    repo_id=repo_id,
-                    filename=file_path,
-                    local_dir='',
-                    repo_type="dataset",
-                    token=hf_token
-                )
-                df = pd.read_csv(local_path)
-                log_message(f"Загружено {len(df)} записей изображений из файла {file_path}")
-                # Обработка с правильными названиями колонок
-                for _, row in df.iterrows():
-                    section_value = row.get('Раздел документа', 'Неизвестно')
-                    content = f"Изображение: {row.get('№ Изображения', 'Неизвестно')}\n"
-                    content += f"Название: {row.get('Название изображения', 'Неизвестно')}\n"
-                    content += f"Описание: {row.get('Описание изображение', 'Неизвестно')}\n"  # Опечатка в названии колонки
-                    content += f"Документ: {row.get('Обозначение документа', 'Неизвестно')}\n"
-                    content += f"Раздел: {section_value}\n"
-                    content += f"Файл: {row.get('Файл изображения', 'Неизвестно')}\n"
-                    doc = Document(
-                        text=content,
-                        metadata={
-                            "type": "image",
-                            "image_number": str(row.get('№ Изображения', 'unknown')),
-                            "image_title": str(row.get('Название изображения', 'unknown')),
-                            "image_description": str(row.get('Описание изображение', 'unknown')),
-                            "document_id": str(row.get('Обозначение документа', 'unknown')),
-                            "file_path": str(row.get('Файл изображения', 'unknown')),
-                            "section": str(section_value),
-                            "section_id": str(section_value)
-                        }
-                    )
-                    image_documents.append(doc)
-            except Exception as e:
-                log_message(f"Ошибка обработки файла {file_path}: {str(e)}")
-                continue
-        log_message(f"Создано {len(image_documents)} документов из изображений")
-        return image_documents
-    except Exception as e:
-        log_message(f"Ошибка загрузки данных изображений: {str(e)}")
-        return []
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     """Main loader - combines all document types"""
@@ -579,7 +563,7 @@ def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
     # Load images (no chunking needed)
-    image_docs = load_image_data(repo_id, hf_token, image_dir)
     all_docs = text_chunks + table_chunks + image_docs

 from my_logging import log_message
 # Configuration
+CHUNK_SIZE = 1024
 CHUNK_OVERLAP = 128
 def chunk_text_documents(documents):
     return documents
def _image_cell(row, column, default=''):
    """Return ``row[column]``, or *default* if the column is absent or the cell is empty."""
    value = row.get(column, default)
    # pandas represents blank CSV cells as NaN; without this check the literal
    # text "nan" would be written into the indexed content and the metadata.
    if pd.isna(value):
        return default
    return value


def load_image_documents(repo_id, hf_token, image_dir):
    """Load image descriptions from the dataset's CSV files as documents.

    Lists the files of the dataset repo ``repo_id``, downloads every ``.csv``
    whose path starts with ``image_dir`` and turns each row into one
    ``Document`` of type ``'image'``.

    Args:
        repo_id: Dataset repository identifier passed to the hub calls.
        hf_token: Access token passed to the hub calls.
        image_dir: Path prefix that selects the image CSV files.

    Returns:
        A list of ``Document`` objects. The list is empty when the repo
        listing fails or no CSV rows could be read; failures are logged
        rather than raised, so the other document types can still load.
    """
    log_message("Loading images...")

    # Listing is best-effort, like the per-file handling below: one failed
    # listing must not abort the whole document load.
    try:
        files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
    except Exception as e:
        log_message(f"Error listing image files in {repo_id}: {e}")
        return []

    csv_files = [f for f in files if f.startswith(image_dir) and f.endswith('.csv')]

    documents = []
    for file_path in csv_files:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=file_path,
                repo_type="dataset",
                token=hf_token
            )
            df = pd.read_csv(local_path)

            for _, row in df.iterrows():
                # Read every cell once so text and metadata always agree.
                document_id = _image_cell(row, 'Обозначение документа', 'unknown')
                image_number = _image_cell(row, '№ Изображения', 'unknown')
                title = _image_cell(row, 'Название изображения', '')
                # The column name 'Описание изображение' is spelled as in the CSV header.
                description = _image_cell(row, 'Описание изображение', '')
                section = _image_cell(row, 'Раздел документа', '')

                content = (
                    f"Документ: {document_id}\n"
                    f"Рисунок: {image_number}\n"
                    f"Название: {title}\n"
                    f"Описание: {description}\n"
                    f"Раздел: {section}\n"
                )

                documents.append(Document(
                    text=content,
                    metadata={
                        'type': 'image',
                        'document_id': str(document_id),
                        'image_number': str(image_number),
                        'section': str(section),
                        'chunk_size': len(content)
                    }
                ))
        except Exception as e:
            # Skip the broken file and keep going with the remaining ones.
            log_message(f"Error loading {file_path}: {e}")

    if documents:
        avg_size = sum(d.metadata['chunk_size'] for d in documents) / len(documents)
        log_message(f"✓ Loaded {len(documents)} images (avg size: {avg_size:.0f} chars)")
    else:
        log_message(f"No image documents loaded from {len(csv_files)} CSV file(s)")

    return documents
 def load_all_documents(repo_id, hf_token, json_dir, table_dir, image_dir):
     """Main loader - combines all document types"""
     table_chunks = load_table_documents(repo_id, hf_token, table_dir)
     # Load images (no chunking needed)
+    image_docs = load_image_documents(repo_id, hf_token, image_dir)
     all_docs = text_chunks + table_chunks + image_docs

index_retriever.py CHANGED Viewed

@@ -31,21 +31,20 @@ def keyword_filter_nodes(query, nodes, min_keyword_matches=1):
 def normalize_doc_id(doc_id: str) -> str:
-    """Normalize document ID - KEEP dots for numeric parts"""
     doc_id = doc_id.upper().strip()
-    doc_id = re.sub(r'\s+', '', doc_id)  # Remove spaces only
     doc_id = doc_id.replace("ГОСТР", "ГОСТ")
     doc_id = doc_id.replace("GOSTR", "ГОСТ")
     return doc_id
 def base_number(doc_id: str) -> str:
-    """Extract full numeric pattern including all parts (e.g., '59023.6' from 'ГОСТ 59023.6')"""
-    # Match: 59023.6 or 59023.4 or 50.05.01 etc.
-    m = re.search(r'(\d+(?:\.\d+)*)', doc_id)
     return m.group(1) if m else ""
-def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
-    """Filter nodes by document ID with strict numeric matching"""
     if not doc_ids:
         return nodes
@@ -58,17 +57,22 @@ def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.85):
         node_base = base_number(node_doc_id)
         for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
-            # STRICT: base number must match exactly
             if q_base and node_base and q_base == node_base:
                 filtered.append(node)
                 break
-            # STRICT: full normalized ID must match exactly or have very high similarity
-            elif SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
                 filtered.append(node)
                 break
-    return filtered if filtered else nodes
 def extract_doc_id_from_query(query):
@@ -108,17 +112,17 @@ def create_query_engine(vector_index):
     vector_retriever = VectorIndexRetriever(
         index=vector_index,
-        similarity_top_k=200
     )
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
-        similarity_top_k=200,
         tokenizer=russian_tokenizer  # Add custom tokenizer
     )
     hybrid_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
-        similarity_top_k=100,
         num_queries=1
     )

 def normalize_doc_id(doc_id: str) -> str:
+    """Normalize document ID for consistent comparison."""
     doc_id = doc_id.upper().strip()
+    doc_id = re.sub(r'[^\w\d\.]+', '', doc_id)  # remove spaces, dashes, etc.
     doc_id = doc_id.replace("ГОСТР", "ГОСТ")
     doc_id = doc_id.replace("GOSTR", "ГОСТ")
     return doc_id
 def base_number(doc_id: str) -> str:
+    """Extract base numeric pattern (e.g., '59023.4' from 'ГОСТ Р 59023.4-2020')."""
+    m = re.search(r'(\d+(?:\.\d+)+)', doc_id)
     return m.group(1) if m else ""
+def filter_nodes_by_doc_id(nodes, doc_ids, threshold=0.75):
+    """Filter nodes by normalized document ID with fallback to fuzzy numeric match."""
     if not doc_ids:
         return nodes
         node_base = base_number(node_doc_id)
         for q_doc, q_base in zip(doc_ids_norm, doc_ids_base):
+            # Strong match: same base number (e.g., 59023.4)
             if q_base and node_base and q_base == node_base:
                 filtered.append(node)
                 break
+            # Medium match: similarity ratio > threshold
+            if SequenceMatcher(None, node_doc_id, q_doc).ratio() >= threshold:
                 filtered.append(node)
                 break
+            # Weak fallback: contains or partial substring
+            if q_base in node_doc_id or q_doc in node_doc_id:
+                filtered.append(node)
+                break
+    return filtered if filtered else nodes  # Fallback: keep all if none matched
 def extract_doc_id_from_query(query):
     vector_retriever = VectorIndexRetriever(
         index=vector_index,
+        similarity_top_k=100
     )
     bm25_retriever = BM25Retriever.from_defaults(
         docstore=vector_index.docstore,
+        similarity_top_k=100,
         tokenizer=russian_tokenizer  # Add custom tokenizer
     )
     hybrid_retriever = QueryFusionRetriever(
         [vector_retriever, bm25_retriever],
+        similarity_top_k=60,
         num_queries=1
     )