MrSimple07 committed
Commit a42e1ff · 1 Parent(s): 40de98c

old state with utils

Files changed (3):
  1. index_retriever.py +15 -50
  2. table_prep.py +43 -119
  3. utils.py +7 -31
index_retriever.py CHANGED
@@ -12,7 +12,7 @@ def create_vector_index(documents):
     log_message("Строю векторный индекс")
     return VectorStoreIndex.from_documents(documents)
 
-def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
+def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5):
     if not nodes or not reranker:
         return nodes[:top_k]
 
@@ -25,53 +25,16 @@ def rerank_nodes(query, nodes, reranker, top_k=25, min_score_threshold=0.5, diversity_penalty=0.3):
 
         scored_nodes.sort(key=lambda x: x[1], reverse=True)
 
-        if min_score_threshold is not None:
-            scored_nodes = [(node, score) for node, score in scored_nodes
-                            if score >= min_score_threshold]
-            log_message(f"После фильтрации по порогу {min_score_threshold}: {len(scored_nodes)} узлов")
-
-        if not scored_nodes:
-            log_message("Нет узлов после фильтрации, снижаю порог")
-            scored_nodes = list(zip(nodes, scores))
-            scored_nodes.sort(key=lambda x: x[1], reverse=True)
-            min_score_threshold = scored_nodes[0][1] * 0.6
-            scored_nodes = [(node, score) for node, score in scored_nodes
-                            if score >= min_score_threshold]
-
-        selected_nodes = []
-        selected_docs = set()
-        selected_sections = set()
-
-        for node, score in scored_nodes:
-            if len(selected_nodes) >= top_k:
-                break
-
-            metadata = node.metadata if hasattr(node, 'metadata') else {}
-            doc_id = metadata.get('document_id', 'unknown')
-            section_key = f"{doc_id}_{metadata.get('section_path', metadata.get('section_id', ''))}"
-
-            # Apply diversity penalty
-            penalty = 0
-            if doc_id in selected_docs:
-                penalty += diversity_penalty * 0.5
-            if section_key in selected_sections:
-                penalty += diversity_penalty
-
-            adjusted_score = score * (1 - penalty)
-
-            # Add if still competitive
-            if not selected_nodes or adjusted_score >= selected_nodes[0][1] * 0.6:
-                selected_nodes.append((node, score))
-                selected_docs.add(doc_id)
-                selected_sections.add(section_key)
-
-        log_message(f"Выбрано {len(selected_nodes)} узлов с разнообразием")
-        log_message(f"Уникальных документов: {len(selected_docs)}, секций: {len(selected_sections)}")
-
-        if selected_nodes:
-            log_message(f"Score range: {selected_nodes[0][1]:.3f} to {selected_nodes[-1][1]:.3f}")
-
-        return [node for node, score in selected_nodes]
+        # Apply threshold
+        filtered = [(node, score) for node, score in scored_nodes if score >= min_score_threshold]
+
+        if not filtered:
+            # Lower threshold if nothing passes
+            filtered = scored_nodes[:top_k]
+
+        log_message(f"Выбрано {min(len(filtered), top_k)} узлов")
+
+        return [node for node, score in filtered[:top_k]]
 
     except Exception as e:
         log_message(f"Ошибка переранжировки: {str(e)}")
@@ -79,26 +42,28 @@ def rerank_nodes(query, nodes, reranker, top_k=20, min_score_threshold=0.5):
 
 def create_query_engine(vector_index):
     try:
+        from config import CUSTOM_PROMPT
+
         bm25_retriever = BM25Retriever.from_defaults(
             docstore=vector_index.docstore,
-            similarity_top_k=20
+            similarity_top_k=40
         )
 
         vector_retriever = VectorIndexRetriever(
            index=vector_index,
-            similarity_top_k=30,
+            similarity_top_k=40,
            similarity_cutoff=0.65
        )
 
        hybrid_retriever = QueryFusionRetriever(
            [vector_retriever, bm25_retriever],
            similarity_top_k=40,
            num_queries=1
        )
 
-        custom_prompt_template = PromptTemplate(PROMPT_SIMPLE_POISK)
+        custom_prompt_template = PromptTemplate(CUSTOM_PROMPT)
        response_synthesizer = get_response_synthesizer(
            response_mode=ResponseMode.TREE_SUMMARIZE,
            text_qa_template=custom_prompt_template
        )
table_prep.py CHANGED
@@ -32,36 +32,12 @@ def create_table_content(table_data):
 from llama_index.core.text_splitter import SentenceSplitter
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 
-def extract_table_metadata(table_text: str) -> dict:
-    words = table_text.split()
-    unique_words = set(words)
-
-    from collections import Counter
-    stopwords = {"и", "в", "на", "по", "с", "для", "из", "при", "а", "как", "или", "но", "к", "от"}
-    filtered = [w for w in words if len(w) > 3 and w.lower() not in stopwords]
-    common = Counter(filtered).most_common(15)
-    key_terms = [w for w, _ in common]
-
-    return {
-        "summary": f"Таблица содержит около {len(words)} слов и {len(unique_words)} уникальных терминов.",
-        "materials": [],  # if you want to extract material names, hook in regex or LLM here
-        "key_terms": key_terms
-    }
-
-def chunk_table_document(doc, chunk_size=None, chunk_overlap=None, rows_per_chunk=4):
-    if chunk_size is None:
-        chunk_size = CHUNK_SIZE
-    if chunk_overlap is None:
-        chunk_overlap = CHUNK_OVERLAP
+def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
+    """Simple table chunking: max 5 rows or 2000 chars per chunk"""
 
-    # Extract critical metadata from table before chunking
-    table_metadata = extract_table_metadata(doc.text)
     table_num = doc.metadata.get('table_number', 'unknown')
-    table_title = doc.metadata.get('table_title', 'unknown')
-    doc_id = doc.metadata.get('document_id', 'unknown')
-    section = doc.metadata.get('section', 'unknown')
 
-    # Parse table structure
+    # Parse table
     lines = doc.text.strip().split('\n')
 
     table_header_lines = []
@@ -80,109 +56,59 @@ def chunk_table_document(doc, max_rows_per_chunk=5, max_chunk_size=2000):
     table_header = '\n'.join(table_header_lines) + '\n'
 
     if not data_rows:
-        log_message(f"  ⚠️ Таблица {table_num}: нет строк данных, использую стандартное разбиение")
-        text_splitter = SentenceSplitter(
-            chunk_size=chunk_size,
-            chunk_overlap=chunk_overlap,
-            separator="\n"
-        )
-        text_chunks = text_splitter.split_text(doc.text)
-        log_message(f"  📊 Стандартное разбиение: {len(text_chunks)} чанков")
-    else:
-        log_message(f"  📋 Таблица {table_num}: найдено {len(data_rows)} строк данных")
-
-        header_size = len(table_header)
-        available_size = chunk_size - header_size - 300  # Reserve for enrichment
-
-        text_chunks = []
-        current_chunk_rows = []
-        current_size = 0
-
-        for row in data_rows:
-            row_size = len(row) + 1
-
-            # If single row exceeds available size, split it
-            if row_size > available_size:
-                log_message(f"  ⚠️ Строка слишком длинная ({row_size} символов), разбиваем внутри строки")
-
-                # Flush current chunk if exists
-                if current_chunk_rows:
-                    chunk_text = table_header + '\n'.join(current_chunk_rows)
-                    text_chunks.append(chunk_text)
-                    log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-                    current_chunk_rows = []
-                    current_size = 0
-
-                # Split the oversized row
-                text_splitter = SentenceSplitter(
-                    chunk_size=available_size,
-                    chunk_overlap=100,
-                    separator=" | "
-                )
-                row_parts = text_splitter.split_text(row)
-                log_message(f"  Строка разделена на {len(row_parts)} частей")
-
-                for part in row_parts:
-                    chunk_text = table_header + part
-                    text_chunks.append(chunk_text)
-                    log_message(f"  Под-чанк создан: {len(chunk_text)} символов")
-
-                continue
-
-            # Check if adding row would exceed rows_per_chunk OR size limit
-            if (len(current_chunk_rows) >= rows_per_chunk or
-                (current_size + row_size > available_size)) and current_chunk_rows:
-
-                chunk_text = table_header + '\n'.join(current_chunk_rows)
-                text_chunks.append(chunk_text)
-                log_message(f"  ✂️ Чанк создан: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-                # Overlap: keep last 1 row
-                overlap_count = min(1, len(current_chunk_rows))
-                current_chunk_rows = current_chunk_rows[-overlap_count:]
-                current_size = sum(len(r) + 1 for r in current_chunk_rows)
-
-            current_chunk_rows.append(row)
-            current_size += row_size
-
-        # Final chunk
-        if current_chunk_rows:
-            chunk_text = table_header + '\n'.join(current_chunk_rows)
-            text_chunks.append(chunk_text)
-            log_message(f"  ✂️ Последний чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
-
-        log_message(f"  📊 Таблица {table_num} разделена на {len(text_chunks)} чанков")
-
-    # Create enriched chunks (rest of the function remains the same)
-    chunked_docs = []
-    materials = table_metadata.get("materials", [])
-    key_terms = table_metadata.get("key_terms", [])
-
-    for i, chunk_text in enumerate(text_chunks):
+        # No rows, return as is
+        return [doc]
+
+    log_message(f"Таблица {table_num}: {len(data_rows)} строк")
+
+    # Simple chunking
+    chunks = []
+    current_chunk_rows = []
+    current_size = len(table_header)
+
+    for row in data_rows:
+        row_size = len(row) + 1
+
+        # Check if adding this row exceeds limits
+        if (len(current_chunk_rows) >= max_rows_per_chunk or
+            current_size + row_size > max_chunk_size) and current_chunk_rows:
+
+            # Save current chunk
+            chunk_text = table_header + '\n'.join(current_chunk_rows)
+            chunks.append(chunk_text)
+            log_message(f"  Чанк: {len(current_chunk_rows)} строк, {len(chunk_text)} символов")
+
+            # Start new chunk with overlap of 1 row
+            if len(current_chunk_rows) > 0:
+                current_chunk_rows = [current_chunk_rows[-1]]
+                current_size = len(table_header) + len(current_chunk_rows[0]) + 1
+            else:
+                current_chunk_rows = []
+                current_size = len(table_header)
+
+        current_chunk_rows.append(row)
+        current_size += row_size
+
+    # Final chunk
+    if current_chunk_rows:
+        chunk_text = table_header + '\n'.join(current_chunk_rows)
+        chunks.append(chunk_text)
+        log_message(f"  Последний чанк: {len(current_chunk_rows)} строк")
+
+    log_message(f"Таблица {table_num} разделена на {len(chunks)} чанков")
+
+    # Create documents
+    chunked_docs = []
+    for i, chunk_text in enumerate(chunks):
         chunk_metadata = doc.metadata.copy()
         chunk_metadata.update({
             "chunk_id": i,
-            "total_chunks": len(text_chunks),
+            "total_chunks": len(chunks),
             "chunk_size": len(chunk_text),
-            "is_chunked": True,
-            "materials": materials,
-            "key_terms": key_terms,
-            "table_summary": table_metadata.get("summary", "")
+            "is_chunked": True
         })
 
-        materials_str = ', '.join(materials[:10]) if materials else 'нет'
-        terms_str = ', '.join(key_terms[:10]) if key_terms else 'нет'
-
-        enriched_text = f"""[Таблица {table_num}: {table_title}]
-[Материалы в таблице: {materials_str}]
-[Ключевые термины: {terms_str}]
-
-{chunk_text}"""
-
-        chunked_doc = Document(
-            text=enriched_text,
-            metadata=chunk_metadata
-        )
+        chunked_doc = Document(text=chunk_text, metadata=chunk_metadata)
         chunked_docs.append(chunked_doc)
 
     return chunked_docs
@@ -222,8 +148,6 @@ def table_to_document(table_data, document_id=None):
         )
 
         if content_size > CHUNK_SIZE:
-            log_message(f"📊 CHUNKING: Таблица {table_num} из '{doc_id}' | "
-                        f"Размер: {content_size} > {CHUNK_SIZE} | Строк: {row_count}")
             chunked_docs = chunk_table_document(base_doc)
             log_message(f"  ✂️ Разделена на {len(chunked_docs)} чанков")
             for i, chunk_doc in enumerate(chunked_docs):
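
The new chunking policy is easy to verify in isolation: every chunk repeats the table header, holds at most max_rows_per_chunk rows or max_chunk_size characters, and starts with a one-row overlap carried over from the previous chunk. A standalone sketch of the same loop (chunk_rows is a hypothetical helper written for this note, not the module function):

    def chunk_rows(header, rows, max_rows=5, max_size=2000):
        chunks, current, size = [], [], len(header)
        for row in rows:
            row_size = len(row) + 1
            # Flush when adding this row would exceed either limit
            if current and (len(current) >= max_rows or size + row_size > max_size):
                chunks.append(header + '\n'.join(current))
                current = [current[-1]]  # one-row overlap into the next chunk
                size = len(header) + len(current[0]) + 1
            current.append(row)
            size += row_size
        if current:
            chunks.append(header + '\n'.join(current))
        return chunks

    header = 'name | density\n---- | -------\n'
    rows = [f'material_{i} | {i}.0' for i in range(12)]
    print([len(c.splitlines()) - 2 for c in chunk_rows(header, rows)])
    # -> [5, 5, 4] data rows per chunk; chunks 2 and 3 begin with the overlap row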
utils.py CHANGED
@@ -261,41 +261,20 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
     try:
         start_time = time.time()
 
-        llm = get_llm_model(current_model)
-
-        # Direct retrieval without query expansion
+        # Simple retrieval
         retrieved_nodes = query_engine.retriever.retrieve(question)
 
-        total_retrieved = len(retrieved_nodes)
-        log_message(f"RETRIEVED: {total_retrieved} nodes (before deduplication)")
+        log_message(f"RETRIEVED: {len(retrieved_nodes)} nodes")
 
         # Deduplicate
         unique_retrieved = deduplicate_nodes(retrieved_nodes)
-        duplicates_removed = total_retrieved - len(unique_retrieved)
-        log_message(f"DEDUPLICATION: {duplicates_removed} duplicates removed")
         log_message(f"UNIQUE NODES: {len(unique_retrieved)} nodes")
 
-        reranked_nodes = rerank_nodes(
-            question,
-            unique_retrieved,
-            reranker,
-            top_k=20,
-            min_score_threshold=0.5,
-            diversity_penalty=0.3
-        )
+        # Simple reranking
+        reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
 
-        formatted_context = format_context_for_llm(reranked_nodes)
-
-        enhanced_question = f"""Контекст из базы данных:
-{formatted_context}
-
-Вопрос пользователя: {question}
-
-Инструкция: Ответь на вопрос, используя ТОЛЬКО информацию из контекста выше.
-Если информации недостаточно, четко укажи это. Цитируй конкретные источники."""
-
-        response = query_engine.query(enhanced_question)
+        # Direct query without formatting
+        response = query_engine.query(question)
 
         end_time = time.time()
         processing_time = end_time - start_time
@@ -317,12 +296,9 @@ def answer_question(question, query_engine, reranker, current_model, chunks_df=None):
             metadata = node.metadata if hasattr(node, 'metadata') else {}
             chunk_info.append({
                 'document_id': metadata.get('document_id', 'unknown'),
-                'section_id': metadata.get('section_id', metadata.get('section', 'unknown')),
+                'section_id': metadata.get('section_id', 'unknown'),
                 'section_path': metadata.get('section_path', ''),
                 'section_text': metadata.get('section_text', ''),
-                'level': metadata.get('level', ''),
-                'parent_section': metadata.get('parent_section', ''),
-                'parent_title': metadata.get('parent_title', ''),
                 'type': metadata.get('type', 'text'),
                 'table_number': metadata.get('table_number', ''),
                 'image_number': metadata.get('image_number', ''),
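
After this change the answer path is linear: retrieve, deduplicate, rerank, then query the engine with the raw question (the reranked nodes appear to feed only the returned chunk metadata, since the context formatting step was removed). A schematic sketch of that flow; deduplicate_nodes below is an assumed text-based dedupe, the real module defines its own, and rerank_nodes is the sketch shown after index_retriever.py:

    import time

    def deduplicate_nodes(nodes):
        # Assumed behaviour: drop nodes whose text was already seen
        seen, unique = set(), []
        for node in nodes:
            key = node.text.strip()
            if key not in seen:
                seen.add(key)
                unique.append(node)
        return unique

    def answer_question(question, query_engine, reranker):
        start_time = time.time()
        retrieved_nodes = query_engine.retriever.retrieve(question)  # simple retrieval
        unique_retrieved = deduplicate_nodes(retrieved_nodes)
        reranked_nodes = rerank_nodes(question, unique_retrieved, reranker, top_k=20)
        response = query_engine.query(question)  # direct query, no context formatting
        return response, reranked_nodes, time.time() - start_time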