# Spaces: Lana49 / engineering-ai-assistant (Running)
# File size: 7,851 Bytes
# Revision: 96aaef2


import pickle
import re
from pathlib import Path
from typing import Dict, List, Optional

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


class QASystem:
    """Document-grounded question answering built on sentence embeddings.

    Documents are split into overlapping text chunks, embedded with a
    SentenceTransformer model and stored in a FAISS inner-product index.
    Because all vectors are L2-normalised, the inner product equals cosine
    similarity.

    NOTE: the E5 model family expects every input to start with ``"query: "``
    or ``"passage: "``. Indexes that were saved before these prefixes were
    applied should be rebuilt with :meth:`index_documents`.
    """

    # Number of fragments returned when the caller does not pass ``top_k``.
    DEFAULT_TOP_K = 5
    # How many of the retrieved fragments are shown / passed to the LLM.
    MAX_ANSWER_SOURCES = 3
    # Input prefixes required by the E5 embedding models.
    QUERY_PREFIX = "query: "
    PASSAGE_PREFIX = "passage: "
    # File types that index_documents() knows how to read.
    SUPPORTED_SUFFIXES = ('.docx', '.txt')
    # Split on whitespace that follows sentence-ending punctuation; the
    # look-behind keeps the punctuation attached to its sentence.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, use_llm: bool = False):
        """Load the embedding model and, optionally, the LLM engine.

        Args:
            use_llm: When True, try to import and instantiate
                ``core.llm_engine.LLMEngine``. Any failure is reported and
                the system falls back to plain retrieval answers.
        """
        print("🔄 Загрузка модели эмбеддингов...")
        self.model = SentenceTransformer('intfloat/multilingual-e5-small')
        self.index = None
        self.chunks = []
        # Initial value only; replaced by the real embedding width as soon
        # as an index is built or loaded.
        self.dimension = 384
        self.is_ready = False
        self.top_k = self.DEFAULT_TOP_K
        self.use_llm = use_llm
        self.llm_engine = None

        if use_llm:
            # Deliberately best-effort: a missing or broken LLM backend must
            # not prevent retrieval-only operation.
            try:
                from core.llm_engine import LLMEngine
                self.llm_engine = LLMEngine()
                print("✅ LLM Engine готов")
            except Exception as e:
                print(f"⚠️ LLM Engine не загружен: {e}")
                self.use_llm = False

    @staticmethod
    def _make_chunk(doc_name: str, chunk_id: int, text: str) -> Dict:
        """Build the chunk record stored alongside each index vector."""
        return {
            'id': f"{doc_name}_{chunk_id}",
            'text': text.strip(),
            'doc_name': doc_name,
            'chunk_id': chunk_id,
        }

    @staticmethod
    def _normalize_rows(vectors):
        """L2-normalise each row; all-zero rows are left as zeros, not NaN."""
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, 1e-12)

    @staticmethod
    def _read_docx(file_path: Path) -> str:
        """Read a .docx file via the project parser, else python-docx."""
        try:
            from core.parser import read_docx
        except ImportError:
            from docx import Document
            doc = Document(file_path)
            return '\n'.join(p.text for p in doc.paragraphs)
        return read_docx(file_path)

    def chunk_text(self, text: str, doc_name: str, chunk_size=500, overlap=100):
        """Split text into sentence-aligned chunks.

        Sentences are accumulated until adding the next one would reach
        ``chunk_size`` characters; the chunk is then emitted and the next
        chunk starts with the trailing words of the previous one.

        Args:
            text: Raw document text.
            doc_name: Document identifier used in chunk ids.
            chunk_size: Soft character limit per chunk. A single sentence
                longer than this is kept whole.
            overlap: Approximate overlap in characters, converted to words
                at one word per 10 characters (rounded up). ``0`` disables
                overlap.

        Returns:
            A list of dicts with keys ``id``, ``text``, ``doc_name`` and
            ``chunk_id``. Empty or whitespace-only text yields ``[]``.
        """
        sentences = [
            part.strip() for part in self._SENTENCE_BOUNDARY.split(text)
        ]
        sentences = [s for s in sentences if s]

        # Ceiling division, written explicitly.
        overlap_words = -(-overlap // 10) if overlap > 0 else 0

        chunks = []
        current = ""
        for sentence in sentences:
            if current and len(current) + len(sentence) >= chunk_size:
                chunks.append(self._make_chunk(doc_name, len(chunks), current))
                # Always restart the buffer; without overlap it starts empty
                # so earlier text is not repeated in later chunks.
                tail = current.split()[-overlap_words:] if overlap_words else []
                current = " ".join(tail) + " " if tail else ""
            current += sentence + " "

        if current.strip():
            chunks.append(self._make_chunk(doc_name, len(chunks), current))

        return chunks

    def index_documents(self, documents_dir: Path):
        """Index every supported document found directly in a directory.

        Files are read, chunked, embedded and stored in a fresh FAISS index.
        A file that cannot be read is reported and skipped.

        Args:
            documents_dir: Directory containing ``.docx`` / ``.txt`` files.

        Returns:
            True when an index was built, False when no chunks were found
            (the previous index, if any, is left untouched).
        """
        print(f"📁 Индексация документов из {documents_dir}")

        all_chunks = []

        # Sorted so that chunk order (and thus index ids) is reproducible.
        for file_path in sorted(documents_dir.glob("*")):
            suffix = file_path.suffix.lower()
            if suffix not in self.SUPPORTED_SUFFIXES:
                continue
            try:
                if suffix == '.docx':
                    text = self._read_docx(file_path)
                else:
                    text = file_path.read_text(encoding='utf-8')
                chunks = self.chunk_text(text, file_path.stem)
            except Exception as e:
                # Best-effort: one unreadable file must not abort the run.
                print(f"  ❌ {file_path.name}: {e}")
                continue
            all_chunks.extend(chunks)
            print(f"  ✅ {file_path.name}: {len(chunks)} чанков")

        if not all_chunks:
            print("❌ Нет документов для индексации")
            return False

        print(f"📊 Генерация эмбеддингов для {len(all_chunks)} чанков...")
        texts = [self.PASSAGE_PREFIX + chunk['text'] for chunk in all_chunks]
        embeddings = np.asarray(
            self.model.encode(texts, show_progress_bar=True)
        )

        # Normalise so that inner product equals cosine similarity.
        embeddings = self._normalize_rows(embeddings)
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        # Take the width from the data so a different model still works.
        self.dimension = int(embeddings.shape[1])
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        self.chunks = all_chunks

        print(f"✅ Индекс создан: {self.index.ntotal} векторов")
        self.is_ready = True
        return True

    def search(
        self,
        query: str,
        top_k: Optional[int] = None,
        min_score: float = 0.5,
    ) -> List[Dict]:
        """Return the chunks most similar to a query.

        Args:
            query: The user question.
            top_k: Maximum number of results; defaults to the value set via
                :meth:`set_top_k` (initially 5).
            min_score: Results must score strictly above this similarity.

        Returns:
            A list of dicts with keys ``text``, ``doc_name`` and ``score``,
            best match first. Empty when no index is ready.
        """
        if not self.is_ready or self.index is None:
            return []

        limit = self.top_k if top_k is None else top_k
        # Never ask FAISS for more neighbours than there are vectors.
        limit = min(limit, self.index.ntotal)
        if limit < 1:
            return []

        query_emb = np.asarray(self.model.encode([self.QUERY_PREFIX + query]))
        query_emb = self._normalize_rows(query_emb)
        query_emb = np.ascontiguousarray(query_emb, dtype=np.float32)

        scores, indices = self.index.search(query_emb, limit)

        # FAISS pads missing neighbours with index -1; skip those.
        return [
            {
                'text': self.chunks[idx]['text'],
                'doc_name': self.chunks[idx]['doc_name'],
                'score': float(score),
            }
            for score, idx in zip(scores[0], indices[0])
            if idx >= 0 and score > min_score
        ]

    def answer(self, question: str, top_k: Optional[int] = None) -> Dict:
        """Answer a question from the retrieved chunks.

        When the LLM engine is enabled, its ``answer_with_context`` result is
        returned as-is; if it raises, a plain listing of the top fragments is
        returned instead.

        Args:
            question: The user question.
            top_k: Number of chunks to retrieve; defaults as in
                :meth:`search`.

        Returns:
            In the non-LLM path, a dict with keys ``question``, ``answer``
            and ``sources``.
        """
        relevant = self.search(question, top_k)

        if not relevant:
            return {
                'question': question,
                'answer': "Извините, не нашел информацию по вашему вопросу в документации.",
                'sources': []
            }

        top_chunks = relevant[:self.MAX_ANSWER_SOURCES]

        if self.use_llm and self.llm_engine:
            context = "".join(
                f"\n--- {chunk['doc_name']} ---\n{chunk['text'][:500]}\n"
                for chunk in top_chunks
            )
            try:
                return self.llm_engine.answer_with_context(
                    question, context, top_chunks
                )
            except Exception as e:
                # Fall through to the plain answer below.
                print(f"❌ Ошибка LLM: {e}")

        parts = ["**По вашему вопросу найдена информация:**\n\n"]
        parts.extend(
            f"**{i}. {chunk['doc_name']}** (релевантность: {chunk['score']:.2f})\n"
            f"{chunk['text'][:400]}\n\n"
            for i, chunk in enumerate(top_chunks, 1)
        )

        return {
            'question': question,
            'answer': "".join(parts),
            'sources': relevant
        }

    def set_top_k(self, top_k: int):
        """Set the default number of fragments returned by search/answer.

        Raises:
            ValueError: If ``top_k`` is smaller than 1.
        """
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")
        self.top_k = top_k

    def save_index(self, path: Path):
        """Write the FAISS index and chunk metadata into a directory.

        Args:
            path: Target directory; created if it does not exist.

        Returns:
            True when the index was written, False when there is no index
            to save.
        """
        if not self.is_ready:
            return False

        path.mkdir(parents=True, exist_ok=True)
        faiss.write_index(self.index, str(path / "index.faiss"))
        with open(path / "chunks.pkl", 'wb') as f:
            pickle.dump(self.chunks, f)
        print(f"✅ Индекс сохранен в {path}")
        return True

    def load_index(self, path: Path):
        """Load a previously saved index and its chunk metadata.

        State is only replaced after both files were read and found
        consistent, so a failed load leaves the current index usable.

        Args:
            path: Directory containing ``index.faiss`` and ``chunks.pkl``.

        Returns:
            True on success, False when a file is missing or the index and
            chunk list disagree in size.
        """
        index_path = path / "index.faiss"
        chunks_path = path / "chunks.pkl"

        if not index_path.exists() or not chunks_path.exists():
            return False

        index = faiss.read_index(str(index_path))
        # SECURITY: pickle can execute arbitrary code while loading. Only
        # load chunk files written by save_index() from a trusted location.
        with open(chunks_path, 'rb') as f:
            chunks = pickle.load(f)

        # A mismatch would make search() index past the end of the list.
        if index.ntotal != len(chunks):
            print(
                f"❌ Индекс не согласован: {index.ntotal} векторов, "
                f"{len(chunks)} чанков"
            )
            return False

        self.index = index
        self.chunks = chunks
        self.dimension = int(index.d)
        self.is_ready = True
        print(f"✅ Индекс загружен: {self.index.ntotal} векторов")
        return True