Spaces:

Rulga
/

status-law-gbot

Running

App Files Community

Rulga committed on Mar 26

Commit

0bb77b3

1 Parent(s): 9949f77

Translate comments and error messages to English for consistency and clarity

Browse files

Files changed (4) hide show

config/constants.py +19 -25
config/settings.py +11 -11
src/knowledge_base/loader.py +4 -4
src/knowledge_base/vector_store.py +15 -15

config/constants.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# URLs для создания базы знаний
 URLS = [
      "https://status.law",
     "https://status.law/about",
@@ -15,35 +15,29 @@ URLS = [
     "https://status.law/faq"
 ]
-# Настройки для чанкирования текста
 CHUNK_SIZE = 500
 CHUNK_OVERLAP = 100
-# Шаблон системного сообщения
 DEFAULT_SYSTEM_MESSAGE = """
 You are a helpful and polite legal assistant at Status Law.
-            You answer in the language in which the question was asked.
-            Answer the question based on the context provided.
-            If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
-            - For all users: +32465594521 (landline phone).
-            - For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
-            - Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
-            If the user has questions about specific services and their costs, suggest they visit the page https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/ for detailed information.
-            Ask the user additional questions to understand which service to recommend and provide an estimated cost. For example, clarify their situation and needs to suggest the most appropriate options.
-            Also, offer free consultations if they are available and suitable for the user's request.
-            Answer professionally but in a friendly manner.
-            Example:
-            Q: How can I challenge the sanctions?
-            A: To challenge the sanctions, you should consult with our legal team, who specialize in this area. Please contact us directly for detailed advice. You can fill out our contact form here: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
-            Context: {context}
-            Question: {question}
-            Response Guidelines:
-            1. Answer in the user's language
-            2. Cite sources when possible
-            3. Offer contact options if unsure
-"""

+# URLs for knowledge base creation
 URLS = [
      "https://status.law",
     "https://status.law/about",
     "https://status.law/faq"
 ]
+# Text chunking settings
 CHUNK_SIZE = 500
 CHUNK_OVERLAP = 100
+# System message template
 DEFAULT_SYSTEM_MESSAGE = """
 You are a helpful and polite legal assistant at Status Law.
+You answer in the language in which the question was asked.
+Answer the question based on the context provided.
+If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
+- For all users: +32465594521 (landline phone).
+- For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
+- Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
+Example:
+Q: How can I challenge the sanctions?
+A: To challenge the sanctions, you should consult with our legal team, who specialize in this area. Please contact us directly for detailed advice. You can fill out our contact form here: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
+Context: {context}
+Question: {question}
+Response Guidelines:
+1. Answer in the user's language
+2. Cite sources when possible
+3. Offer contact options if unsure
+"""

config/settings.py CHANGED Viewed

@@ -1,31 +1,31 @@
 import os
 from dotenv import load_dotenv
-# Отладочная информация
-print("Текущая директория:", os.getcwd())
 env_path = os.path.join(os.getcwd(), '.env')
-print("Путь к .env:", env_path)
-print("Файл .env существует:", os.path.exists(env_path))
 if os.path.exists(env_path):
     with open(env_path, 'r') as f:
-        print("Содержимое .env файла:", f.read())
-# Загрузка переменных окружения
 load_dotenv(verbose=True)
-# Пути к директориям
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 VECTOR_STORE_PATH = os.path.join(BASE_DIR, "data", "vector_store")
-# Настройки моделей
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
-# API токены
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 if not HF_TOKEN:
-    raise ValueError("HUGGINGFACE_TOKEN не найден в переменных окружения")
-# Настройки запросов
 USER_AGENT = "Status-Law-Assistant/1.0"

 import os
 from dotenv import load_dotenv
+# Debug information
+print("Current directory:", os.getcwd())
 env_path = os.path.join(os.getcwd(), '.env')
+print("Path to .env:", env_path)
+print(".env file exists:", os.path.exists(env_path))
 if os.path.exists(env_path):
     with open(env_path, 'r') as f:
+        print("Contents of .env file:", f.read())
+# Load environment variables
 load_dotenv(verbose=True)
+# Directory paths
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 VECTOR_STORE_PATH = os.path.join(BASE_DIR, "data", "vector_store")
+# Model settings
 EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
+# API tokens
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 if not HF_TOKEN:
+    raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
+# Request settings
 USER_AGENT = "Status-Law-Assistant/1.0"

src/knowledge_base/loader.py CHANGED Viewed

@@ -5,7 +5,7 @@ from langchain_core.documents import Document
 from config.constants import URLS
 def load_documents():
-    """Загрузка документов с веб-сайта"""
     documents = []
     headers = {
@@ -21,8 +21,8 @@ def load_documents():
             docs = loader.load()
             if docs:
                 documents.extend(docs)
-                print(f"Загружено {url}: {len(docs)} документов")
         except Exception as e:
-            print(f"Ошибка загрузки {url}: {str(e)}")
-    return documents

 from config.constants import URLS
 def load_documents():
+    """Load documents from website"""
     documents = []
     headers = {
             docs = loader.load()
             if docs:
                 documents.extend(docs)
+                print(f"Loaded {url}: {len(docs)} documents")
         except Exception as e:
+            print(f"Error loading {url}: {str(e)}")
+    return documents

src/knowledge_base/vector_store.py CHANGED Viewed

@@ -9,37 +9,37 @@ from config.settings import VECTOR_STORE_PATH, EMBEDDING_MODEL, HF_TOKEN
 from config.constants import CHUNK_SIZE, CHUNK_OVERLAP
 def get_embeddings():
-    """Получение модели эмбеддингов"""
     return HuggingFaceEmbeddings(
         model_name=EMBEDDING_MODEL,
         model_kwargs={'device': 'cpu'}
     )
 def create_vector_store():
-    """Создание векторного хранилища и загрузка в датасет"""
-    # Загрузка документов
     documents = load_documents()
     if not documents:
-        return False, "Ошибка: документы не загружены"
-    # Разделение на чанки
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE,
         chunk_overlap=CHUNK_OVERLAP
     )
     chunks = text_splitter.split_documents(documents)
-    # Инициализация эмбеддингов
     embeddings = get_embeddings()
-    # Создание векторного хранилища во временной директории
     with tempfile.TemporaryDirectory() as temp_dir:
         vector_store = FAISS.from_documents(chunks, embeddings)
-        # Сохраняем во временную директорию
         vector_store.save_local(folder_path=temp_dir)
-        # Копируем файлы в VECTOR_STORE_PATH для последующей загрузки
         os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
         for file in ["index.faiss", "index.pkl"]:
             shutil.copy2(
@@ -47,21 +47,21 @@ def create_vector_store():
                 os.path.join(VECTOR_STORE_PATH, file)
             )
-        # Загрузка в датасет с явной передачей токена
         from src.knowledge_base.dataset import DatasetManager
         dataset = DatasetManager(token=HF_TOKEN)
         success, message = dataset.upload_vector_store()
-        # Очищаем локальные файлы после загрузки
         shutil.rmtree(VECTOR_STORE_PATH)
         if not success:
-            return False, f"Ошибка загрузки в датасет: {message}"
-    return True, f"База знаний создана успешно! Загружено {len(documents)} документов, создано {len(chunks)} чанков."
 def load_vector_store():
-    """Загрузка векторного хранилища"""
     embeddings = get_embeddings()
     if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
@@ -75,5 +75,5 @@ def load_vector_store():
         )
         return vector_store
     except Exception as e:
-        print(f"Ошибка загрузки векторного хранилища: {str(e)}")
         return None

 from config.constants import CHUNK_SIZE, CHUNK_OVERLAP
 def get_embeddings():
+    """Get embeddings model"""
     return HuggingFaceEmbeddings(
         model_name=EMBEDDING_MODEL,
         model_kwargs={'device': 'cpu'}
     )
 def create_vector_store():
+    """Create vector store and upload to dataset"""
+    # Load documents
     documents = load_documents()
     if not documents:
+        return False, "Error: documents not loaded"
+    # Split into chunks
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=CHUNK_SIZE,
         chunk_overlap=CHUNK_OVERLAP
     )
     chunks = text_splitter.split_documents(documents)
+    # Initialize embeddings
     embeddings = get_embeddings()
+    # Create vector store in temporary directory
     with tempfile.TemporaryDirectory() as temp_dir:
         vector_store = FAISS.from_documents(chunks, embeddings)
+        # Save to temporary directory
         vector_store.save_local(folder_path=temp_dir)
+        # Copy files to VECTOR_STORE_PATH for subsequent loading
         os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
         for file in ["index.faiss", "index.pkl"]:
             shutil.copy2(
                 os.path.join(VECTOR_STORE_PATH, file)
             )
+        # Upload to dataset with explicit token passing
         from src.knowledge_base.dataset import DatasetManager
         dataset = DatasetManager(token=HF_TOKEN)
         success, message = dataset.upload_vector_store()
+        # Clean up local files after upload
         shutil.rmtree(VECTOR_STORE_PATH)
         if not success:
+            return False, f"Error uploading to dataset: {message}"
+    return True, f"Knowledge base created successfully! Loaded {len(documents)} documents, created {len(chunks)} chunks."
 def load_vector_store():
+    """Load vector store"""
     embeddings = get_embeddings()
     if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
         )
         return vector_store
     except Exception as e:
+        print(f"Error loading vector store: {str(e)}")
         return None