Rulga committed on
Commit
0bb77b3
·
1 Parent(s): 9949f77

Translate comments and error messages to English for consistency and clarity

Browse files
config/constants.py CHANGED
@@ -1,4 +1,4 @@
1
- # URLs для создания базы знаний
2
  URLS = [
3
  "https://status.law",
4
  "https://status.law/about",
@@ -15,35 +15,29 @@ URLS = [
15
  "https://status.law/faq"
16
  ]
17
 
18
- # Настройки для чанкирования текста
19
  CHUNK_SIZE = 500
20
  CHUNK_OVERLAP = 100
21
 
22
- # Шаблон системного сообщения
23
  DEFAULT_SYSTEM_MESSAGE = """
24
  You are a helpful and polite legal assistant at Status Law.
25
- You answer in the language in which the question was asked.
26
- Answer the question based on the context provided.
27
- If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
28
- - For all users: +32465594521 (landline phone).
29
- - For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
30
- - Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
31
- If the user has questions about specific services and their costs, suggest they visit the page https://status.law/tariffs-for-services-of-protection-against-extradition-and-international-prosecution/ for detailed information.
32
 
33
- Ask the user additional questions to understand which service to recommend and provide an estimated cost. For example, clarify their situation and needs to suggest the most appropriate options.
 
 
34
 
35
- Also, offer free consultations if they are available and suitable for the user's request.
36
- Answer professionally but in a friendly manner.
37
 
38
- Example:
39
- Q: How can I challenge the sanctions?
40
- A: To challenge the sanctions, you should consult with our legal team, who specialize in this area. Please contact us directly for detailed advice. You can fill out our contact form here: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
41
-
42
- Context: {context}
43
- Question: {question}
44
-
45
- Response Guidelines:
46
- 1. Answer in the user's language
47
- 2. Cite sources when possible
48
- 3. Offer contact options if unsure
49
- """
 
1
+ # URLs for knowledge base creation
2
  URLS = [
3
  "https://status.law",
4
  "https://status.law/about",
 
15
  "https://status.law/faq"
16
  ]
17
 
18
+ # Text chunking settings
19
  CHUNK_SIZE = 500
20
  CHUNK_OVERLAP = 100
21
 
22
+ # System message template
23
  DEFAULT_SYSTEM_MESSAGE = """
24
  You are a helpful and polite legal assistant at Status Law.
25
+ You answer in the language in which the question was asked.
26
+ Answer the question based on the context provided.
27
+ If you cannot answer based on the context, say so politely and offer to contact Status Law directly via the following channels:
28
+ - For all users: +32465594521 (landline phone).
29
+ - For English and Swedish speakers only: +46728495129 (available on WhatsApp, Telegram, Signal, IMO).
30
+ - Provide a link to the contact form: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
 
31
 
32
+ Example:
33
+ Q: How can I challenge the sanctions?
34
+ A: To challenge the sanctions, you should consult with our legal team, who specialize in this area. Please contact us directly for detailed advice. You can fill out our contact form here: [Contact Form](https://status.law/law-firm-contact-legal-protection/).
35
 
36
+ Context: {context}
37
+ Question: {question}
38
 
39
+ Response Guidelines:
40
+ 1. Answer in the user's language
41
+ 2. Cite sources when possible
42
+ 3. Offer contact options if unsure
43
+ """
 
 
 
 
 
 
 
config/settings.py CHANGED
@@ -1,31 +1,31 @@
1
  import os
2
  from dotenv import load_dotenv
3
 
4
- # Отладочная информация
5
- print("Текущая директория:", os.getcwd())
6
  env_path = os.path.join(os.getcwd(), '.env')
7
- print("Путь к .env:", env_path)
8
- print("Файл .env существует:", os.path.exists(env_path))
9
 
10
  if os.path.exists(env_path):
11
  with open(env_path, 'r') as f:
12
- print("Содержимое .env файла:", f.read())
13
 
14
- # Загрузка переменных окружения
15
  load_dotenv(verbose=True)
16
 
17
- # Пути к директориям
18
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19
  VECTOR_STORE_PATH = os.path.join(BASE_DIR, "data", "vector_store")
20
 
21
- # Настройки моделей
22
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
23
  DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
24
 
25
- # API токены
26
  HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
27
  if not HF_TOKEN:
28
- raise ValueError("HUGGINGFACE_TOKEN не найден в переменных окружения")
29
 
30
- # Настройки запросов
31
  USER_AGENT = "Status-Law-Assistant/1.0"
 
1
  import os
2
  from dotenv import load_dotenv
3
 
4
+ # Debug information
5
+ print("Current directory:", os.getcwd())
6
  env_path = os.path.join(os.getcwd(), '.env')
7
+ print("Path to .env:", env_path)
8
+ print(".env file exists:", os.path.exists(env_path))
9
 
10
  if os.path.exists(env_path):
11
  with open(env_path, 'r') as f:
12
+ print("Contents of .env file:", f.read())
13
 
14
+ # Load environment variables
15
  load_dotenv(verbose=True)
16
 
17
+ # Directory paths
18
  BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
19
  VECTOR_STORE_PATH = os.path.join(BASE_DIR, "data", "vector_store")
20
 
21
+ # Model settings
22
  EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
23
  DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
24
 
25
+ # API tokens
26
  HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
27
  if not HF_TOKEN:
28
+ raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
29
 
30
+ # Request settings
31
  USER_AGENT = "Status-Law-Assistant/1.0"
src/knowledge_base/loader.py CHANGED
@@ -5,7 +5,7 @@ from langchain_core.documents import Document
5
  from config.constants import URLS
6
 
7
  def load_documents():
8
- """Загрузка документов с веб-сайта"""
9
  documents = []
10
 
11
  headers = {
@@ -21,8 +21,8 @@ def load_documents():
21
  docs = loader.load()
22
  if docs:
23
  documents.extend(docs)
24
- print(f"Загружено {url}: {len(docs)} документов")
25
  except Exception as e:
26
- print(f"Ошибка загрузки {url}: {str(e)}")
27
 
28
- return documents
 
5
  from config.constants import URLS
6
 
7
  def load_documents():
8
+ """Load documents from website"""
9
  documents = []
10
 
11
  headers = {
 
21
  docs = loader.load()
22
  if docs:
23
  documents.extend(docs)
24
+ print(f"Loaded {url}: {len(docs)} documents")
25
  except Exception as e:
26
+ print(f"Error loading {url}: {str(e)}")
27
 
28
+ return documents
src/knowledge_base/vector_store.py CHANGED
@@ -9,37 +9,37 @@ from config.settings import VECTOR_STORE_PATH, EMBEDDING_MODEL, HF_TOKEN
9
  from config.constants import CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
  def get_embeddings():
12
- """Получение модели эмбеддингов"""
13
  return HuggingFaceEmbeddings(
14
  model_name=EMBEDDING_MODEL,
15
  model_kwargs={'device': 'cpu'}
16
  )
17
 
18
  def create_vector_store():
19
- """Создание векторного хранилища и загрузка в датасет"""
20
- # Загрузка документов
21
  documents = load_documents()
22
 
23
  if not documents:
24
- return False, "Ошибка: документы не загружены"
25
 
26
- # Разделение на чанки
27
  text_splitter = RecursiveCharacterTextSplitter(
28
  chunk_size=CHUNK_SIZE,
29
  chunk_overlap=CHUNK_OVERLAP
30
  )
31
  chunks = text_splitter.split_documents(documents)
32
 
33
- # Инициализация эмбеддингов
34
  embeddings = get_embeddings()
35
 
36
- # Создание векторного хранилища во временной директории
37
  with tempfile.TemporaryDirectory() as temp_dir:
38
  vector_store = FAISS.from_documents(chunks, embeddings)
39
- # Сохраняем во временную директорию
40
  vector_store.save_local(folder_path=temp_dir)
41
 
42
- # Копируем файлы в VECTOR_STORE_PATH для последующей загрузки
43
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
44
  for file in ["index.faiss", "index.pkl"]:
45
  shutil.copy2(
@@ -47,21 +47,21 @@ def create_vector_store():
47
  os.path.join(VECTOR_STORE_PATH, file)
48
  )
49
 
50
- # Загрузка в датасет с явной передачей токена
51
  from src.knowledge_base.dataset import DatasetManager
52
  dataset = DatasetManager(token=HF_TOKEN)
53
  success, message = dataset.upload_vector_store()
54
 
55
- # Очищаем локальные файлы после загрузки
56
  shutil.rmtree(VECTOR_STORE_PATH)
57
 
58
  if not success:
59
- return False, f"Ошибка загрузки в датасет: {message}"
60
 
61
- return True, f"База знаний создана успешно! Загружено {len(documents)} документов, создано {len(chunks)} чанков."
62
 
63
  def load_vector_store():
64
- """Загрузка векторного хранилища"""
65
  embeddings = get_embeddings()
66
 
67
  if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
@@ -75,5 +75,5 @@ def load_vector_store():
75
  )
76
  return vector_store
77
  except Exception as e:
78
- print(f"Ошибка загрузки векторного хранилища: {str(e)}")
79
  return None
 
9
  from config.constants import CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
  def get_embeddings():
12
+ """Get embeddings model"""
13
  return HuggingFaceEmbeddings(
14
  model_name=EMBEDDING_MODEL,
15
  model_kwargs={'device': 'cpu'}
16
  )
17
 
18
  def create_vector_store():
19
+ """Create vector store and upload to dataset"""
20
+ # Load documents
21
  documents = load_documents()
22
 
23
  if not documents:
24
+ return False, "Error: documents not loaded"
25
 
26
+ # Split into chunks
27
  text_splitter = RecursiveCharacterTextSplitter(
28
  chunk_size=CHUNK_SIZE,
29
  chunk_overlap=CHUNK_OVERLAP
30
  )
31
  chunks = text_splitter.split_documents(documents)
32
 
33
+ # Initialize embeddings
34
  embeddings = get_embeddings()
35
 
36
+ # Create vector store in temporary directory
37
  with tempfile.TemporaryDirectory() as temp_dir:
38
  vector_store = FAISS.from_documents(chunks, embeddings)
39
+ # Save to temporary directory
40
  vector_store.save_local(folder_path=temp_dir)
41
 
42
+ # Copy files to VECTOR_STORE_PATH for subsequent loading
43
  os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
44
  for file in ["index.faiss", "index.pkl"]:
45
  shutil.copy2(
 
47
  os.path.join(VECTOR_STORE_PATH, file)
48
  )
49
 
50
+ # Upload to dataset with explicit token passing
51
  from src.knowledge_base.dataset import DatasetManager
52
  dataset = DatasetManager(token=HF_TOKEN)
53
  success, message = dataset.upload_vector_store()
54
 
55
+ # Clean up local files after upload
56
  shutil.rmtree(VECTOR_STORE_PATH)
57
 
58
  if not success:
59
+ return False, f"Error uploading to dataset: {message}"
60
 
61
+ return True, f"Knowledge base created successfully! Loaded {len(documents)} documents, created {len(chunks)} chunks."
62
 
63
  def load_vector_store():
64
+ """Load vector store"""
65
  embeddings = get_embeddings()
66
 
67
  if not os.path.exists(os.path.join(VECTOR_STORE_PATH, "index.faiss")):
 
75
  )
76
  return vector_store
77
  except Exception as e:
78
+ print(f"Error loading vector store: {str(e)}")
79
  return None