Rulga committed on
Commit
1804ce0
·
1 Parent(s): 832f5b8

Refactor dataset initialization to use configuration settings for improved maintainability and clarity

Browse files
Files changed (2) hide show
  1. dataset.py +28 -23
  2. src/knowledge_base/dataset.py +18 -10
dataset.py CHANGED
@@ -1,31 +1,36 @@
1
  from huggingface_hub import HfApi
2
- api = HfApi()
 
 
 
 
 
 
 
3
 
4
- # Имя существующего датасета
5
- dataset_name = "Rulga/status-law-knowledge-base"
6
 
7
- # Создаем структуру с пустыми файлами
8
- try:
9
- # Создаем .gitkeep в vector_store
10
- api.upload_file(
11
- path_or_fileobj=b"", # пустой файл
12
- path_in_repo="vector_store/.gitkeep",
13
- repo_id=dataset_name,
14
- repo_type="dataset"
15
- )
16
- print("✓ Создана папка vector_store")
17
 
18
- # Создаем .gitkeep в chat_history
19
- api.upload_file(
20
- path_or_fileobj=b"",
21
- path_in_repo="chat_history/.gitkeep",
22
- repo_id=dataset_name,
23
- repo_type="dataset"
24
- )
25
- print("✓ Создана папка chat_history")
 
26
 
27
- print("\nСтруктура датасета успешно создана!")
28
 
29
  except Exception as e:
30
- print(f"Произошла ошибка: {str(e)}")
31
 
 
1
  from huggingface_hub import HfApi
2
+ from config.settings import (
3
+ DATASET_ID,
4
+ DATASET_VECTOR_STORE_PATH,
5
+ DATASET_CHAT_HISTORY_PATH,
6
+ DATASET_FINE_TUNED_PATH,
7
+ DATASET_ANNOTATIONS_PATH,
8
+ HF_TOKEN
9
+ )
10
 
11
+ api = HfApi(token=HF_TOKEN)
12
+ dataset_name = DATASET_ID
13
 
14
+ # Initialize dataset structure
15
+ directories = [
16
+ DATASET_VECTOR_STORE_PATH,
17
+ DATASET_CHAT_HISTORY_PATH,
18
+ DATASET_FINE_TUNED_PATH,
19
+ DATASET_ANNOTATIONS_PATH
20
+ ]
 
 
 
21
 
22
+ try:
23
+ for directory in directories:
24
+ api.upload_file(
25
+ path_or_fileobj=b"",
26
+ path_in_repo=f"{directory}/.gitkeep",
27
+ repo_id=dataset_name,
28
+ repo_type="dataset"
29
+ )
30
+ print(f"✓ Created directory: {directory}")
31
 
32
+ print("\nDataset structure successfully initialized!")
33
 
34
  except Exception as e:
35
+ print(f"Error occurred: {str(e)}")
36
 
src/knowledge_base/dataset.py CHANGED
@@ -9,26 +9,34 @@ from typing import Tuple, List, Dict, Any, Optional, Union
9
  from datetime import datetime
10
  from huggingface_hub import HfApi, HfFolder
11
  from langchain_community.vectorstores import FAISS
12
- from config.settings import VECTOR_STORE_PATH, HF_TOKEN, EMBEDDING_MODEL, DATASET_ID, CHAT_HISTORY_PATH
13
- from langchain_huggingface import HuggingFaceEmbeddings # новый импорт
 
 
 
 
 
 
 
 
 
 
14
  import logging
15
 
16
- # Настройка логирования
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
  class DatasetManager:
21
  def __init__(self, dataset_name: Optional[str] = None, token: Optional[str] = None):
22
- """
23
- Initialize dataset manager
24
-
25
- Args:
26
- dataset_name: Hugging Face Hub dataset name
27
- token: Hugging Face access token
28
- """
29
  self.dataset_name = dataset_name or DATASET_ID
30
  self.token = token if token else HF_TOKEN
31
  self.api = HfApi(token=self.token)
 
 
 
 
 
 
32
 
33
  def init_dataset_structure(self) -> Tuple[bool, str]:
34
  """
 
9
  from datetime import datetime
10
  from huggingface_hub import HfApi, HfFolder
11
  from langchain_community.vectorstores import FAISS
12
+ from config.settings import (
13
+ VECTOR_STORE_PATH,
14
+ HF_TOKEN,
15
+ EMBEDDING_MODEL,
16
+ DATASET_ID,
17
+ CHAT_HISTORY_PATH,
18
+ DATASET_CHAT_HISTORY_PATH,
19
+ DATASET_VECTOR_STORE_PATH,
20
+ DATASET_FINE_TUNED_PATH,
21
+ DATASET_ANNOTATIONS_PATH
22
+ )
23
+ from langchain_huggingface import HuggingFaceEmbeddings
24
  import logging
25
 
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
  class DatasetManager:
30
  def __init__(self, dataset_name: Optional[str] = None, token: Optional[str] = None):
 
 
 
 
 
 
 
31
  self.dataset_name = dataset_name or DATASET_ID
32
  self.token = token if token else HF_TOKEN
33
  self.api = HfApi(token=self.token)
34
+
35
+ # Use paths from settings
36
+ self.vector_store_path = DATASET_VECTOR_STORE_PATH
37
+ self.chat_history_path = DATASET_CHAT_HISTORY_PATH
38
+ self.fine_tuned_path = DATASET_FINE_TUNED_PATH
39
+ self.annotations_path = DATASET_ANNOTATIONS_PATH
40
 
41
  def init_dataset_structure(self) -> Tuple[bool, str]:
42
  """