Rulga committed on
Commit
c27741d
·
1 Parent(s): 56a8632

Refactor dataset configuration; streamline path definitions and ensure necessary directories exist

Browse files
Files changed (3) hide show
  1. app.py +2 -3
  2. config/settings.py +17 -9
  3. src/analytics/chat_evaluator.py +30 -12
app.py CHANGED
@@ -56,8 +56,7 @@ context_store = {}
56
  fallback_model_attempted = False
57
  chat_evaluator = ChatEvaluator(
58
  hf_token=HF_TOKEN,
59
- dataset_id=DATASET_ID,
60
- chat_history_path=CHAT_HISTORY_PATH
61
  )
62
 
63
  logger.info(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
@@ -1090,4 +1089,4 @@ if __name__ == "__main__":
1090
  if not load_vector_store():
1091
  logger.warning("Knowledge base not found. Please create it through the interface.")
1092
 
1093
- demo.launch(share=True)
 
56
  fallback_model_attempted = False
57
  chat_evaluator = ChatEvaluator(
58
  hf_token=HF_TOKEN,
59
+ dataset_id=DATASET_ID
 
60
  )
61
 
62
  logger.info(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
 
1089
  if not load_vector_store():
1090
  logger.warning("Knowledge base not found. Please create it through the interface.")
1091
 
1092
+ demo.launch(share=True)
config/settings.py CHANGED
@@ -20,9 +20,23 @@ API_CONFIG = {
20
 
21
  # Dataset configuration
22
  DATASET_ID = "Rulga/status-law-knowledge-base"
23
- CHAT_HISTORY_PATH = "chat_history"
24
- VECTOR_STORE_PATH = "vector_store"
25
- FINE_TUNED_PATH = "fine_tuned_models"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # Paths configuration
28
  MODEL_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
@@ -32,12 +46,6 @@ TRAINING_OUTPUT_DIR = os.path.join(CHAT_HISTORY_PATH, FINE_TUNED_PATH)
32
  os.makedirs(MODEL_PATH, exist_ok=True)
33
  os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)
34
 
35
- # Dataset paths
36
- DATASET_CHAT_HISTORY_PATH = f"{DATASET_ID}/chat_history"
37
- DATASET_VECTOR_STORE_PATH = f"{DATASET_ID}/vector_store"
38
- DATASET_FINE_TUNED_PATH = f"{DATASET_ID}/fine_tuned_models"
39
- MODELS_REGISTRY_PATH = os.path.join(CHAT_HISTORY_PATH, "models_registry.json")
40
-
41
  # Models configuration with detailed information
42
  MODELS = {
43
  "zephyr-7b": {
 
20
 
21
  # Dataset configuration
22
  DATASET_ID = "Rulga/status-law-knowledge-base"
23
+
24
+ # Dataset paths
25
+ DATASET_CHAT_HISTORY_PATH = "chat_history"
26
+ DATASET_VECTOR_STORE_PATH = "vector_store"
27
+ DATASET_FINE_TUNED_PATH = "fine_tuned_models"
28
+ DATASET_ANNOTATIONS_PATH = "annotations"
29
+
30
+ # Local paths (temporary storage)
31
+ CHAT_HISTORY_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "chat_history")
32
+ VECTOR_STORE_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
33
+ FINE_TUNED_PATH = os.path.join(CHAT_HISTORY_PATH, "fine_tuned_models")
34
+ MODELS_REGISTRY_PATH = os.path.join(CHAT_HISTORY_PATH, "models_registry.json")
35
+
36
+ # Create necessary directories if they don't exist
37
+ os.makedirs(CHAT_HISTORY_PATH, exist_ok=True)
38
+ os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
39
+ os.makedirs(FINE_TUNED_PATH, exist_ok=True)
40
 
41
  # Paths configuration
42
  MODEL_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
 
46
  os.makedirs(MODEL_PATH, exist_ok=True)
47
  os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)
48
 
 
 
 
 
 
 
49
  # Models configuration with detailed information
50
  MODELS = {
51
  "zephyr-7b": {
src/analytics/chat_evaluator.py CHANGED
@@ -14,6 +14,13 @@ import logging
14
 
15
  logger = logging.getLogger(__name__)
16
 
 
 
 
 
 
 
 
17
  class ChatEvaluator:
18
  def __init__(self, hf_token: str = None, dataset_id: str = None):
19
  """
@@ -23,26 +30,36 @@ class ChatEvaluator:
23
  hf_token: Hugging Face token
24
  dataset_id: Dataset ID on Hugging Face
25
  """
26
- self.hf_token = hf_token or os.getenv('HF_TOKEN')
27
- self.dataset_id = dataset_id or "Rulga/status-law-knowledge-base"
28
  self.api = HfApi(token=self.hf_token)
29
 
30
- # Path for annotations in the dataset
31
- self.annotations_path = "annotations"
 
32
 
33
- # Ensure annotations directory exists in dataset
34
  try:
35
- self._ensure_annotations_dir()
36
  except Exception as e:
37
- logger.error(f"Failed to ensure annotations directory: {e}")
38
 
39
- def _ensure_annotations_dir(self):
40
- """Ensure annotations directory exists in the dataset"""
41
  try:
42
- # Check if directory exists
43
  files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
 
 
 
 
 
 
 
 
 
 
 
44
  if self.annotations_path not in files:
45
- # Create empty file to initialize directory
46
  self.api.upload_file(
47
  path_or_fileobj=io.StringIO(""),
48
  path_in_repo=f"{self.annotations_path}/.gitkeep",
@@ -50,7 +67,7 @@ class ChatEvaluator:
50
  repo_type="dataset"
51
  )
52
  except Exception as e:
53
- logger.error(f"Error ensuring annotations directory: {e}")
54
  raise
55
 
56
  def get_chat_history(self) -> List[Dict[str, Any]]:
@@ -336,3 +353,4 @@ class ChatEvaluator:
336
 
337
 
338
 
 
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
+ from config.settings import (
18
+ DATASET_ID,
19
+ DATASET_CHAT_HISTORY_PATH,
20
+ DATASET_ANNOTATIONS_PATH,
21
+ HF_TOKEN
22
+ )
23
+
24
  class ChatEvaluator:
25
  def __init__(self, hf_token: str = None, dataset_id: str = None):
26
  """
 
30
  hf_token: Hugging Face token
31
  dataset_id: Dataset ID on Hugging Face
32
  """
33
+ self.hf_token = hf_token or HF_TOKEN
34
+ self.dataset_id = dataset_id or DATASET_ID
35
  self.api = HfApi(token=self.hf_token)
36
 
37
+ # Use dataset paths
38
+ self.chat_history_path = DATASET_CHAT_HISTORY_PATH
39
+ self.annotations_path = DATASET_ANNOTATIONS_PATH
40
 
41
+ # Ensure directories exist in dataset
42
  try:
43
+ self._ensure_dataset_structure()
44
  except Exception as e:
45
+ logger.error(f"Failed to ensure dataset structure: {e}")
46
 
47
+ def _ensure_dataset_structure(self):
48
+ """Ensure required directories exist in dataset"""
49
  try:
 
50
  files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
51
+
52
+ # Check and create chat history directory
53
+ if self.chat_history_path not in files:
54
+ self.api.upload_file(
55
+ path_or_fileobj=io.StringIO(""),
56
+ path_in_repo=f"{self.chat_history_path}/.gitkeep",
57
+ repo_id=self.dataset_id,
58
+ repo_type="dataset"
59
+ )
60
+
61
+ # Check and create annotations directory
62
  if self.annotations_path not in files:
 
63
  self.api.upload_file(
64
  path_or_fileobj=io.StringIO(""),
65
  path_in_repo=f"{self.annotations_path}/.gitkeep",
 
67
  repo_type="dataset"
68
  )
69
  except Exception as e:
70
+ logger.error(f"Error ensuring dataset structure: {e}")
71
  raise
72
 
73
  def get_chat_history(self) -> List[Dict[str, Any]]:
 
353
 
354
 
355
 
356
+