Spaces:

Rulga
/

status-law-gbot

Running

App Files Files Community

Rulga committed on Apr 4, 2025

Commit

c27741d

1 Parent(s): 56a8632

Refactor dataset configuration; streamline path definitions and ensure necessary directories exist

Browse files

Files changed (3) hide show

app.py +2 -3
config/settings.py +17 -9
src/analytics/chat_evaluator.py +30 -12

app.py CHANGED Viewed

@@ -56,8 +56,7 @@ context_store = {}
 fallback_model_attempted = False
 chat_evaluator = ChatEvaluator(
     hf_token=HF_TOKEN,
-    dataset_id=DATASET_ID,
-    chat_history_path=CHAT_HISTORY_PATH
 )
 logger.info(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
@@ -1090,4 +1089,4 @@ if __name__ == "__main__":
     if not load_vector_store():
         logger.warning("Knowledge base not found. Please create it through the interface.")
-    demo.launch(share=True)

 fallback_model_attempted = False
 chat_evaluator = ChatEvaluator(
     hf_token=HF_TOKEN,
+    dataset_id=DATASET_ID
 )
 logger.info(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
     if not load_vector_store():
         logger.warning("Knowledge base not found. Please create it through the interface.")
+    demo.launch(share=True)

config/settings.py CHANGED Viewed

@@ -20,9 +20,23 @@ API_CONFIG = {
 # Dataset configuration
 DATASET_ID = "Rulga/status-law-knowledge-base"
-CHAT_HISTORY_PATH = "chat_history"
-VECTOR_STORE_PATH = "vector_store"
-FINE_TUNED_PATH = "fine_tuned_models"
 # Paths configuration
 MODEL_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
@@ -32,12 +46,6 @@ TRAINING_OUTPUT_DIR = os.path.join(CHAT_HISTORY_PATH, FINE_TUNED_PATH)
 os.makedirs(MODEL_PATH, exist_ok=True)
 os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)
-# Dataset paths
-DATASET_CHAT_HISTORY_PATH = f"{DATASET_ID}/chat_history"
-DATASET_VECTOR_STORE_PATH = f"{DATASET_ID}/vector_store"
-DATASET_FINE_TUNED_PATH = f"{DATASET_ID}/fine_tuned_models"
-MODELS_REGISTRY_PATH = os.path.join(CHAT_HISTORY_PATH, "models_registry.json")
 # Models configuration with detailed information
 MODELS = {
     "zephyr-7b": {

 # Dataset configuration
 DATASET_ID = "Rulga/status-law-knowledge-base"
+# Dataset paths
+DATASET_CHAT_HISTORY_PATH = "chat_history"
+DATASET_VECTOR_STORE_PATH = "vector_store"
+DATASET_FINE_TUNED_PATH = "fine_tuned_models"
+DATASET_ANNOTATIONS_PATH = "annotations"
+# Local paths (temporary storage)
+CHAT_HISTORY_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "chat_history")
+VECTOR_STORE_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
+FINE_TUNED_PATH = os.path.join(CHAT_HISTORY_PATH, "fine_tuned_models")
+MODELS_REGISTRY_PATH = os.path.join(CHAT_HISTORY_PATH, "models_registry.json")
+# Create necessary directories if they don't exist
+os.makedirs(CHAT_HISTORY_PATH, exist_ok=True)
+os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
+os.makedirs(FINE_TUNED_PATH, exist_ok=True)
 # Paths configuration
 MODEL_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
 os.makedirs(MODEL_PATH, exist_ok=True)
 os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)
 # Models configuration with detailed information
 MODELS = {
     "zephyr-7b": {

src/analytics/chat_evaluator.py CHANGED Viewed

@@ -14,6 +14,13 @@ import logging
 logger = logging.getLogger(__name__)
 class ChatEvaluator:
     def __init__(self, hf_token: str = None, dataset_id: str = None):
         """
@@ -23,26 +30,36 @@ class ChatEvaluator:
             hf_token: Hugging Face token
             dataset_id: Dataset ID on Hugging Face
         """
-        self.hf_token = hf_token or os.getenv('HF_TOKEN')
-        self.dataset_id = dataset_id or "Rulga/status-law-knowledge-base"
         self.api = HfApi(token=self.hf_token)
-        # Path for annotations in the dataset
-        self.annotations_path = "annotations"
-        # Ensure annotations directory exists in dataset
         try:
-            self._ensure_annotations_dir()
         except Exception as e:
-            logger.error(f"Failed to ensure annotations directory: {e}")
-    def _ensure_annotations_dir(self):
-        """Ensure annotations directory exists in the dataset"""
         try:
-            # Check if directory exists
             files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
             if self.annotations_path not in files:
-                # Create empty file to initialize directory
                 self.api.upload_file(
                     path_or_fileobj=io.StringIO(""),
                     path_in_repo=f"{self.annotations_path}/.gitkeep",
@@ -50,7 +67,7 @@ class ChatEvaluator:
                     repo_type="dataset"
                 )
         except Exception as e:
-            logger.error(f"Error ensuring annotations directory: {e}")
             raise
     def get_chat_history(self) -> List[Dict[str, Any]]:
@@ -336,3 +353,4 @@ class ChatEvaluator:

 logger = logging.getLogger(__name__)
+from config.settings import (
+    DATASET_ID,
+    DATASET_CHAT_HISTORY_PATH,
+    DATASET_ANNOTATIONS_PATH,
+    HF_TOKEN
+)
 class ChatEvaluator:
     def __init__(self, hf_token: str = None, dataset_id: str = None):
         """
             hf_token: Hugging Face token
             dataset_id: Dataset ID on Hugging Face
         """
+        self.hf_token = hf_token or HF_TOKEN
+        self.dataset_id = dataset_id or DATASET_ID
         self.api = HfApi(token=self.hf_token)
+        # Use dataset paths
+        self.chat_history_path = DATASET_CHAT_HISTORY_PATH
+        self.annotations_path = DATASET_ANNOTATIONS_PATH
+        # Ensure directories exist in dataset
         try:
+            self._ensure_dataset_structure()
         except Exception as e:
+            logger.error(f"Failed to ensure dataset structure: {e}")
+    def _ensure_dataset_structure(self):
+        """Ensure required directories exist in dataset"""
         try:
             files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
+            # Check and create chat history directory
+            if self.chat_history_path not in files:
+                self.api.upload_file(
+                    path_or_fileobj=io.StringIO(""),
+                    path_in_repo=f"{self.chat_history_path}/.gitkeep",
+                    repo_id=self.dataset_id,
+                    repo_type="dataset"
+                )
+            # Check and create annotations directory
             if self.annotations_path not in files:
                 self.api.upload_file(
                     path_or_fileobj=io.StringIO(""),
                     path_in_repo=f"{self.annotations_path}/.gitkeep",
                     repo_type="dataset"
                 )
         except Exception as e:
+            logger.error(f"Error ensuring dataset structure: {e}")
             raise
     def get_chat_history(self) -> List[Dict[str, Any]]: