Spaces:
Running
Running
Refactor dataset configuration; streamline path definitions and ensure necessary directories exist
Browse files
- app.py +2 -3
- config/settings.py +17 -9
- src/analytics/chat_evaluator.py +30 -12
app.py
CHANGED
|
@@ -56,8 +56,7 @@ context_store = {}
|
|
| 56 |
fallback_model_attempted = False
|
| 57 |
chat_evaluator = ChatEvaluator(
|
| 58 |
hf_token=HF_TOKEN,
|
| 59 |
-
dataset_id=DATASET_ID
|
| 60 |
-
chat_history_path=CHAT_HISTORY_PATH
|
| 61 |
)
|
| 62 |
|
| 63 |
logger.info(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
|
|
@@ -1090,4 +1089,4 @@ if __name__ == "__main__":
|
|
| 1090 |
if not load_vector_store():
|
| 1091 |
logger.warning("Knowledge base not found. Please create it through the interface.")
|
| 1092 |
|
| 1093 |
-
demo.launch(share=True)
|
|
|
|
| 56 |
fallback_model_attempted = False
|
| 57 |
chat_evaluator = ChatEvaluator(
|
| 58 |
hf_token=HF_TOKEN,
|
| 59 |
+
dataset_id=DATASET_ID
|
|
|
|
| 60 |
)
|
| 61 |
|
| 62 |
logger.info(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
|
|
|
|
| 1089 |
if not load_vector_store():
|
| 1090 |
logger.warning("Knowledge base not found. Please create it through the interface.")
|
| 1091 |
|
| 1092 |
+
demo.launch(share=True)
|
config/settings.py
CHANGED
|
@@ -20,9 +20,23 @@ API_CONFIG = {
|
|
| 20 |
|
| 21 |
# Dataset configuration
|
| 22 |
DATASET_ID = "Rulga/status-law-knowledge-base"
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# Paths configuration
|
| 28 |
MODEL_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
|
|
@@ -32,12 +46,6 @@ TRAINING_OUTPUT_DIR = os.path.join(CHAT_HISTORY_PATH, FINE_TUNED_PATH)
|
|
| 32 |
os.makedirs(MODEL_PATH, exist_ok=True)
|
| 33 |
os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)
|
| 34 |
|
| 35 |
-
# Dataset paths
|
| 36 |
-
DATASET_CHAT_HISTORY_PATH = f"{DATASET_ID}/chat_history"
|
| 37 |
-
DATASET_VECTOR_STORE_PATH = f"{DATASET_ID}/vector_store"
|
| 38 |
-
DATASET_FINE_TUNED_PATH = f"{DATASET_ID}/fine_tuned_models"
|
| 39 |
-
MODELS_REGISTRY_PATH = os.path.join(CHAT_HISTORY_PATH, "models_registry.json")
|
| 40 |
-
|
| 41 |
# Models configuration with detailed information
|
| 42 |
MODELS = {
|
| 43 |
"zephyr-7b": {
|
|
|
|
| 20 |
|
| 21 |
# Dataset configuration
|
| 22 |
DATASET_ID = "Rulga/status-law-knowledge-base"
|
| 23 |
+
|
| 24 |
+
# Dataset paths
|
| 25 |
+
DATASET_CHAT_HISTORY_PATH = "chat_history"
|
| 26 |
+
DATASET_VECTOR_STORE_PATH = "vector_store"
|
| 27 |
+
DATASET_FINE_TUNED_PATH = "fine_tuned_models"
|
| 28 |
+
DATASET_ANNOTATIONS_PATH = "annotations"
|
| 29 |
+
|
| 30 |
+
# Local paths (temporary storage)
|
| 31 |
+
CHAT_HISTORY_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "chat_history")
|
| 32 |
+
VECTOR_STORE_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "vector_store")
|
| 33 |
+
FINE_TUNED_PATH = os.path.join(CHAT_HISTORY_PATH, "fine_tuned_models")
|
| 34 |
+
MODELS_REGISTRY_PATH = os.path.join(CHAT_HISTORY_PATH, "models_registry.json")
|
| 35 |
+
|
| 36 |
+
# Create necessary directories if they don't exist
|
| 37 |
+
os.makedirs(CHAT_HISTORY_PATH, exist_ok=True)
|
| 38 |
+
os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
|
| 39 |
+
os.makedirs(FINE_TUNED_PATH, exist_ok=True)
|
| 40 |
|
| 41 |
# Paths configuration
|
| 42 |
MODEL_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
|
|
|
|
| 46 |
os.makedirs(MODEL_PATH, exist_ok=True)
|
| 47 |
os.makedirs(TRAINING_OUTPUT_DIR, exist_ok=True)
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# Models configuration with detailed information
|
| 50 |
MODELS = {
|
| 51 |
"zephyr-7b": {
|
src/analytics/chat_evaluator.py
CHANGED
|
@@ -14,6 +14,13 @@ import logging
|
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
class ChatEvaluator:
|
| 18 |
def __init__(self, hf_token: str = None, dataset_id: str = None):
|
| 19 |
"""
|
|
@@ -23,26 +30,36 @@ class ChatEvaluator:
|
|
| 23 |
hf_token: Hugging Face token
|
| 24 |
dataset_id: Dataset ID on Hugging Face
|
| 25 |
"""
|
| 26 |
-
self.hf_token = hf_token or
|
| 27 |
-
self.dataset_id = dataset_id or
|
| 28 |
self.api = HfApi(token=self.hf_token)
|
| 29 |
|
| 30 |
-
#
|
| 31 |
-
self.
|
|
|
|
| 32 |
|
| 33 |
-
# Ensure
|
| 34 |
try:
|
| 35 |
-
self.
|
| 36 |
except Exception as e:
|
| 37 |
-
logger.error(f"Failed to ensure
|
| 38 |
|
| 39 |
-
def
|
| 40 |
-
"""Ensure
|
| 41 |
try:
|
| 42 |
-
# Check if directory exists
|
| 43 |
files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
if self.annotations_path not in files:
|
| 45 |
-
# Create empty file to initialize directory
|
| 46 |
self.api.upload_file(
|
| 47 |
path_or_fileobj=io.StringIO(""),
|
| 48 |
path_in_repo=f"{self.annotations_path}/.gitkeep",
|
|
@@ -50,7 +67,7 @@ class ChatEvaluator:
|
|
| 50 |
repo_type="dataset"
|
| 51 |
)
|
| 52 |
except Exception as e:
|
| 53 |
-
logger.error(f"Error ensuring
|
| 54 |
raise
|
| 55 |
|
| 56 |
def get_chat_history(self) -> List[Dict[str, Any]]:
|
|
@@ -336,3 +353,4 @@ class ChatEvaluator:
|
|
| 336 |
|
| 337 |
|
| 338 |
|
|
|
|
|
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
| 17 |
+
from config.settings import (
|
| 18 |
+
DATASET_ID,
|
| 19 |
+
DATASET_CHAT_HISTORY_PATH,
|
| 20 |
+
DATASET_ANNOTATIONS_PATH,
|
| 21 |
+
HF_TOKEN
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
class ChatEvaluator:
|
| 25 |
def __init__(self, hf_token: str = None, dataset_id: str = None):
|
| 26 |
"""
|
|
|
|
| 30 |
hf_token: Hugging Face token
|
| 31 |
dataset_id: Dataset ID on Hugging Face
|
| 32 |
"""
|
| 33 |
+
self.hf_token = hf_token or HF_TOKEN
|
| 34 |
+
self.dataset_id = dataset_id or DATASET_ID
|
| 35 |
self.api = HfApi(token=self.hf_token)
|
| 36 |
|
| 37 |
+
# Use dataset paths
|
| 38 |
+
self.chat_history_path = DATASET_CHAT_HISTORY_PATH
|
| 39 |
+
self.annotations_path = DATASET_ANNOTATIONS_PATH
|
| 40 |
|
| 41 |
+
# Ensure directories exist in dataset
|
| 42 |
try:
|
| 43 |
+
self._ensure_dataset_structure()
|
| 44 |
except Exception as e:
|
| 45 |
+
logger.error(f"Failed to ensure dataset structure: {e}")
|
| 46 |
|
| 47 |
+
def _ensure_dataset_structure(self):
|
| 48 |
+
"""Ensure required directories exist in dataset"""
|
| 49 |
try:
|
|
|
|
| 50 |
files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
|
| 51 |
+
|
| 52 |
+
# Check and create chat history directory
|
| 53 |
+
if self.chat_history_path not in files:
|
| 54 |
+
self.api.upload_file(
|
| 55 |
+
path_or_fileobj=io.StringIO(""),
|
| 56 |
+
path_in_repo=f"{self.chat_history_path}/.gitkeep",
|
| 57 |
+
repo_id=self.dataset_id,
|
| 58 |
+
repo_type="dataset"
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
# Check and create annotations directory
|
| 62 |
if self.annotations_path not in files:
|
|
|
|
| 63 |
self.api.upload_file(
|
| 64 |
path_or_fileobj=io.StringIO(""),
|
| 65 |
path_in_repo=f"{self.annotations_path}/.gitkeep",
|
|
|
|
| 67 |
repo_type="dataset"
|
| 68 |
)
|
| 69 |
except Exception as e:
|
| 70 |
+
logger.error(f"Error ensuring dataset structure: {e}")
|
| 71 |
raise
|
| 72 |
|
| 73 |
def get_chat_history(self) -> List[Dict[str, Any]]:
|
|
|
|
| 353 |
|
| 354 |
|
| 355 |
|
| 356 |
+
|