Rulga committed on
Commit
3595c1e
·
1 Parent(s): 8e9845c

Refactor training data export function and add dataset structure repair tool

Browse files

- Reordered the arguments of the export_training_data_action call in app.py so that chat_evaluator is passed first.
- Added new constants for training data paths in settings.py.
- Implemented ensure_dataset_directory function in dataset.py to create missing directories for training data.
- Added a dataset structure repair tool (tools/repair_dataset_structure.py) that moves misplaced chat files and fixes duplicated paths.
- Added logging functionality to track the repair process and errors.

app.py CHANGED
@@ -1375,7 +1375,7 @@ with gr.Blocks(css="""
1375
 
1376
  # Export training data
1377
  export_btn.click(
1378
- fn=lambda min_r, path: export_training_data_action(min_r, path, chat_evaluator),
1379
  inputs=[min_rating, export_path],
1380
  outputs=[export_status]
1381
  )
 
1375
 
1376
  # Export training data
1377
  export_btn.click(
1378
+ fn=lambda min_r, path: export_training_data_action(chat_evaluator, min_r, path),
1379
  inputs=[min_rating, export_path],
1380
  outputs=[export_status]
1381
  )
config/settings.py CHANGED
@@ -29,6 +29,9 @@ DATASET_FINE_TUNED_PATH = "fine_tuned_models"
29
  DATASET_ANNOTATIONS_PATH = "annotations"
30
  DATASET_ERROR_LOGS_PATH = "error_logs"
31
  DATASET_PREFERENCES_PATH = "preferences/user_preferences.json"
 
 
 
32
 
33
  # Temporary storage (using system temp directory)
34
  TEMP_DIR = tempfile.gettempdir()
 
29
  DATASET_ANNOTATIONS_PATH = "annotations"
30
  DATASET_ERROR_LOGS_PATH = "error_logs"
31
  DATASET_PREFERENCES_PATH = "preferences/user_preferences.json"
32
+ # Adding training data paths
33
+ DATASET_TRAINING_DATA_PATH = "training_data"
34
+ DATASET_TRAINING_LOGS_PATH = "training_logs"
35
 
36
  # Temporary storage (using system temp directory)
37
  TEMP_DIR = tempfile.gettempdir()
dataset.py CHANGED
@@ -5,18 +5,43 @@ from config.settings import (
5
  DATASET_CHAT_HISTORY_PATH,
6
  DATASET_FINE_TUNED_PATH,
7
  DATASET_ANNOTATIONS_PATH,
 
 
8
  HF_TOKEN
9
  )
10
 
11
  api = HfApi(token=HF_TOKEN)
12
  dataset_name = DATASET_ID
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # Initialize dataset structure
15
  directories = [
16
  DATASET_VECTOR_STORE_PATH,
17
  DATASET_CHAT_HISTORY_PATH,
18
  DATASET_FINE_TUNED_PATH,
19
- DATASET_ANNOTATIONS_PATH
 
 
20
  ]
21
 
22
  try:
 
5
  DATASET_CHAT_HISTORY_PATH,
6
  DATASET_FINE_TUNED_PATH,
7
  DATASET_ANNOTATIONS_PATH,
8
+ DATASET_TRAINING_DATA_PATH,
9
+ DATASET_TRAINING_LOGS_PATH,
10
  HF_TOKEN
11
  )
12
 
13
  api = HfApi(token=HF_TOKEN)
14
  dataset_name = DATASET_ID
15
 
16
def ensure_dataset_directory(directory: str) -> None:
    """
    Ensure that ``directory`` is present in the dataset repository.

    Lists the files of the dataset repo ``dataset_name`` and, when no file
    path starts with ``"<directory>/"``, uploads an empty ``.gitkeep``
    placeholder to ``<directory>/.gitkeep``.

    Args:
        directory: Repo-relative directory path. Leading and trailing
            slashes are ignored, so ``"training_data"`` and
            ``"training_data/"`` refer to the same directory.

    Returns:
        None. Any failure (including an empty directory name) is reported
        on stdout and not raised.
    """
    try:
        # Normalise first: with a trailing slash the prefix would become
        # "dir//", which matches no repo path and would trigger an upload
        # to "dir//.gitkeep" on every call.
        normalized = directory.strip("/")
        if not normalized:
            raise ValueError("directory name is empty")

        prefix = f"{normalized}/"
        files = api.list_repo_files(repo_id=dataset_name, repo_type="dataset")
        if any(path.startswith(prefix) for path in files):
            # At least one file already lives under the directory.
            return

        api.upload_file(
            path_or_fileobj=b"",
            path_in_repo=f"{prefix}.gitkeep",
            repo_id=dataset_name,
            repo_type="dataset",
        )
        print(f"✓ Created missing directory: {normalized}")
    except Exception as e:
        # Best-effort by design: the error is reported and swallowed so the
        # caller continues even when the check or upload fails.
        print(f"Error ensuring directory {directory}: {e}")
32
+
33
+ # Ensure training directories exist
34
+ ensure_dataset_directory(DATASET_TRAINING_DATA_PATH)
35
+ ensure_dataset_directory(DATASET_TRAINING_LOGS_PATH)
36
+
37
  # Initialize dataset structure
38
  directories = [
39
  DATASET_VECTOR_STORE_PATH,
40
  DATASET_CHAT_HISTORY_PATH,
41
  DATASET_FINE_TUNED_PATH,
42
+ DATASET_ANNOTATIONS_PATH,
43
+ DATASET_TRAINING_DATA_PATH,
44
+ DATASET_TRAINING_LOGS_PATH
45
  ]
46
 
47
  try:
repair_dataset_structure.log ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-10 17:42:04,332 - __main__ - INFO - === Starting Dataset Structure Repair ===
2
+ 2025-04-10 17:42:04,333 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
3
+ 2025-04-10 17:42:04,727 - __main__ - INFO - Found 3 misplaced chat files
4
+ 2025-04-10 17:42:08,843 - __main__ - INFO - Moved annotations/annotation_01be5785-10a7-460b-9374-f749897987c7.json to chat_history/annotations/annotation_01be5785-10a7-460b-9374-f749897987c7.json
5
+ 2025-04-10 17:42:12,475 - __main__ - INFO - Moved annotations/annotation_46035aa7-b1c9-4c33-af58-0cf335f6727c.json to chat_history/annotations/annotation_46035aa7-b1c9-4c33-af58-0cf335f6727c.json
6
+ 2025-04-10 17:42:16,223 - __main__ - INFO - Moved annotations/annotation_6f98aeb4-58ca-48a4-a236-2719592c67b2.json to chat_history/annotations/annotation_6f98aeb4-58ca-48a4-a236-2719592c67b2.json
7
+ 2025-04-10 17:42:16,223 - __main__ - INFO - Successfully moved 3 files
8
+ 2025-04-10 17:42:16,223 - __main__ - INFO - === Repair Complete ===
9
+ 2025-04-10 18:43:48,372 - __main__ - INFO - === Starting Dataset Structure Repair ===
10
+ 2025-04-10 18:43:48,372 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
11
+ 2025-04-10 18:43:48,893 - __main__ - INFO - Found 42 misplaced chat files
12
+ 2025-04-10 18:43:53,235 - __main__ - INFO - Moved chat_history\20250403-003446.json to chat_history/chat_history\20250403-003446.json
13
+ 2025-04-10 18:43:57,233 - __main__ - INFO - Moved chat_history\20250403-004332.json to chat_history/chat_history\20250403-004332.json
14
+ 2025-04-10 18:44:01,825 - __main__ - INFO - Moved chat_history\20250403-005218.json to chat_history/chat_history\20250403-005218.json
15
+ 2025-04-10 18:44:06,138 - __main__ - INFO - Moved chat_history\20250403-005243.json to chat_history/chat_history\20250403-005243.json
16
+ 2025-04-10 18:44:10,245 - __main__ - INFO - Moved chat_history\20250403-005335.json to chat_history/chat_history\20250403-005335.json
17
+ 2025-04-10 18:44:14,219 - __main__ - INFO - Moved chat_history\20250403-115212.json to chat_history/chat_history\20250403-115212.json
18
+ 2025-04-10 18:44:19,303 - __main__ - INFO - Moved chat_history\20250403-154557.json to chat_history/chat_history\20250403-154557.json
19
+ 2025-04-10 18:44:22,916 - __main__ - INFO - Moved chat_history\20250403-170727.json to chat_history/chat_history\20250403-170727.json
20
+ 2025-04-10 18:44:26,650 - __main__ - INFO - Moved chat_history\20250403-171345.json to chat_history/chat_history\20250403-171345.json
21
+ 2025-04-10 18:44:31,024 - __main__ - INFO - Moved chat_history\20250403-180146.json to chat_history/chat_history\20250403-180146.json
22
+ 2025-04-10 18:44:34,740 - __main__ - INFO - Moved chat_history\20250403-181618.json to chat_history/chat_history\20250403-181618.json
23
+ 2025-04-10 18:44:38,314 - __main__ - INFO - Moved chat_history\20250403-182229.json to chat_history/chat_history\20250403-182229.json
24
+ 2025-04-10 18:44:41,931 - __main__ - INFO - Moved chat_history\20250404-131413.json to chat_history/chat_history\20250404-131413.json
25
+ 2025-04-10 18:44:45,580 - __main__ - INFO - Moved chat_history\20250404-131503.json to chat_history/chat_history\20250404-131503.json
26
+ 2025-04-10 18:44:49,412 - __main__ - INFO - Moved chat_history\20250404-134635.json to chat_history/chat_history\20250404-134635.json
27
+ 2025-04-10 18:44:53,247 - __main__ - INFO - Moved chat_history\20250404-140648.json to chat_history/chat_history\20250404-140648.json
28
+ 2025-04-10 18:44:56,894 - __main__ - INFO - Moved chat_history\20250404-140914.json to chat_history/chat_history\20250404-140914.json
29
+ 2025-04-10 18:45:00,593 - __main__ - INFO - Moved chat_history\20250404-140955.json to chat_history/chat_history\20250404-140955.json
30
+ 2025-04-10 18:45:04,591 - __main__ - INFO - Moved chat_history\20250404-175337.json to chat_history/chat_history\20250404-175337.json
31
+ 2025-04-10 18:45:08,298 - __main__ - INFO - Moved chat_history\20250404-180019.json to chat_history/chat_history\20250404-180019.json
32
+ 2025-04-10 18:45:11,886 - __main__ - INFO - Moved chat_history\20250404-180053.json to chat_history/chat_history\20250404-180053.json
33
+ 2025-04-10 18:45:15,675 - __main__ - INFO - Moved chat_history\20250404-180123.json to chat_history/chat_history\20250404-180123.json
34
+ 2025-04-10 18:45:19,630 - __main__ - INFO - Moved chat_history\20250404-180651.json to chat_history/chat_history\20250404-180651.json
35
+ 2025-04-10 18:45:23,351 - __main__ - INFO - Moved chat_history\20250404-180725.json to chat_history/chat_history\20250404-180725.json
36
+ 2025-04-10 18:45:27,772 - __main__ - INFO - Moved chat_history\20250404-180940.json to chat_history/chat_history\20250404-180940.json
37
+ 2025-04-10 18:45:31,678 - __main__ - INFO - Moved chat_history\20250404-181628.json to chat_history/chat_history\20250404-181628.json
38
+ 2025-04-10 18:45:35,959 - __main__ - INFO - Moved chat_history\20250404-181730.json to chat_history/chat_history\20250404-181730.json
39
+ 2025-04-10 18:45:39,765 - __main__ - INFO - Moved chat_history\20250404-184103.json to chat_history/chat_history\20250404-184103.json
40
+ 2025-04-10 18:45:43,638 - __main__ - INFO - Moved chat_history\20250404-184613.json to chat_history/chat_history\20250404-184613.json
41
+ 2025-04-10 18:45:47,378 - __main__ - INFO - Moved chat_history\20250404-184705.json to chat_history/chat_history\20250404-184705.json
42
+ 2025-04-10 18:45:51,172 - __main__ - INFO - Moved chat_history\20250404-185243.json to chat_history/chat_history\20250404-185243.json
43
+ 2025-04-10 18:45:54,867 - __main__ - INFO - Moved chat_history\20250404-185810.json to chat_history/chat_history\20250404-185810.json
44
+ 2025-04-10 18:45:58,349 - __main__ - INFO - Moved chat_history\20250404-202051.json to chat_history/chat_history\20250404-202051.json
45
+ 2025-04-10 18:46:03,364 - __main__ - INFO - Moved chat_history\20250407-174318.json to chat_history/chat_history\20250407-174318.json
46
+ 2025-04-10 18:46:06,933 - __main__ - INFO - Moved chat_history\20250407-175851.json to chat_history/chat_history\20250407-175851.json
47
+ 2025-04-10 18:46:10,714 - __main__ - INFO - Moved chat_history\20250408-135920.json to chat_history/chat_history\20250408-135920.json
48
+ 2025-04-10 18:46:14,230 - __main__ - INFO - Moved chat_history\20250408-143211.json to chat_history/chat_history\20250408-143211.json
49
+ 2025-04-10 18:46:17,815 - __main__ - INFO - Moved chat_history\20250408-152804.json to chat_history/chat_history\20250408-152804.json
50
+ 2025-04-10 18:46:21,415 - __main__ - INFO - Moved chat_history\20250408-161228.json to chat_history/chat_history\20250408-161228.json
51
+ 2025-04-10 18:46:25,573 - __main__ - INFO - Moved chat_history\20250408-165140.json to chat_history/chat_history\20250408-165140.json
52
+ 2025-04-10 18:46:29,149 - __main__ - INFO - Moved chat_history\20250408-165215.json to chat_history/chat_history\20250408-165215.json
53
+ 2025-04-10 18:46:32,798 - __main__ - INFO - Moved chat_history\20250408-165308.json to chat_history/chat_history\20250408-165308.json
54
+ 2025-04-10 18:46:32,798 - __main__ - INFO - Successfully moved 42 files from root to chat_history
55
+ 2025-04-10 18:46:32,814 - __main__ - INFO - === Repair Complete ===
56
+ 2025-04-10 18:52:09,005 - __main__ - INFO - === Starting Dataset Structure Repair ===
57
+ 2025-04-10 18:52:09,005 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
58
+ 2025-04-10 18:52:09,005 - __main__ - INFO - === Starting Path Fix ===
59
+ 2025-04-10 18:52:09,415 - __main__ - INFO - Found 42 files with duplicated chat_history path
60
+ 2025-04-10 18:52:17,036 - __main__ - INFO - Renamed chat_history/chat_history\20250403-003446.json to chat_history/20250403-003446.json
61
+ 2025-04-10 18:52:21,492 - __main__ - INFO - Renamed chat_history/chat_history\20250403-004332.json to chat_history/20250403-004332.json
62
+ 2025-04-10 18:52:25,225 - __main__ - INFO - Renamed chat_history/chat_history\20250403-005218.json to chat_history/20250403-005218.json
63
+ 2025-04-10 18:52:29,282 - __main__ - INFO - Renamed chat_history/chat_history\20250403-005243.json to chat_history/20250403-005243.json
64
+ 2025-04-10 18:52:32,882 - __main__ - INFO - Renamed chat_history/chat_history\20250403-005335.json to chat_history/20250403-005335.json
65
+ 2025-04-10 18:52:36,438 - __main__ - INFO - Renamed chat_history/chat_history\20250403-115212.json to chat_history/20250403-115212.json
66
+ 2025-04-10 18:52:40,280 - __main__ - INFO - Renamed chat_history/chat_history\20250403-154557.json to chat_history/20250403-154557.json
67
+ 2025-04-10 18:52:44,213 - __main__ - INFO - Renamed chat_history/chat_history\20250403-170727.json to chat_history/20250403-170727.json
68
+ 2025-04-10 18:52:48,115 - __main__ - INFO - Renamed chat_history/chat_history\20250403-171345.json to chat_history/20250403-171345.json
69
+ 2025-04-10 18:52:51,976 - __main__ - INFO - Renamed chat_history/chat_history\20250403-180146.json to chat_history/20250403-180146.json
70
+ 2025-04-10 18:52:55,930 - __main__ - INFO - Renamed chat_history/chat_history\20250403-181618.json to chat_history/20250403-181618.json
71
+ 2025-04-10 18:52:59,738 - __main__ - INFO - Renamed chat_history/chat_history\20250403-182229.json to chat_history/20250403-182229.json
72
+ 2025-04-10 18:53:05,572 - __main__ - INFO - Renamed chat_history/chat_history\20250404-131413.json to chat_history/20250404-131413.json
73
+ 2025-04-10 18:53:09,406 - __main__ - INFO - Renamed chat_history/chat_history\20250404-131503.json to chat_history/20250404-131503.json
74
+ 2025-04-10 18:53:13,212 - __main__ - INFO - Renamed chat_history/chat_history\20250404-134635.json to chat_history/20250404-134635.json
75
+ 2025-04-10 18:53:17,506 - __main__ - INFO - Renamed chat_history/chat_history\20250404-140648.json to chat_history/20250404-140648.json
76
+ 2025-04-10 18:53:21,872 - __main__ - INFO - Renamed chat_history/chat_history\20250404-140914.json to chat_history/20250404-140914.json
77
+ 2025-04-10 18:53:25,516 - __main__ - INFO - Renamed chat_history/chat_history\20250404-140955.json to chat_history/20250404-140955.json
78
+ 2025-04-10 18:53:29,177 - __main__ - INFO - Renamed chat_history/chat_history\20250404-175337.json to chat_history/20250404-175337.json
79
+ 2025-04-10 18:53:32,998 - __main__ - INFO - Renamed chat_history/chat_history\20250404-180019.json to chat_history/20250404-180019.json
80
+ 2025-04-10 18:53:36,716 - __main__ - INFO - Renamed chat_history/chat_history\20250404-180053.json to chat_history/20250404-180053.json
81
+ 2025-04-10 18:53:40,483 - __main__ - INFO - Renamed chat_history/chat_history\20250404-180123.json to chat_history/20250404-180123.json
82
+ 2025-04-10 18:53:43,269 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180651.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f791-503acbd66e4aeb5f2ba7abd9;11c9bb9a-ac35-4c84-ba83-0a1f415ee40c)
83
+
84
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
85
+ 2025-04-10 18:53:46,037 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180725.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f794-490d14f53aebbbb72066ab6f;131385d7-3a06-4037-a1d0-0a8ba167c80d)
86
+
87
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
88
+ 2025-04-10 18:53:49,886 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180940.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f798-2e222f1d59ac3dc41de64dfb;aecbf90a-b7b0-43ca-b4b1-fffd4d6ab70d)
89
+
90
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
91
+ 2025-04-10 18:53:52,612 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181628.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f79b-09915158005dfa7c5a56a15d;3d5cecd6-e1a3-44f1-8234-8ab0a873b718)
92
+
93
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
94
+ 2025-04-10 18:53:55,411 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181730.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f79d-64e62bbb44ac2ef51c8518b0;927958ad-8808-4eb2-96b3-0528085c7396)
95
+
96
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
97
+ 2025-04-10 18:53:58,337 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184103.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a0-5d8a1a2c1070cf2648e0ec44;37fccdca-bcc5-40d7-85f6-f26caa2489e6)
98
+
99
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
100
+ 2025-04-10 18:54:01,048 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184613.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a3-448e4bda194517cf7a8285ed;14c70e24-3545-4b6a-841b-2bbac1e9bc6b)
101
+
102
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
103
+ 2025-04-10 18:54:03,937 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184705.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a6-7655014d5e3a972d08815ddc;3cd55990-22c6-499a-95f1-f662d12c51b8)
104
+
105
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
106
+ 2025-04-10 18:54:06,653 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-185243.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a9-2eced6ec760924e43262136d;633dd326-9951-49b0-9b52-57825ae88ed0)
107
+
108
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
109
+ 2025-04-10 18:54:09,403 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-185810.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7ab-0d9b2d483b6902386b048933;b95da74a-3609-454d-a0c0-0a7f14b6d6de)
110
+
111
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
112
+ 2025-04-10 18:54:12,136 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-202051.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7ae-52d0acc90186a3dc4b9254a6;6c44b4d6-6627-4b01-8a50-a4fa21959fdb)
113
+
114
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
115
+ 2025-04-10 18:54:15,081 - __main__ - ERROR - Error processing file chat_history/chat_history\20250407-174318.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b1-4e99483725d815ad61bbaacc;6fb9057f-8d66-43ac-b603-789f4f7ba815)
116
+
117
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
118
+ 2025-04-10 18:54:17,822 - __main__ - ERROR - Error processing file chat_history/chat_history\20250407-175851.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b4-2b9c83346c295d82770bd968;8e704238-9252-426b-b350-e3f52fd2cbb0)
119
+
120
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
121
+ 2025-04-10 18:54:20,537 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-135920.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b6-6ea689051ea0cca350f3ca17;0fa53b91-119c-47b2-9761-93ea7d93a993)
122
+
123
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
124
+ 2025-04-10 18:54:23,255 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-143211.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b9-25cbff3e1f06fa441785271c;6c6a1190-6791-4167-9f75-a668903be2e1)
125
+
126
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
127
+ 2025-04-10 18:54:27,172 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-152804.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7bd-5f8da572682024ef3aeecadd;65480ebf-2375-4cd8-95f3-bc841d8dc171)
128
+
129
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
130
+ 2025-04-10 18:54:29,899 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-161228.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c0-75a715fb768bdb2a68ccf46f;95d56ef4-8de8-40c9-9297-e7a8281a13d6)
131
+
132
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
133
+ 2025-04-10 18:54:32,895 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-165140.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c3-5f70e9ba565434fc7a3c5e18;275dda97-05cc-4a23-9baf-a8f43ede9d62)
134
+
135
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
136
+ 2025-04-10 18:54:35,623 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-165215.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c6-133de4eb4630a67f0b375fc6;046200b9-aae9-42e9-85ed-e3a684b164d4)
137
+
138
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
139
+ 2025-04-10 18:54:38,329 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-165308.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c8-7723083e298904f14a1af942;fc2bd99a-4fcf-4936-a15c-acbb481941e7)
140
+
141
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
142
+ 2025-04-10 18:54:38,329 - __main__ - INFO - Successfully renamed 22 files
143
+ 2025-04-10 18:54:38,329 - __main__ - WARNING - Failed to process 20 files
144
+ 2025-04-10 18:54:38,329 - __main__ - INFO - === Repair Complete ===
145
+ 2025-04-10 18:59:28,315 - __main__ - INFO - === Starting Dataset Structure Repair ===
146
+ 2025-04-10 18:59:28,318 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
147
+ 2025-04-10 18:59:28,318 - __main__ - INFO - === Starting Path Fix ===
148
+ 2025-04-10 18:59:28,704 - __main__ - INFO - Found 20 files with duplicated chat_history path
149
+ 2025-04-10 18:59:31,306 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180651.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8ed-3fe84d2154e3e007658207e9;73ab81c7-35e8-46ae-92a1-667dac29201b)
150
+
151
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
152
+ 2025-04-10 18:59:33,846 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180725.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f0-66056ea913dbdff1621a3e5c;3cc301c9-c3ce-4c3a-916f-60b221917742)
153
+
154
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
155
+ 2025-04-10 18:59:36,390 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180940.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f2-36854ccc08420cd35c1613b5;a7ca9aea-ac48-4ab6-b4bd-696ae751173a)
156
+
157
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
158
+ 2025-04-10 18:59:38,943 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181628.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f5-4901cbfb535be455015ceb14;39923c77-0144-4d89-a830-8f80a2456e0b)
159
+
160
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
161
+ 2025-04-10 18:59:41,452 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181730.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f7-1eabdc675e3f49fe21c60749;88a17e2b-011e-4440-b6c8-54da2f298b03)
162
+
163
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
164
+ 2025-04-10 18:59:44,025 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184103.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8fa-737ad0103e15fcc177b32fd4;25d06267-14df-4e02-aca0-c56fea86c85b)
165
+
166
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
167
+ 2025-04-10 18:59:46,563 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184613.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8fd-070c52d76345eac007a8ddaf;d5d3f452-f76a-44c8-a439-96a47bc07e77)
168
+
169
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
170
+ 2025-04-10 18:59:49,073 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184705.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8ff-646983a842f1edcd584b57cb;3f66f58c-621e-4842-be44-72c9d1c0fdde)
171
+
172
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
173
+ 2025-04-10 18:59:51,615 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-185243.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f902-3dafd5c579a831d05562a7f0;a67f1694-a090-4590-b015-cb5b6f8a91ae)
174
+
175
+ You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
176
+ 2025-04-10 18:59:52,416 - __main__ - INFO -
177
+ Received keyboard interrupt, stopping gracefully...
178
+ 2025-04-10 18:59:55,429 - __main__ - INFO - Process stopped by user
tools/repair_dataset_structure.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ import sys
4
+ import os
5
+ from pathlib import Path
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables from the .env file
9
+ load_dotenv()
10
+
11
+ # Add the project root (two levels up from this file) to sys.path; presumably so the config.settings import below resolves when this tool is run directly
12
+ root_dir = str(Path(__file__).parent.parent)
13
+ sys.path.append(root_dir)
14
+
15
+ from huggingface_hub import HfApi
16
+ from config.settings import (
17
+ DATASET_ID,
18
+ DATASET_CHAT_HISTORY_PATH,
19
+ HF_TOKEN
20
+ )
21
+
22
+ # Logging setup: INFO level, sent both to repair_dataset_structure.log (UTF-8) and to a stream handler
23
+ logging.basicConfig(
24
+ level=logging.INFO,
25
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
26
+ handlers=[
27
+ logging.FileHandler("repair_dataset_structure.log", encoding='utf-8'),
28
+ logging.StreamHandler()
29
+ ]
30
+ )
31
+ logger = logging.getLogger(__name__)
32
+
33
def repair_chat_files_structure():
    """
    Move misplaced chat files from the dataset root into chat_history/.

    A root-level file is treated as a chat file when its name ends with
    ``.json`` and contains a ``-`` (the timestamp separator).

    Each file is moved with a single commit that adds it at the new path and
    deletes it at the old one. A failed request therefore cannot leave the
    file present in both locations, and only one commit is spent per file
    instead of two.

    The loop ends early when ``repair_chat_files_structure.stop_flag`` is set
    to a truthy value, or when the Hub answers with HTTP 429: once the account
    is rate-limited, every remaining commit would be rejected as well.

    Returns nothing. Progress and failures are reported through ``logger``;
    exceptions derived from ``Exception`` are logged, not propagated.
    """
    # Imported locally so the function is self-contained and does not rely on
    # changes to the module-level import block.
    from huggingface_hub import CommitOperationAdd, CommitOperationDelete

    try:
        api = HfApi(token=HF_TOKEN)

        # List every file in the dataset repository
        files = api.list_repo_files(
            repo_id=DATASET_ID,
            repo_type="dataset"
        )

        # Keep only chat files that sit directly in the repository root
        misplaced_files = [
            f for f in files
            if f.endswith('.json') and
            '/' not in f and  # root-level files only
            '-' in f          # timestamped chat files contain a dash
        ]

        logger.info(f"Found {len(misplaced_files)} misplaced chat files")

        moved_count = 0
        error_count = 0

        for file_path in misplaced_files:
            try:
                # Honour an externally set stop flag
                if getattr(repair_chat_files_structure, 'stop_flag', False):
                    logger.info("Stopping process...")
                    break

                # Pause between operations to stay gentle on the Hub API
                time.sleep(2)

                # hf_hub_download returns the local path of the cached copy
                local_path = api.hf_hub_download(
                    repo_id=DATASET_ID,
                    filename=file_path,
                    repo_type="dataset"
                )

                # Target location inside the existing chat_history directory
                new_path = f"chat_history/{file_path}"

                # Add at the new path and delete the old one atomically
                api.create_commit(
                    repo_id=DATASET_ID,
                    repo_type="dataset",
                    operations=[
                        CommitOperationAdd(
                            path_in_repo=new_path,
                            path_or_fileobj=local_path
                        ),
                        CommitOperationDelete(path_in_repo=file_path),
                    ],
                    commit_message=f"Move {file_path} to {new_path}"
                )

                logger.info(f"Moved {file_path} to {new_path}")
                moved_count += 1

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {str(e)}")
                error_count += 1

                # HTTP errors raised by the Hub client carry the response;
                # a 429 means further attempts are pointless for now.
                response = getattr(e, 'response', None)
                if getattr(response, 'status_code', None) == 429:
                    logger.warning(
                        "Rate limited by the Hub (HTTP 429); "
                        "skipping the remaining files"
                    )
                    break
                continue

        logger.info(f"Successfully moved {moved_count} files from root to chat_history")
        if error_count > 0:
            logger.warning(f"Failed to process {error_count} files")

    except Exception as e:
        logger.error(f"Error accessing dataset: {str(e)}")
109
+
110
def fix_duplicated_paths():
    r"""
    Fix duplicated chat_history paths in filenames.

    Targets ``.json`` files under ``chat_history/`` whose name still contains
    a literal ``chat_history\`` prefix (a Windows-style separator that ended
    up inside the filename) and renames them to ``chat_history/<name>``.

    Each rename is done with a single commit that adds the file at the
    corrected path and deletes the old entry. A failed request therefore
    cannot leave both copies behind, and only one commit is spent per file
    instead of two.

    The loop ends early when ``fix_duplicated_paths.stop_flag`` is set to a
    truthy value, or when the Hub answers with HTTP 429: once the account is
    rate-limited, every remaining commit would be rejected as well.

    Returns nothing. Progress and failures are reported through ``logger``;
    exceptions derived from ``Exception`` are logged, not propagated.
    """
    # Imported locally so the function is self-contained and does not rely on
    # changes to the module-level import block.
    from huggingface_hub import CommitOperationAdd, CommitOperationDelete

    try:
        api = HfApi(token=HF_TOKEN)

        repo_files = api.list_repo_files(
            repo_id=DATASET_ID,
            repo_type="dataset"
        )

        # Only chat_history files whose name embeds the duplicated prefix
        wrong_paths = [
            f for f in repo_files
            if f.startswith('chat_history/') and
            f.endswith('.json') and
            'chat_history\\' in f  # Windows-style path baked into the name
        ]

        logger.info(f"Found {len(wrong_paths)} files with duplicated chat_history path")

        fixed_count = 0
        error_count = 0

        for file_path in wrong_paths:
            try:
                # Honour an externally set stop flag
                if getattr(fix_duplicated_paths, 'stop_flag', False):
                    logger.info("Stopping process...")
                    break

                # Pause between operations to stay gentle on the Hub API
                time.sleep(2)

                # hf_hub_download returns the local path of the cached copy
                local_path = api.hf_hub_download(
                    repo_id=DATASET_ID,
                    filename=file_path,
                    repo_type="dataset"
                )

                # Build the corrected path. On POSIX basename keeps the
                # "chat_history\" prefix, so it is stripped explicitly; on
                # Windows basename already drops it and replace is a no-op.
                filename = os.path.basename(file_path).replace('chat_history\\', '')
                new_path = f"chat_history/{filename}"

                # Add at the corrected path and delete the old one atomically
                api.create_commit(
                    repo_id=DATASET_ID,
                    repo_type="dataset",
                    operations=[
                        CommitOperationAdd(
                            path_in_repo=new_path,
                            path_or_fileobj=local_path
                        ),
                        CommitOperationDelete(path_in_repo=file_path),
                    ],
                    commit_message=f"Rename {file_path} to {new_path}"
                )

                logger.info(f"Renamed {file_path} to {new_path}")
                fixed_count += 1

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {str(e)}")
                error_count += 1

                # HTTP errors raised by the Hub client carry the response;
                # a 429 means further attempts are pointless for now.
                response = getattr(e, 'response', None)
                if getattr(response, 'status_code', None) == 429:
                    logger.warning(
                        "Rate limited by the Hub (HTTP 429); "
                        "skipping the remaining files"
                    )
                    break
                continue

        logger.info(f"Successfully renamed {fixed_count} files")
        if error_count > 0:
            logger.warning(f"Failed to process {error_count} files")

    except Exception as e:
        logger.error(f"Error accessing dataset: {str(e)}")
184
+
185
if __name__ == "__main__":
    # Script entry point: runs the repair steps and logs the outcome.
    try:
        logger.info("=== Starting Dataset Structure Repair ===")
        logger.info(f"Dataset ID: {DATASET_ID}")

        # Step 1 (currently disabled): move chat files out of the repo root
        #repair_chat_files_structure()

        # Step 2: fix filenames that embed a duplicated chat_history path
        logger.info("=== Starting Path Fix ===")
        fix_duplicated_paths()

        logger.info("=== Repair Complete ===")
    except KeyboardInterrupt:
        logger.info("\nReceived keyboard interrupt, stopping gracefully...")
        # NOTE(review): the repair functions only catch Exception, so by the
        # time KeyboardInterrupt reaches this handler they have already
        # unwound; these flags look like they cannot affect the current run.
        for repair_step in (repair_chat_files_structure, fix_duplicated_paths):
            repair_step.stop_flag = True
        time.sleep(3)
        logger.info("Process stopped by user")
    except Exception as exc:
        logger.error(f"Unexpected error: {str(exc)}")
206
+
207
+
208
+
209
+
210
+
211
+