Spaces:
Running
Running
Refactor training data export function and add dataset structure repair tool
Browse files
- Updated the export_training_data_action function call to change the order of parameters in app.py.
- Added new constants for training data paths in settings.py.
- Implemented ensure_dataset_directory function in dataset.py to create missing directories for training data.
- Enhanced dataset structure repair tool in repair_dataset_structure.py to move misplaced chat files and fix duplicated paths.
- Added logging functionality to track the repair process and errors.
- app.py +1 -1
- config/settings.py +3 -0
- dataset.py +26 -1
- repair_dataset_structure.log +178 -0
- tools/repair_dataset_structure.py +211 -0
app.py
CHANGED
|
@@ -1375,7 +1375,7 @@ with gr.Blocks(css="""
|
|
| 1375 |
|
| 1376 |
# Export training data
|
| 1377 |
export_btn.click(
|
| 1378 |
-
fn=lambda min_r, path: export_training_data_action(min_r, path
|
| 1379 |
inputs=[min_rating, export_path],
|
| 1380 |
outputs=[export_status]
|
| 1381 |
)
|
|
|
|
| 1375 |
|
| 1376 |
# Export training data
|
| 1377 |
export_btn.click(
|
| 1378 |
+
fn=lambda min_r, path: export_training_data_action(chat_evaluator, min_r, path),
|
| 1379 |
inputs=[min_rating, export_path],
|
| 1380 |
outputs=[export_status]
|
| 1381 |
)
|
config/settings.py
CHANGED
|
@@ -29,6 +29,9 @@ DATASET_FINE_TUNED_PATH = "fine_tuned_models"
|
|
| 29 |
DATASET_ANNOTATIONS_PATH = "annotations"
|
| 30 |
DATASET_ERROR_LOGS_PATH = "error_logs"
|
| 31 |
DATASET_PREFERENCES_PATH = "preferences/user_preferences.json"
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
# Temporary storage (using system temp directory)
|
| 34 |
TEMP_DIR = tempfile.gettempdir()
|
|
|
|
| 29 |
DATASET_ANNOTATIONS_PATH = "annotations"
|
| 30 |
DATASET_ERROR_LOGS_PATH = "error_logs"
|
| 31 |
DATASET_PREFERENCES_PATH = "preferences/user_preferences.json"
|
| 32 |
+
# Adding training data paths
|
| 33 |
+
DATASET_TRAINING_DATA_PATH = "training_data"
|
| 34 |
+
DATASET_TRAINING_LOGS_PATH = "training_logs"
|
| 35 |
|
| 36 |
# Temporary storage (using system temp directory)
|
| 37 |
TEMP_DIR = tempfile.gettempdir()
|
dataset.py
CHANGED
|
@@ -5,18 +5,43 @@ from config.settings import (
|
|
| 5 |
DATASET_CHAT_HISTORY_PATH,
|
| 6 |
DATASET_FINE_TUNED_PATH,
|
| 7 |
DATASET_ANNOTATIONS_PATH,
|
|
|
|
|
|
|
| 8 |
HF_TOKEN
|
| 9 |
)
|
| 10 |
|
| 11 |
api = HfApi(token=HF_TOKEN)
|
| 12 |
dataset_name = DATASET_ID
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# Initialize dataset structure
|
| 15 |
directories = [
|
| 16 |
DATASET_VECTOR_STORE_PATH,
|
| 17 |
DATASET_CHAT_HISTORY_PATH,
|
| 18 |
DATASET_FINE_TUNED_PATH,
|
| 19 |
-
DATASET_ANNOTATIONS_PATH
|
|
|
|
|
|
|
| 20 |
]
|
| 21 |
|
| 22 |
try:
|
|
|
|
| 5 |
DATASET_CHAT_HISTORY_PATH,
|
| 6 |
DATASET_FINE_TUNED_PATH,
|
| 7 |
DATASET_ANNOTATIONS_PATH,
|
| 8 |
+
DATASET_TRAINING_DATA_PATH,
|
| 9 |
+
DATASET_TRAINING_LOGS_PATH,
|
| 10 |
HF_TOKEN
|
| 11 |
)
|
| 12 |
|
| 13 |
api = HfApi(token=HF_TOKEN)
|
| 14 |
dataset_name = DATASET_ID
|
| 15 |
|
| 16 |
+
def ensure_dataset_directory(directory: str) -> None:
    """
    Ensure that a directory exists in the dataset repository.

    Lists the files of the dataset repo ``dataset_name`` and, when no file
    path starts with ``<directory>/``, uploads an empty ``.gitkeep`` file
    under that prefix so the directory shows up in the repo. This is a
    best-effort helper: any failure is printed and never raised.

    Args:
        directory: Repo-relative directory path. Backslashes are converted
            to forward slashes and leading/trailing slashes are dropped
            before the path is used. A path that is empty after this
            normalisation is reported and skipped.
    """
    # The prefix test and the upload path below both use "/" as separator.
    # Normalise first so that a slash-terminated or backslash-separated
    # argument cannot produce a malformed path such as "name//.gitkeep".
    repo_dir = directory.replace("\\", "/").strip("/")
    if not repo_dir:
        # Without this guard the file would be uploaded as "/.gitkeep".
        print(f"Error ensuring directory {directory}: empty directory path")
        return

    prefix = f"{repo_dir}/"
    try:
        files = api.list_repo_files(repo_id=dataset_name, repo_type="dataset")
        if not any(f.startswith(prefix) for f in files):
            # Nothing lives under the prefix yet: add an empty placeholder.
            api.upload_file(
                path_or_fileobj=b"",
                path_in_repo=f"{prefix}.gitkeep",
                repo_id=dataset_name,
                repo_type="dataset",
            )
            print(f"✓ Created missing directory: {repo_dir}")
    except Exception as e:
        # Deliberately broad: a failed listing or upload is only reported,
        # so the caller continues regardless.
        print(f"Error ensuring directory {repo_dir}: {e}")
|
| 32 |
+
|
| 33 |
+
# Ensure training directories exist
|
| 34 |
+
ensure_dataset_directory(DATASET_TRAINING_DATA_PATH)
|
| 35 |
+
ensure_dataset_directory(DATASET_TRAINING_LOGS_PATH)
|
| 36 |
+
|
| 37 |
# Initialize dataset structure
|
| 38 |
directories = [
|
| 39 |
DATASET_VECTOR_STORE_PATH,
|
| 40 |
DATASET_CHAT_HISTORY_PATH,
|
| 41 |
DATASET_FINE_TUNED_PATH,
|
| 42 |
+
DATASET_ANNOTATIONS_PATH,
|
| 43 |
+
DATASET_TRAINING_DATA_PATH,
|
| 44 |
+
DATASET_TRAINING_LOGS_PATH
|
| 45 |
]
|
| 46 |
|
| 47 |
try:
|
repair_dataset_structure.log
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-04-10 17:42:04,332 - __main__ - INFO - === Starting Dataset Structure Repair ===
|
| 2 |
+
2025-04-10 17:42:04,333 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
|
| 3 |
+
2025-04-10 17:42:04,727 - __main__ - INFO - Found 3 misplaced chat files
|
| 4 |
+
2025-04-10 17:42:08,843 - __main__ - INFO - Moved annotations/annotation_01be5785-10a7-460b-9374-f749897987c7.json to chat_history/annotations/annotation_01be5785-10a7-460b-9374-f749897987c7.json
|
| 5 |
+
2025-04-10 17:42:12,475 - __main__ - INFO - Moved annotations/annotation_46035aa7-b1c9-4c33-af58-0cf335f6727c.json to chat_history/annotations/annotation_46035aa7-b1c9-4c33-af58-0cf335f6727c.json
|
| 6 |
+
2025-04-10 17:42:16,223 - __main__ - INFO - Moved annotations/annotation_6f98aeb4-58ca-48a4-a236-2719592c67b2.json to chat_history/annotations/annotation_6f98aeb4-58ca-48a4-a236-2719592c67b2.json
|
| 7 |
+
2025-04-10 17:42:16,223 - __main__ - INFO - Successfully moved 3 files
|
| 8 |
+
2025-04-10 17:42:16,223 - __main__ - INFO - === Repair Complete ===
|
| 9 |
+
2025-04-10 18:43:48,372 - __main__ - INFO - === Starting Dataset Structure Repair ===
|
| 10 |
+
2025-04-10 18:43:48,372 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
|
| 11 |
+
2025-04-10 18:43:48,893 - __main__ - INFO - Found 42 misplaced chat files
|
| 12 |
+
2025-04-10 18:43:53,235 - __main__ - INFO - Moved chat_history\20250403-003446.json to chat_history/chat_history\20250403-003446.json
|
| 13 |
+
2025-04-10 18:43:57,233 - __main__ - INFO - Moved chat_history\20250403-004332.json to chat_history/chat_history\20250403-004332.json
|
| 14 |
+
2025-04-10 18:44:01,825 - __main__ - INFO - Moved chat_history\20250403-005218.json to chat_history/chat_history\20250403-005218.json
|
| 15 |
+
2025-04-10 18:44:06,138 - __main__ - INFO - Moved chat_history\20250403-005243.json to chat_history/chat_history\20250403-005243.json
|
| 16 |
+
2025-04-10 18:44:10,245 - __main__ - INFO - Moved chat_history\20250403-005335.json to chat_history/chat_history\20250403-005335.json
|
| 17 |
+
2025-04-10 18:44:14,219 - __main__ - INFO - Moved chat_history\20250403-115212.json to chat_history/chat_history\20250403-115212.json
|
| 18 |
+
2025-04-10 18:44:19,303 - __main__ - INFO - Moved chat_history\20250403-154557.json to chat_history/chat_history\20250403-154557.json
|
| 19 |
+
2025-04-10 18:44:22,916 - __main__ - INFO - Moved chat_history\20250403-170727.json to chat_history/chat_history\20250403-170727.json
|
| 20 |
+
2025-04-10 18:44:26,650 - __main__ - INFO - Moved chat_history\20250403-171345.json to chat_history/chat_history\20250403-171345.json
|
| 21 |
+
2025-04-10 18:44:31,024 - __main__ - INFO - Moved chat_history\20250403-180146.json to chat_history/chat_history\20250403-180146.json
|
| 22 |
+
2025-04-10 18:44:34,740 - __main__ - INFO - Moved chat_history\20250403-181618.json to chat_history/chat_history\20250403-181618.json
|
| 23 |
+
2025-04-10 18:44:38,314 - __main__ - INFO - Moved chat_history\20250403-182229.json to chat_history/chat_history\20250403-182229.json
|
| 24 |
+
2025-04-10 18:44:41,931 - __main__ - INFO - Moved chat_history\20250404-131413.json to chat_history/chat_history\20250404-131413.json
|
| 25 |
+
2025-04-10 18:44:45,580 - __main__ - INFO - Moved chat_history\20250404-131503.json to chat_history/chat_history\20250404-131503.json
|
| 26 |
+
2025-04-10 18:44:49,412 - __main__ - INFO - Moved chat_history\20250404-134635.json to chat_history/chat_history\20250404-134635.json
|
| 27 |
+
2025-04-10 18:44:53,247 - __main__ - INFO - Moved chat_history\20250404-140648.json to chat_history/chat_history\20250404-140648.json
|
| 28 |
+
2025-04-10 18:44:56,894 - __main__ - INFO - Moved chat_history\20250404-140914.json to chat_history/chat_history\20250404-140914.json
|
| 29 |
+
2025-04-10 18:45:00,593 - __main__ - INFO - Moved chat_history\20250404-140955.json to chat_history/chat_history\20250404-140955.json
|
| 30 |
+
2025-04-10 18:45:04,591 - __main__ - INFO - Moved chat_history\20250404-175337.json to chat_history/chat_history\20250404-175337.json
|
| 31 |
+
2025-04-10 18:45:08,298 - __main__ - INFO - Moved chat_history\20250404-180019.json to chat_history/chat_history\20250404-180019.json
|
| 32 |
+
2025-04-10 18:45:11,886 - __main__ - INFO - Moved chat_history\20250404-180053.json to chat_history/chat_history\20250404-180053.json
|
| 33 |
+
2025-04-10 18:45:15,675 - __main__ - INFO - Moved chat_history\20250404-180123.json to chat_history/chat_history\20250404-180123.json
|
| 34 |
+
2025-04-10 18:45:19,630 - __main__ - INFO - Moved chat_history\20250404-180651.json to chat_history/chat_history\20250404-180651.json
|
| 35 |
+
2025-04-10 18:45:23,351 - __main__ - INFO - Moved chat_history\20250404-180725.json to chat_history/chat_history\20250404-180725.json
|
| 36 |
+
2025-04-10 18:45:27,772 - __main__ - INFO - Moved chat_history\20250404-180940.json to chat_history/chat_history\20250404-180940.json
|
| 37 |
+
2025-04-10 18:45:31,678 - __main__ - INFO - Moved chat_history\20250404-181628.json to chat_history/chat_history\20250404-181628.json
|
| 38 |
+
2025-04-10 18:45:35,959 - __main__ - INFO - Moved chat_history\20250404-181730.json to chat_history/chat_history\20250404-181730.json
|
| 39 |
+
2025-04-10 18:45:39,765 - __main__ - INFO - Moved chat_history\20250404-184103.json to chat_history/chat_history\20250404-184103.json
|
| 40 |
+
2025-04-10 18:45:43,638 - __main__ - INFO - Moved chat_history\20250404-184613.json to chat_history/chat_history\20250404-184613.json
|
| 41 |
+
2025-04-10 18:45:47,378 - __main__ - INFO - Moved chat_history\20250404-184705.json to chat_history/chat_history\20250404-184705.json
|
| 42 |
+
2025-04-10 18:45:51,172 - __main__ - INFO - Moved chat_history\20250404-185243.json to chat_history/chat_history\20250404-185243.json
|
| 43 |
+
2025-04-10 18:45:54,867 - __main__ - INFO - Moved chat_history\20250404-185810.json to chat_history/chat_history\20250404-185810.json
|
| 44 |
+
2025-04-10 18:45:58,349 - __main__ - INFO - Moved chat_history\20250404-202051.json to chat_history/chat_history\20250404-202051.json
|
| 45 |
+
2025-04-10 18:46:03,364 - __main__ - INFO - Moved chat_history\20250407-174318.json to chat_history/chat_history\20250407-174318.json
|
| 46 |
+
2025-04-10 18:46:06,933 - __main__ - INFO - Moved chat_history\20250407-175851.json to chat_history/chat_history\20250407-175851.json
|
| 47 |
+
2025-04-10 18:46:10,714 - __main__ - INFO - Moved chat_history\20250408-135920.json to chat_history/chat_history\20250408-135920.json
|
| 48 |
+
2025-04-10 18:46:14,230 - __main__ - INFO - Moved chat_history\20250408-143211.json to chat_history/chat_history\20250408-143211.json
|
| 49 |
+
2025-04-10 18:46:17,815 - __main__ - INFO - Moved chat_history\20250408-152804.json to chat_history/chat_history\20250408-152804.json
|
| 50 |
+
2025-04-10 18:46:21,415 - __main__ - INFO - Moved chat_history\20250408-161228.json to chat_history/chat_history\20250408-161228.json
|
| 51 |
+
2025-04-10 18:46:25,573 - __main__ - INFO - Moved chat_history\20250408-165140.json to chat_history/chat_history\20250408-165140.json
|
| 52 |
+
2025-04-10 18:46:29,149 - __main__ - INFO - Moved chat_history\20250408-165215.json to chat_history/chat_history\20250408-165215.json
|
| 53 |
+
2025-04-10 18:46:32,798 - __main__ - INFO - Moved chat_history\20250408-165308.json to chat_history/chat_history\20250408-165308.json
|
| 54 |
+
2025-04-10 18:46:32,798 - __main__ - INFO - Successfully moved 42 files from root to chat_history
|
| 55 |
+
2025-04-10 18:46:32,814 - __main__ - INFO - === Repair Complete ===
|
| 56 |
+
2025-04-10 18:52:09,005 - __main__ - INFO - === Starting Dataset Structure Repair ===
|
| 57 |
+
2025-04-10 18:52:09,005 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
|
| 58 |
+
2025-04-10 18:52:09,005 - __main__ - INFO - === Starting Path Fix ===
|
| 59 |
+
2025-04-10 18:52:09,415 - __main__ - INFO - Found 42 files with duplicated chat_history path
|
| 60 |
+
2025-04-10 18:52:17,036 - __main__ - INFO - Renamed chat_history/chat_history\20250403-003446.json to chat_history/20250403-003446.json
|
| 61 |
+
2025-04-10 18:52:21,492 - __main__ - INFO - Renamed chat_history/chat_history\20250403-004332.json to chat_history/20250403-004332.json
|
| 62 |
+
2025-04-10 18:52:25,225 - __main__ - INFO - Renamed chat_history/chat_history\20250403-005218.json to chat_history/20250403-005218.json
|
| 63 |
+
2025-04-10 18:52:29,282 - __main__ - INFO - Renamed chat_history/chat_history\20250403-005243.json to chat_history/20250403-005243.json
|
| 64 |
+
2025-04-10 18:52:32,882 - __main__ - INFO - Renamed chat_history/chat_history\20250403-005335.json to chat_history/20250403-005335.json
|
| 65 |
+
2025-04-10 18:52:36,438 - __main__ - INFO - Renamed chat_history/chat_history\20250403-115212.json to chat_history/20250403-115212.json
|
| 66 |
+
2025-04-10 18:52:40,280 - __main__ - INFO - Renamed chat_history/chat_history\20250403-154557.json to chat_history/20250403-154557.json
|
| 67 |
+
2025-04-10 18:52:44,213 - __main__ - INFO - Renamed chat_history/chat_history\20250403-170727.json to chat_history/20250403-170727.json
|
| 68 |
+
2025-04-10 18:52:48,115 - __main__ - INFO - Renamed chat_history/chat_history\20250403-171345.json to chat_history/20250403-171345.json
|
| 69 |
+
2025-04-10 18:52:51,976 - __main__ - INFO - Renamed chat_history/chat_history\20250403-180146.json to chat_history/20250403-180146.json
|
| 70 |
+
2025-04-10 18:52:55,930 - __main__ - INFO - Renamed chat_history/chat_history\20250403-181618.json to chat_history/20250403-181618.json
|
| 71 |
+
2025-04-10 18:52:59,738 - __main__ - INFO - Renamed chat_history/chat_history\20250403-182229.json to chat_history/20250403-182229.json
|
| 72 |
+
2025-04-10 18:53:05,572 - __main__ - INFO - Renamed chat_history/chat_history\20250404-131413.json to chat_history/20250404-131413.json
|
| 73 |
+
2025-04-10 18:53:09,406 - __main__ - INFO - Renamed chat_history/chat_history\20250404-131503.json to chat_history/20250404-131503.json
|
| 74 |
+
2025-04-10 18:53:13,212 - __main__ - INFO - Renamed chat_history/chat_history\20250404-134635.json to chat_history/20250404-134635.json
|
| 75 |
+
2025-04-10 18:53:17,506 - __main__ - INFO - Renamed chat_history/chat_history\20250404-140648.json to chat_history/20250404-140648.json
|
| 76 |
+
2025-04-10 18:53:21,872 - __main__ - INFO - Renamed chat_history/chat_history\20250404-140914.json to chat_history/20250404-140914.json
|
| 77 |
+
2025-04-10 18:53:25,516 - __main__ - INFO - Renamed chat_history/chat_history\20250404-140955.json to chat_history/20250404-140955.json
|
| 78 |
+
2025-04-10 18:53:29,177 - __main__ - INFO - Renamed chat_history/chat_history\20250404-175337.json to chat_history/20250404-175337.json
|
| 79 |
+
2025-04-10 18:53:32,998 - __main__ - INFO - Renamed chat_history/chat_history\20250404-180019.json to chat_history/20250404-180019.json
|
| 80 |
+
2025-04-10 18:53:36,716 - __main__ - INFO - Renamed chat_history/chat_history\20250404-180053.json to chat_history/20250404-180053.json
|
| 81 |
+
2025-04-10 18:53:40,483 - __main__ - INFO - Renamed chat_history/chat_history\20250404-180123.json to chat_history/20250404-180123.json
|
| 82 |
+
2025-04-10 18:53:43,269 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180651.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f791-503acbd66e4aeb5f2ba7abd9;11c9bb9a-ac35-4c84-ba83-0a1f415ee40c)
|
| 83 |
+
|
| 84 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 85 |
+
2025-04-10 18:53:46,037 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180725.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f794-490d14f53aebbbb72066ab6f;131385d7-3a06-4037-a1d0-0a8ba167c80d)
|
| 86 |
+
|
| 87 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 88 |
+
2025-04-10 18:53:49,886 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180940.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f798-2e222f1d59ac3dc41de64dfb;aecbf90a-b7b0-43ca-b4b1-fffd4d6ab70d)
|
| 89 |
+
|
| 90 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 91 |
+
2025-04-10 18:53:52,612 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181628.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f79b-09915158005dfa7c5a56a15d;3d5cecd6-e1a3-44f1-8234-8ab0a873b718)
|
| 92 |
+
|
| 93 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 94 |
+
2025-04-10 18:53:55,411 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181730.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f79d-64e62bbb44ac2ef51c8518b0;927958ad-8808-4eb2-96b3-0528085c7396)
|
| 95 |
+
|
| 96 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 97 |
+
2025-04-10 18:53:58,337 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184103.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a0-5d8a1a2c1070cf2648e0ec44;37fccdca-bcc5-40d7-85f6-f26caa2489e6)
|
| 98 |
+
|
| 99 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 100 |
+
2025-04-10 18:54:01,048 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184613.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a3-448e4bda194517cf7a8285ed;14c70e24-3545-4b6a-841b-2bbac1e9bc6b)
|
| 101 |
+
|
| 102 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 103 |
+
2025-04-10 18:54:03,937 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184705.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a6-7655014d5e3a972d08815ddc;3cd55990-22c6-499a-95f1-f662d12c51b8)
|
| 104 |
+
|
| 105 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 106 |
+
2025-04-10 18:54:06,653 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-185243.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7a9-2eced6ec760924e43262136d;633dd326-9951-49b0-9b52-57825ae88ed0)
|
| 107 |
+
|
| 108 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 109 |
+
2025-04-10 18:54:09,403 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-185810.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7ab-0d9b2d483b6902386b048933;b95da74a-3609-454d-a0c0-0a7f14b6d6de)
|
| 110 |
+
|
| 111 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 112 |
+
2025-04-10 18:54:12,136 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-202051.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7ae-52d0acc90186a3dc4b9254a6;6c44b4d6-6627-4b01-8a50-a4fa21959fdb)
|
| 113 |
+
|
| 114 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 115 |
+
2025-04-10 18:54:15,081 - __main__ - ERROR - Error processing file chat_history/chat_history\20250407-174318.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b1-4e99483725d815ad61bbaacc;6fb9057f-8d66-43ac-b603-789f4f7ba815)
|
| 116 |
+
|
| 117 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 118 |
+
2025-04-10 18:54:17,822 - __main__ - ERROR - Error processing file chat_history/chat_history\20250407-175851.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b4-2b9c83346c295d82770bd968;8e704238-9252-426b-b350-e3f52fd2cbb0)
|
| 119 |
+
|
| 120 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 121 |
+
2025-04-10 18:54:20,537 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-135920.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b6-6ea689051ea0cca350f3ca17;0fa53b91-119c-47b2-9761-93ea7d93a993)
|
| 122 |
+
|
| 123 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 124 |
+
2025-04-10 18:54:23,255 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-143211.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7b9-25cbff3e1f06fa441785271c;6c6a1190-6791-4167-9f75-a668903be2e1)
|
| 125 |
+
|
| 126 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 127 |
+
2025-04-10 18:54:27,172 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-152804.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7bd-5f8da572682024ef3aeecadd;65480ebf-2375-4cd8-95f3-bc841d8dc171)
|
| 128 |
+
|
| 129 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 130 |
+
2025-04-10 18:54:29,899 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-161228.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c0-75a715fb768bdb2a68ccf46f;95d56ef4-8de8-40c9-9297-e7a8281a13d6)
|
| 131 |
+
|
| 132 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 133 |
+
2025-04-10 18:54:32,895 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-165140.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c3-5f70e9ba565434fc7a3c5e18;275dda97-05cc-4a23-9baf-a8f43ede9d62)
|
| 134 |
+
|
| 135 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 136 |
+
2025-04-10 18:54:35,623 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-165215.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c6-133de4eb4630a67f0b375fc6;046200b9-aae9-42e9-85ed-e3a684b164d4)
|
| 137 |
+
|
| 138 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 139 |
+
2025-04-10 18:54:38,329 - __main__ - ERROR - Error processing file chat_history/chat_history\20250408-165308.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f7c8-7723083e298904f14a1af942;fc2bd99a-4fcf-4936-a15c-acbb481941e7)
|
| 140 |
+
|
| 141 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 142 |
+
2025-04-10 18:54:38,329 - __main__ - INFO - Successfully renamed 22 files
|
| 143 |
+
2025-04-10 18:54:38,329 - __main__ - WARNING - Failed to process 20 files
|
| 144 |
+
2025-04-10 18:54:38,329 - __main__ - INFO - === Repair Complete ===
|
| 145 |
+
2025-04-10 18:59:28,315 - __main__ - INFO - === Starting Dataset Structure Repair ===
|
| 146 |
+
2025-04-10 18:59:28,318 - __main__ - INFO - Dataset ID: Rulga/status-law-knowledge-base
|
| 147 |
+
2025-04-10 18:59:28,318 - __main__ - INFO - === Starting Path Fix ===
|
| 148 |
+
2025-04-10 18:59:28,704 - __main__ - INFO - Found 20 files with duplicated chat_history path
|
| 149 |
+
2025-04-10 18:59:31,306 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180651.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8ed-3fe84d2154e3e007658207e9;73ab81c7-35e8-46ae-92a1-667dac29201b)
|
| 150 |
+
|
| 151 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 152 |
+
2025-04-10 18:59:33,846 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180725.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f0-66056ea913dbdff1621a3e5c;3cc301c9-c3ce-4c3a-916f-60b221917742)
|
| 153 |
+
|
| 154 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 155 |
+
2025-04-10 18:59:36,390 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-180940.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f2-36854ccc08420cd35c1613b5;a7ca9aea-ac48-4ab6-b4bd-696ae751173a)
|
| 156 |
+
|
| 157 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 158 |
+
2025-04-10 18:59:38,943 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181628.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f5-4901cbfb535be455015ceb14;39923c77-0144-4d89-a830-8f80a2456e0b)
|
| 159 |
+
|
| 160 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 161 |
+
2025-04-10 18:59:41,452 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-181730.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8f7-1eabdc675e3f49fe21c60749;88a17e2b-011e-4440-b6c8-54da2f298b03)
|
| 162 |
+
|
| 163 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 164 |
+
2025-04-10 18:59:44,025 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184103.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8fa-737ad0103e15fcc177b32fd4;25d06267-14df-4e02-aca0-c56fea86c85b)
|
| 165 |
+
|
| 166 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 167 |
+
2025-04-10 18:59:46,563 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184613.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8fd-070c52d76345eac007a8ddaf;d5d3f452-f76a-44c8-a439-96a47bc07e77)
|
| 168 |
+
|
| 169 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 170 |
+
2025-04-10 18:59:49,073 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-184705.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f8ff-646983a842f1edcd584b57cb;3f66f58c-621e-4842-be44-72c9d1c0fdde)
|
| 171 |
+
|
| 172 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 173 |
+
2025-04-10 18:59:51,615 - __main__ - ERROR - Error processing file chat_history/chat_history\20250404-185243.json: 429 Client Error: Too Many Requests for url: https://huggingface.co/api/datasets/Rulga/status-law-knowledge-base/commit/main (Request ID: Root=1-67f7f902-3dafd5c579a831d05562a7f0;a67f1694-a090-4590-b015-cb5b6f8a91ae)
|
| 174 |
+
|
| 175 |
+
You have been rate-limited; you can retry this action in about 1 hour. If you're a new user, your limits will raise progressively over time. Get in touch with us at website@huggingface.co if you need access now.
|
| 176 |
+
2025-04-10 18:59:52,416 - __main__ - INFO -
|
| 177 |
+
Received keyboard interrupt, stopping gracefully...
|
| 178 |
+
2025-04-10 18:59:55,429 - __main__ - INFO - Process stopped by user
|
tools/repair_dataset_structure.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import time
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables from the .env file
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# Add the project root directory to sys.path so that project modules can be imported
|
| 12 |
+
root_dir = str(Path(__file__).parent.parent)
|
| 13 |
+
sys.path.append(root_dir)
|
| 14 |
+
|
| 15 |
+
from huggingface_hub import HfApi
|
| 16 |
+
from config.settings import (
|
| 17 |
+
DATASET_ID,
|
| 18 |
+
DATASET_CHAT_HISTORY_PATH,
|
| 19 |
+
HF_TOKEN
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# Logging configuration
|
| 23 |
+
logging.basicConfig(
|
| 24 |
+
level=logging.INFO,
|
| 25 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 26 |
+
handlers=[
|
| 27 |
+
logging.FileHandler("repair_dataset_structure.log", encoding='utf-8'),
|
| 28 |
+
logging.StreamHandler()
|
| 29 |
+
]
|
| 30 |
+
)
|
| 31 |
+
logger = logging.getLogger(__name__)
|
| 32 |
+
|
| 33 |
+
def repair_chat_files_structure():
    """
    Move misplaced chat files from the dataset root into ``chat_history/``.

    A file is treated as a misplaced chat file when its repo path has no
    ``/`` (it sits in the root), ends with ``.json`` and contains a ``-``
    (chat files are named after a timestamp). Each such file is downloaded,
    uploaded again as ``chat_history/<name>`` and then deleted from the root,
    with a 2-second pause before every file.

    A failure on one file is logged and counted and the loop moves on to the
    next file. The exception is an HTTP 429 (rate limit) response: it aborts
    the run, because further requests would be rejected as well until the
    limit expires.

    Returns nothing. Errors derived from ``Exception`` are not raised to the
    caller; they are only reported through the module logger.
    """
    try:
        api = HfApi(token=HF_TOKEN)

        # Fetch the list of all files in the dataset
        files = api.list_repo_files(
            repo_id=DATASET_ID,
            repo_type="dataset"
        )

        # Keep only chat files that sit in the root directory (no path part)
        misplaced_files = [
            f for f in files
            if f.endswith('.json') and
            '/' not in f and  # root-level files only
            '-' in f  # chat file names carry a timestamp, hence the '-'
        ]

        logger.info(f"Found {len(misplaced_files)} misplaced chat files")

        moved_count = 0
        error_count = 0

        for file_path in misplaced_files:
            try:
                # Honour the stop flag (a function attribute set from outside)
                if getattr(repair_chat_files_structure, 'stop_flag', False):
                    logger.info("Stopping process...")
                    break

                # Pause between files (presumably to go easy on the Hub API)
                time.sleep(2)

                # Download the file; the returned value is used below as the
                # path of a local copy
                file_content = api.hf_hub_download(
                    repo_id=DATASET_ID,
                    filename=file_path,
                    repo_type="dataset"
                )

                # Target location inside the existing chat_history directory
                new_path = f"chat_history/{file_path}"

                # Upload the copy to chat_history
                with open(file_content, 'rb') as f:
                    api.upload_file(
                        path_or_fileobj=f,
                        path_in_repo=new_path,
                        repo_id=DATASET_ID,
                        repo_type="dataset"
                    )

                # Remove the original from the root only after the upload
                # succeeded, so a failure never loses the file
                api.delete_file(
                    path_in_repo=file_path,
                    repo_id=DATASET_ID,
                    repo_type="dataset"
                )

                logger.info(f"Moved {file_path} to {new_path}")
                moved_count += 1

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {str(e)}")
                error_count += 1

                # Once the Hub answers 429 the remaining files would fail the
                # same way, so stop instead of burning through the whole list.
                response = getattr(e, 'response', None)
                if getattr(response, 'status_code', None) == 429:
                    logger.error("Rate limit reached (HTTP 429), aborting; re-run the tool later")
                    break

        logger.info(f"Successfully moved {moved_count} files from root to chat_history")
        if error_count > 0:
            logger.warning(f"Failed to process {error_count} files")

    except Exception as e:
        logger.error(f"Error accessing dataset: {str(e)}")
|
| 109 |
+
|
| 110 |
+
def fix_duplicated_paths():
    """
    Fix duplicated chat_history paths in filenames.

    Targets repo files whose path starts with ``chat_history/``, ends with
    ``.json`` and still contains a literal ``chat_history\\`` (a Windows-style
    path that ended up inside the file name). Each such file is downloaded,
    uploaded again as ``chat_history/<bare file name>`` and the badly named
    original is deleted, with a 2-second pause before every file.

    A failure on one file is logged and counted and the loop moves on to the
    next file. The exception is an HTTP 429 (rate limit) response: it aborts
    the run, because further requests would be rejected as well until the
    limit expires.

    Returns nothing. Errors derived from ``Exception`` are not raised to the
    caller; they are only reported through the module logger.
    """
    try:
        api = HfApi(token=HF_TOKEN)

        # Select only chat_history files whose name embeds the duplicated path
        wrong_paths = [
            f for f in api.list_repo_files(
                repo_id=DATASET_ID,
                repo_type="dataset"
            )
            if f.startswith('chat_history/') and
            f.endswith('.json') and
            'chat_history\\' in f  # Windows-style path left inside the name
        ]

        logger.info(f"Found {len(wrong_paths)} files with duplicated chat_history path")

        fixed_count = 0
        error_count = 0

        for file_path in wrong_paths:
            try:
                # Honour the stop flag (a function attribute set from outside)
                if getattr(fix_duplicated_paths, 'stop_flag', False):
                    logger.info("Stopping process...")
                    break

                # Pause between files (presumably to go easy on the Hub API)
                time.sleep(2)

                # Download the file; the returned value is used below as the
                # path of a local copy
                file_content = api.hf_hub_download(
                    repo_id=DATASET_ID,
                    filename=file_path,
                    repo_type="dataset"
                )

                # Build the correct path. Where '\\' is not a path separator
                # basename() keeps the 'chat_history\\' prefix, so it is
                # stripped explicitly; where it is a separator the replace()
                # is simply a no-op.
                filename = os.path.basename(file_path).replace('chat_history\\', '')
                new_path = f"chat_history/{filename}"

                # Upload the copy under the corrected path
                with open(file_content, 'rb') as f:
                    api.upload_file(
                        path_or_fileobj=f,
                        path_in_repo=new_path,
                        repo_id=DATASET_ID,
                        repo_type="dataset"
                    )

                # Remove the badly named original only after the upload
                # succeeded, so a failure never loses the file
                api.delete_file(
                    path_in_repo=file_path,
                    repo_id=DATASET_ID,
                    repo_type="dataset"
                )

                logger.info(f"Renamed {file_path} to {new_path}")
                fixed_count += 1

            except Exception as e:
                logger.error(f"Error processing file {file_path}: {str(e)}")
                error_count += 1

                # Once the Hub answers 429 the remaining files would fail the
                # same way, so stop instead of burning through the whole list.
                response = getattr(e, 'response', None)
                if getattr(response, 'status_code', None) == 429:
                    logger.error("Rate limit reached (HTTP 429), aborting; re-run the tool later")
                    break

        logger.info(f"Successfully renamed {fixed_count} files")
        if error_count > 0:
            logger.warning(f"Failed to process {error_count} files")

    except Exception as e:
        logger.error(f"Error accessing dataset: {str(e)}")
|
| 184 |
+
|
| 185 |
+
if __name__ == "__main__":
    # Script entry point: runs the repair steps and reports the outcome
    # through the module logger.
    try:
        logger.info("=== Starting Dataset Structure Repair ===")
        logger.info(f"Dataset ID: {DATASET_ID}")

        # Step 1: move chat files out of the dataset root (currently disabled)
        #repair_chat_files_structure()

        # Step 2: fix the duplicated paths
        logger.info("=== Starting Path Fix ===")
        fix_duplicated_paths()

        logger.info("=== Repair Complete ===")
    except KeyboardInterrupt:
        logger.info("\nReceived keyboard interrupt, stopping gracefully...")
        # The repair functions only catch Exception, so by the time this
        # handler runs the interrupted function has already been unwound.
        # The flags are still set for anyone calling the functions again,
        # but there is nothing left to wait for, hence no sleep here.
        repair_chat_files_structure.stop_flag = True
        fix_duplicated_paths.stop_flag = True
        logger.info("Process stopped by user")
    except Exception as e:
        logger.error(f"Unexpected error: {str(e)}")
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|