1f committed on
Commit
2a79d4e
·
verified ·
1 Parent(s): f65aa03

Add files using upload-large-folder tool

Browse files
r1-a/dataset/filter/gpt_filter_shp2.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import http.client
3
+ import json
4
+ import time
5
+ import random
6
+ import re # Import regex for parsing
7
+ import pandas as pd # For data distribution analysis
8
+ # Make sure necessary types are imported
9
+ from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Sequence
10
+ from tqdm.auto import tqdm
11
+ import sys
12
+ import logging
13
+ import concurrent.futures
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ import shutil
16
+ import socket # Added for potential error catching, though http.client might cover it
17
+
18
# --- Configuration ---
# --- !! MODIFIED: Point to the pre-filtered dataset !! ---
INPUT_DATA_PATH = "./shp2_filtered_tts_high_quality_train_only" # Path from the previous script's output
# --- Keep other configurations ---
API_HOST = "api2.aigcbest.top"
API_PATH = "/v1/chat/completions"
LLM_MODEL = "gemini-2.5-flash-preview-04-17-nothinking"
# SECURITY NOTE(review): a real-looking API key is hard-coded as the fallback below.
# It should be treated as leaked — rotate it and rely on the env variable only.
API_KEY = os.environ.get('AIGCBEST_API_KEY', "sk-U15cDXxI0bboL6iH4Hymzl30ws6oWzazWe1Ndwq9QtiPUEgI") # Replace or set env variable
if not API_KEY or API_KEY == "YOUR_API_KEY_HERE":
    print("API Key is not set correctly. Please set the AIGCBEST_API_KEY environment variable or replace the placeholder.")
    sys.exit(1)

# --- !! MODIFIED: Update output directory name !! ---
OUTPUT_DIR = f"./shp2_filtered_evaluated" # Reflects evaluation applied to filtered data
# Path to the existing, potentially incomplete, processed dataset (LOAD ONLY) - specific to this script's run
PROCESSED_DATA_PATH = os.path.join(OUTPUT_DIR, f"train_split_evaluated_intermediate") # Use descriptive name
# Path where final results will be saved (SAVE ONLY) - specific to this script's run
FINAL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"train_split_evaluated_final")
# Path for the filtered dataset (based on LLM scores)
FILTERED_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"train_split_llm_filtered")

MAX_WORKERS = 40               # concurrent evaluation threads
REQUEST_DELAY_SECONDS = 0.1    # base jittered delay before each API request
MAX_RETRIES = 4                # attempts per API call before giving up
SAVE_INTERVAL = 1000           # persist intermediate progress every N completed items

# --- Filtering Thresholds (LLM scores) ---
MIN_QUALITY_SCORE = 4
MIN_SUITABILITY_SCORE = 3
# Optional: MAX_COMPLEXITY_SCORE = 4

# Setup logging; quiet down chatty third-party loggers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
logging.getLogger("filelock").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
55
+
56
# --- LLM API Function (evaluate_prompt_with_llm) ---
# (Keep the same SYSTEM_PROMPT and function definition as before)
# System prompt instructing the LLM to emit a single "Quality/Complexity/
# Suitability/Justification" line; parse_llm_evaluation() depends on this format.
SYSTEM_PROMPT = """
You are an AI Quality Assessor specializing in evaluating prompts for AI models, particularly voice-based assistants.
Your task is to analyze the given user prompt and assign scores based on three metrics: Overall Quality, Complexity, and Voice Response Suitability. You must also provide a brief justification.

**Input:** You will receive a single user prompt.



**Metric Definitions:**

1. **Overall Quality (Score 1-5):** Clarity, coherence, and completeness of the prompt itself.
* 1 (Very Low): Nonsensical, ambiguous, ungrammatical, incomplete template/placeholder text.
* 2 (Low): Vague, poorly worded, significant errors, requires excessive interpretation.
* 3 (Medium): Understandable but could be clearer/more specific. Basic, functional.
* 4 (High): Clear, well-phrased, specific, unambiguous, effective.
* 5 (Very High): Exceptionally clear, concise, specific, well-formulated, ideal.

2. **Complexity (Score 1-5):** Cognitive load/intricacy needed to understand the request and generate the *answer*.
* 1 (Very Simple): Single simple fact, definition, common phrase.
* 2 (Simple): Basic info recall, single calculation, short standard text generation.
* 3 (Moderate): Multi-step reasoning, combining info, comparison, moderately complex text/explanation.
* 4 (Complex): Deep analysis, synthesis, advanced reasoning, creative problem-solving, detailed nuanced text.
* 5 (Very Complex): Highly specialized knowledge, intricate multi-stage problems, long-form creative content, detailed technical procedures.

3. **Voice Response Suitability (Score 1-5):** Is the *expected answer's content* suitable for delivery via voice ONLY? And whether it can be responded to by llm and whether it is suitable to be converted into speech as a sample.
* 1 (Very Unsuitable): Answer requires visuals (graphs, tables, code formatting), UI interaction, or is excessively long/structured (e.g., long lists, large code blocks).
* 2 (Unsuitable): Answer likely very long, complex formatting, significantly easier to parse visually. Poor audio UX.
* 3 (Moderate): Answer might be slightly long or have simple structure (e.g., short lists), but generally digestible via audio. Upper limit for comfort.
* 4 (Suitable): Answer reasonably concise, informational/conversational, easy to understand when spoken.
* 5 (Highly Suitable): Ideal for voice - short facts, direct answers, conversational responses, short creative outputs.

4. **Justification (Brief Text):** 1-2 sentences explaining the scores, especially for low (<3) or unusual scores.
**Output Format:** Respond ONLY with a single string in the following format, replacing bracketed values with your scores and justification. Do NOT include any other text, greetings, or explanations outside this format.

Quality: [1-5], Complexity: [1-5], Suitability: [1-5], Justification: [Your brief justification text here]
**Example Input Prompt:**
"Explain the process of photosynthesis in detail, including the chemical equations and the differences between C3 and C4 pathways."

**Example Output String:**
Quality: 4, Complexity: 4, Suitability: 3, Justification: Clear prompt asking for detailed scientific explanation. Complex topic, potentially long answer making voice suitability moderate.
"""
99
+
100
def evaluate_prompt_with_llm(prompt_text, api_key, host, path, model, retries=MAX_RETRIES):
    """Calls the LLM API to get evaluation scores for a prompt.

    Sends `prompt_text` (with SYSTEM_PROMPT as the system message) to the
    chat-completions endpoint and returns the raw response string — expected
    to look like "Quality: .., Complexity: .., Suitability: .., Justification: .."
    — or None on unrecoverable failure.

    Retry policy: HTTP 429 (honouring Retry-After), 5xx and network errors are
    retried with exponential backoff up to `retries` attempts; other 4xx client
    errors and undecodable responses fail immediately.
    """
    # Guard: cannot evaluate a missing/blank prompt.
    if not prompt_text or not isinstance(prompt_text, str) or not prompt_text.strip():
        logging.warning("evaluate_prompt_with_llm received empty or invalid prompt text.")
        return None  # Cannot evaluate an empty prompt

    payload = json.dumps({
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt_text}
        ],
        "temperature": 0.1,
        "max_tokens": 100
    })
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'User-Agent': 'HuggingFace SHP2-Filtered Evaluation Script', # Updated User-Agent
        'Content-Type': 'application/json'
    }
    # Jittered pre-request delay to spread load across the worker threads.
    time.sleep(random.uniform(REQUEST_DELAY_SECONDS * 0.8, REQUEST_DELAY_SECONDS * 1.2))

    for attempt in range(retries):
        data = None  # defined up-front so the JSONDecodeError handler can always reference it
        try:
            conn = http.client.HTTPSConnection(host, timeout=60)
            # BUGFIX: close the connection in a finally block — previously an
            # exception raised by request()/getresponse()/read() leaked the
            # socket on every failed attempt.
            try:
                conn.request("POST", path, payload, headers)
                res = conn.getresponse()
                status = res.status
                data = res.read()
            finally:
                conn.close()

            if status == 200:
                response_json = json.loads(data.decode("utf-8"))
                if response_json.get("choices") and len(response_json["choices"]) > 0:
                    message = response_json["choices"][0].get("message")
                    if message and message.get("content"):
                        raw_response = message["content"].strip()
                        if raw_response.startswith("Quality:") and "Complexity:" in raw_response and "Suitability:" in raw_response:
                            return raw_response
                        else:
                            logging.warning(f"LLM response format unexpected for prompt '{prompt_text[:50]}...': {raw_response}")
                            return raw_response # Return potentially malformed for parsing attempt
                # 200 but no usable choices/content: log and fall through to retry.
                logging.error(f"Unexpected API response structure: {data.decode('utf-8')}")
            elif status == 429:
                # Rate limited: prefer the server-provided Retry-After header.
                # (getheader is safe post-close; headers are parsed by getresponse.)
                retry_after_header = res.getheader('Retry-After', str(int(REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5))))
                try: wait_time = int(retry_after_header)
                except ValueError: wait_time = REQUEST_DELAY_SECONDS * (2 ** attempt) + random.uniform(1, 5)
                logging.warning(f"Rate limit exceeded (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            elif status >= 500:
                # Transient server error: back off and retry.
                wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5)
                logging.warning(f"Server error (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                # Other 4xx: the request itself is bad — do not retry.
                logging.error(f"API Client Error: Status {status}, Response: {data.decode('utf-8')} for prompt: {prompt_text[:60]}")
                return None

        except (http.client.HTTPException, ConnectionError, socket.gaierror, TimeoutError, socket.timeout) as e: # Added socket errors
            logging.error(f"Network/HTTP error during API call: {e}. Attempt {attempt + 1}/{retries}")
            if attempt + 1 == retries: return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)
        except json.JSONDecodeError as e:
            # Response body was not valid JSON — not worth retrying.
            logging.error(f"Failed to decode API response: {e}. Response snippet: {data[:200] if data else 'N/A'}")
            return None
        except Exception as e:
            logging.error(f"An unexpected error occurred during API call: {e}", exc_info=True)
            if attempt + 1 == retries: return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)

    logging.error(f"API call failed after {retries} retries for prompt: {prompt_text[:60]}...")
    return None
177
+
178
+
179
# --- Parse the structured evaluation string returned by the LLM ---
def parse_llm_evaluation(response_string):
    """Parse "Quality: q, Complexity: c, Suitability: s, Justification: ..." text.

    Returns a 5-tuple (quality, complexity, suitability, justification, status):
    the three scores are ints in 1-5 (None on failure), justification is a
    string, and status is a short machine-readable outcome tag.
    """
    if not response_string:
        return None, None, None, None, "error_empty_response"

    # Strict path: the exact format requested from the model.
    pattern = (
        r"Quality:\s*([1-5])\s*,\s*Complexity:\s*([1-5])\s*,"
        r"\s*Suitability:\s*([1-5])\s*,\s*Justification:\s*(.*)"
    )
    match = re.match(pattern, response_string.strip(), re.IGNORECASE | re.DOTALL)
    if match:
        try:
            quality, complexity, suitability = (int(match.group(i)) for i in (1, 2, 3))
            justification = match.group(4).strip() if match.group(4) else ""
            return quality, complexity, suitability, justification, "success"
        except (ValueError, IndexError):
            logging.warning(f"Parsing failed for matched string (invalid numbers?): {response_string}")
            return None, None, None, None, "error_parsing_matched"

    logging.warning(f"Regex did not match LLM response format: {response_string}")
    # Lenient fallback: scan comma-separated "key: value" fragments.
    scores = {}
    justification = ""
    try:
        for fragment in response_string.split(','):
            fragment = fragment.strip()
            if ':' not in fragment:
                continue
            key, _, raw_val = fragment.partition(':')
            key = key.strip().lower()
            raw_val = raw_val.strip()
            if key == 'justification':
                justification = raw_val
            elif key in ('quality', 'complexity', 'suitability'):
                if raw_val.isdigit() and 1 <= int(raw_val) <= 5:
                    scores[key] = int(raw_val)
        if len(scores) == 3:  # all three score keys were found
            logging.info(f"Fallback parsing successful for: {response_string[:50]}...")
            return scores['quality'], scores['complexity'], scores['suitability'], justification, "success_fallback_parse"
    except Exception as e:
        logging.warning(f"Fallback parsing also failed: {e}")
    return None, None, None, None, "error_parsing_no_match"
223
+
224
+
225
# --- Per-row evaluation worker (targets the 'query' column) ---
def evaluate_dataset_entry(example):
    """Evaluate one dataset row's 'query' text via the LLM.

    Returns a copy of `example` with llm_quality / llm_complexity /
    llm_suitability / llm_justification / llm_evaluation_status filled in.
    """
    result = example.copy()
    # Carry over any prior evaluation fields; default the rest.
    result['llm_quality'] = example.get('llm_quality', None)
    result['llm_complexity'] = example.get('llm_complexity', None)
    result['llm_suitability'] = example.get('llm_suitability', None)
    result['llm_justification'] = example.get('llm_justification', '')
    result['llm_evaluation_status'] = 'processing_retry'

    query_text = example.get("query")
    # Skip rows whose query text is missing, non-string, or blank.
    if not (query_text and isinstance(query_text, str) and query_text.strip()):
        result['llm_evaluation_status'] = 'skipped_invalid_query'
        return result

    raw_response = evaluate_prompt_with_llm(query_text, API_KEY, API_HOST, API_PATH, LLM_MODEL)
    if not raw_response:
        result['llm_evaluation_status'] = 'failed_llm_call'
        return result

    q, c, s, j, parse_status = parse_llm_evaluation(raw_response)
    if parse_status.startswith("success"):
        result["llm_quality"] = q
        result["llm_complexity"] = c
        result["llm_suitability"] = s
        result["llm_justification"] = j
        result['llm_evaluation_status'] = 'success'
    else:
        # Keep the raw text so a later pass can attempt re-parsing.
        result['llm_evaluation_status'] = parse_status
        result['llm_justification'] = f"RAW_RESPONSE: {raw_response}"
    return result
262
+
263
# --- Function to Save Dataset Atomically ---
# (Keep the same save_dataset_atomically function as before)
def save_dataset_atomically(data_list, output_path, features):
    """Saves the list of data dictionaries atomically using the correct schema.

    Writes to `output_path + "_saving"` first, then renames over the final
    path, so a crash mid-save never leaves a half-written dataset behind.
    On failure, falls back to dumping a JSON-Lines file next to the target.
    Returns True on success, False otherwise.
    """
    if not data_list:
        logging.info("No data provided for saving.")
        return False
    temp_output_path = output_path + "_saving"
    final_output_path = output_path
    logging.info(f"Attempting to save {len(data_list)} examples to temp path {temp_output_path}...")
    try:
        processed_data_list = []
        # Handle potential None for integer columns before creating Dataset
        for item in data_list:
            item_copy = item.copy() # Work on a copy
            # Replace None with a placeholder like -1 if the Feature type is integer
            # Or ensure the Feature type allows None (e.g., use Value('float32') or check default behavior)
            # For now, assume Value('int32') might require a number, using -1 as placeholder for None
            for key in ['llm_quality', 'llm_complexity', 'llm_suitability']:
                if item_copy.get(key) is None and isinstance(features[key], Value) and features[key].dtype == 'int32':
                    # logging.debug(f"Replacing None with -1 for int32 field '{key}' in item: {item_copy.get('query', '')[:30]}...")
                    item_copy[key] = -1 # Or other suitable placeholder
            processed_data_list.append(item_copy)

        # Create dataset from the list of dictionaries using the defined features
        processed_dataset = Dataset.from_list(processed_data_list, features=features)
        os.makedirs(os.path.dirname(final_output_path), exist_ok=True)
        if os.path.exists(temp_output_path):
            # A leftover temp dir from a previous crashed save — discard it.
            logging.warning(f"Removing existing temporary save directory: {temp_output_path}")
            shutil.rmtree(temp_output_path)
        processed_dataset.save_to_disk(temp_output_path)
        logging.info(f"Successfully saved dataset to temporary path: {temp_output_path}")
        if os.path.exists(final_output_path):
            logging.debug(f"Removing existing final destination directory before rename: {final_output_path}")
            shutil.rmtree(final_output_path)
        # NOTE(review): there is a brief window between rmtree and rename where
        # the final path does not exist; acceptable for a single-writer script.
        os.rename(temp_output_path, final_output_path)
        logging.info(f"Successfully moved temporary save to final path: {final_output_path}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {final_output_path}: {e}", exc_info=True)
        if os.path.exists(temp_output_path):
            try:
                shutil.rmtree(temp_output_path)
                logging.info(f"Cleaned up temporary directory {temp_output_path} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {temp_output_path} after error: {cleanup_e}")
        # Fallback JSON Lines save
        fallback_json_path = final_output_path + ".jsonl.failed_save"
        logging.warning(f"Attempting fallback save to JSON Lines file: {fallback_json_path}")
        try:
            with open(fallback_json_path, 'w', encoding='utf-8') as f:
                for item in data_list: # Use original list for fallback
                    f.write(json.dumps(dict(item), ensure_ascii=False, default=str) + '\n')
            logging.info(f"Successfully saved fallback JSON Lines file.")
        except Exception as json_e:
            logging.error(f"Fallback JSON save also failed: {json_e}", exc_info=True)
        return False
320
+
321
+
322
# --- Predicate: does this row still need an evaluation attempt? ---
def needs_retry(example):
    """Return True when the example has not been successfully evaluated or skipped."""
    status = example.get('llm_evaluation_status')
    if status == 'success':
        return False
    # Skipped rows (e.g. invalid query text) are final — never retried.
    # str() guards against a non-string / missing status value.
    return not str(status).startswith('skipped_')
329
+
330
+
331
# --- Schema for the pre-filtered SHP-2 data plus LLM evaluation columns ---
def get_filtered_shp2_features_with_evaluation():
    """Return the datasets.Features schema used throughout this script."""
    logging.info(f"Defining features for pre-filtered SHP-2 data + LLM evaluation.")

    schema = {
        # Columns produced by the upstream filtering script.
        # Value(..., id=None) keeps compatibility if an 'id' attribute exists.
        'query': Value(dtype='string', id=None),
        'chosen': Value(dtype='string', id=None),
        'reject': Value(dtype='string', id=None),
        'domain': Value(dtype='string', id=None),
        # LLM evaluation columns. Scores are int32; the save routine replaces
        # None with -1 to satisfy the integer dtype.
        'llm_quality': Value('int32'),
        'llm_complexity': Value('int32'),
        'llm_suitability': Value('int32'),
        'llm_justification': Value('string'),
        'llm_evaluation_status': Value('string'),
    }
    augmented_features = Features(schema)
    logging.info(f"Defined features: {augmented_features}")
    return augmented_features
358
+
359
# --- Main Execution ---
if __name__ == "__main__":
    start_time = time.time()
    logging.info("======================================================")
    logging.info(f" Starting Filtered SHP-2 Dataset Evaluation - {LLM_MODEL}")
    logging.info(f" Input Data Path: {INPUT_DATA_PATH}") # Log input path
    logging.info(f" Output Dir: {OUTPUT_DIR}")
    logging.info(f" Intermediate Save Path: {PROCESSED_DATA_PATH}")
    logging.info(f" Final Annotated Path: {FINAL_OUTPUT_PATH}")
    logging.info(f" LLM-Filtered Output Path: {FILTERED_OUTPUT_PATH}")
    logging.info("======================================================")

    # --- Define Features ---
    dataset_features = get_filtered_shp2_features_with_evaluation()

    # --- Load or Initialize Dataset ---
    # Phase 1: resume from an intermediate save if one exists, otherwise
    # initialize every row from the pre-filtered input with 'pending' status.
    results_list = []
    # Check for intermediate save file from *this* script first
    if os.path.exists(PROCESSED_DATA_PATH):
        logging.info(f"Loading existing intermediate dataset from {PROCESSED_DATA_PATH}...")
        try:
            existing_dataset = Dataset.load_from_disk(PROCESSED_DATA_PATH)
            # Optional: verify features match
            if existing_dataset.features.keys() != dataset_features.keys():
                logging.warning(f"Loaded intermediate dataset features mismatch expected. Trying to continue...")
            results_list = existing_dataset.to_list()
            total_examples = len(results_list)
            logging.info(f"Loaded {total_examples} examples from intermediate save.")
        except Exception as e:
            logging.error(f"Failed to load intermediate dataset from {PROCESSED_DATA_PATH}: {e}", exc_info=True)
            logging.warning("Will attempt to load fresh dataset from input path.")
            results_list = []

    if not results_list:
        # --- MODIFIED: Load from the local pre-filtered dataset path ---
        logging.info(f"Loading pre-filtered dataset from: {INPUT_DATA_PATH}")
        if not os.path.exists(INPUT_DATA_PATH):
            logging.error(f"Input dataset not found at '{INPUT_DATA_PATH}'. Please run the initial filtering script first.")
            sys.exit(1)
        try:
            # Load the dataset generated by the previous script
            original_filtered_dataset = Dataset.load_from_disk(INPUT_DATA_PATH)
            total_examples = len(original_filtered_dataset)
            logging.info(f"Loaded {total_examples} original examples from {INPUT_DATA_PATH}.")

            # Initialize results list with original data + placeholder fields
            results_list = []
            for example in tqdm(original_filtered_dataset, desc="Initializing data"):
                init_example = dict(example) # Make a copy
                # Ensure all base features are present, handle potential missing ones if needed
                init_example['query'] = init_example.get('query', '') # Ensure defaults if schema uncertain
                init_example['chosen'] = init_example.get('chosen', '')
                init_example['reject'] = init_example.get('reject', '')
                init_example['domain'] = init_example.get('domain', '')
                # Add evaluation placeholders
                init_example['llm_quality'] = None
                init_example['llm_complexity'] = None
                init_example['llm_suitability'] = None
                init_example['llm_justification'] = ''
                init_example['llm_evaluation_status'] = 'pending'
                results_list.append(init_example)

            # Perform an initial save to the intermediate path for this script run
            logging.info(f"Performing initial save of placeholder data to {PROCESSED_DATA_PATH}...")
            save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features)
        except Exception as e:
            logging.error(f"Failed to load or initialize dataset from {INPUT_DATA_PATH}: {e}", exc_info=True)
            sys.exit(1)

    # --- Identify Indices to Process/Retry ---
    # Phase 2: collect indices of rows whose status says they still need work.
    logging.info("Identifying examples needing evaluation/retry...")
    indices_to_process = [
        i for i, example in enumerate(tqdm(results_list, desc="Checking examples")) if needs_retry(example)
    ]
    num_to_process = len(indices_to_process)

    if num_to_process == 0:
        logging.info("No examples found needing evaluation/retry based on status.")
        # Ensure final data exists even if no retries needed
        if not os.path.exists(FINAL_OUTPUT_PATH):
            logging.info(f"Copying data from {PROCESSED_DATA_PATH} to final location {FINAL_OUTPUT_PATH}...")
            if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
                logging.info("Dataset copied to final location.")
            else:
                logging.error("Failed to copy dataset to final location.")
    else:
        logging.info(f"Identified {num_to_process} examples to process/retry out of {total_examples}.")
        # --- Concurrent Processing Logic (remains the same structure) ---
        # Phase 3: fan out evaluation over a thread pool, checkpointing every
        # SAVE_INTERVAL completions so an interruption loses little work.
        processed_count_total = 0
        processed_since_last_save = 0
        last_save_time = time.time()
        logging.info("Starting concurrent evaluation with periodic saving...")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(evaluate_dataset_entry, results_list[i]): i
                for i in indices_to_process
            }
            try:
                pbar = tqdm(total=num_to_process, desc="Evaluating queries", unit="query") # Updated desc
                for future in concurrent.futures.as_completed(futures):
                    original_index = futures[future]
                    try:
                        updated_example_dict = future.result()
                        results_list[original_index] = updated_example_dict
                        pbar.set_postfix({"LastStatus": updated_example_dict.get('llm_evaluation_status', 'N/A')}, refresh=True)
                    except Exception as exc:
                        # A worker raised: record the failure on the row so a
                        # later run can retry it.
                        logging.error(f'Evaluation task for index {original_index} encountered an exception: {exc}', exc_info=True)
                        error_placeholder = results_list[original_index].copy()
                        error_placeholder['llm_evaluation_status'] = f'failed_future_exception_{type(exc).__name__}'
                        results_list[original_index] = error_placeholder
                        pbar.set_postfix({"LastStatus": error_placeholder['llm_evaluation_status']}, refresh=True)
                    finally:
                        processed_count_total += 1
                        processed_since_last_save += 1
                        pbar.update(1)
                        # Periodic checkpoint of all rows (not just completed ones).
                        if processed_since_last_save >= SAVE_INTERVAL:
                            current_time = time.time()
                            time_since_last = current_time - last_save_time
                            logging.info(f"\n--- Processed {processed_since_last_save} items (Total this run: {processed_count_total}/{num_to_process}). Time since last save: {time_since_last:.1f}s. Saving progress... ---")
                            # Save intermediate progress to PROCESSED_DATA_PATH
                            if save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features):
                                logging.info(f"--- Progress successfully saved to {PROCESSED_DATA_PATH} ---")
                                processed_since_last_save = 0
                                last_save_time = current_time
                            else:
                                logging.error(f"--- FAILED TO SAVE PROGRESS to {PROCESSED_DATA_PATH}! Check errors. Will retry later. ---")
            except KeyboardInterrupt:
                logging.warning("\nCtrl+C detected! Attempting final save...")
            except Exception as e:
                logging.error(f"An unexpected error occurred during the main processing loop: {e}", exc_info=True)
                logging.error("Attempting final save...")
            finally:
                if 'pbar' in locals() and pbar is not None:
                    pbar.close()
                logging.info("--- Processing loop finished or interrupted. ---")
        # --- Final Save Attempt (to FINAL_OUTPUT_PATH) ---
        logging.info(f"Attempting final save of the fully annotated dataset ({len(results_list)} items) to: {FINAL_OUTPUT_PATH}")
        if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
            logging.info("--- Final annotated dataset state saved successfully. ---")
        else:
            logging.error(f">>> FINAL ANNOTATED SAVE FAILED to {FINAL_OUTPUT_PATH}! <<< Check logs. Fallback JSON/Intermediate data might exist.")

    # --- Post-Processing: Verification, Analysis, Filtering ---
    # Phase 4: reload the final annotated dataset, report score distributions,
    # and write out the subset passing the LLM score thresholds.
    logging.info("======================================================")
    logging.info("Post-Processing: Verification, Analysis, and LLM Filtering")
    logging.info("======================================================")

    # --- Verification of Final Annotated Data ---
    logging.info(f"Verifying and Analyzing final annotated dataset: {FINAL_OUTPUT_PATH}")
    if not os.path.exists(FINAL_OUTPUT_PATH):
        logging.error(f"Final annotated dataset not found at {FINAL_OUTPUT_PATH}. Cannot perform analysis or filtering.")
    else:
        try:
            final_annotated_dataset = Dataset.load_from_disk(FINAL_OUTPUT_PATH)
            num_final_examples = len(final_annotated_dataset)
            logging.info(f"Successfully reloaded final annotated dataset with {num_final_examples} examples.")

            # --- Calculate Score Distributions ---
            logging.info("Calculating score distributions...")
            try:
                df = final_annotated_dataset.to_pandas()
                # Handle the placeholder -1 we might have used for None in integer columns
                df['llm_quality'].replace(-1, pd.NA, inplace=True)
                df['llm_complexity'].replace(-1, pd.NA, inplace=True)
                df['llm_suitability'].replace(-1, pd.NA, inplace=True)

                quality_dist = df['llm_quality'].value_counts(dropna=False).sort_index() # Include NA count
                complexity_dist = df['llm_complexity'].value_counts(dropna=False).sort_index()
                suitability_dist = df['llm_suitability'].value_counts(dropna=False).sort_index()
                status_dist = df['llm_evaluation_status'].value_counts()

                print("\n--- Score Distributions (Annotated Dataset) ---")
                print("\nOverall Quality Distribution (NA indicates missing/placeholder):")
                print(quality_dist)
                print("\nComplexity Distribution (NA indicates missing/placeholder):")
                print(complexity_dist)
                print("\nVoice Response Suitability Distribution (NA indicates missing/placeholder):")
                print(suitability_dist)
                print("\nEvaluation Status Distribution:")
                print(status_dist)
                print("--------------------------------------------------")

            except ImportError:
                logging.warning("Pandas not found. Performing basic counts (may not show None correctly).")
                # Basic counting (less informative about None/-1)
                quality_counts, complexity_counts, suitability_counts, status_counts = {}, {}, {}, {}
                for ex in final_annotated_dataset:
                    q = ex.get('llm_quality', -99) # Use distinct value for missing
                    c = ex.get('llm_complexity', -99)
                    s = ex.get('llm_suitability', -99)
                    st = ex.get('llm_evaluation_status', 'unknown')
                    quality_counts[q] = quality_counts.get(q, 0) + 1
                    complexity_counts[c] = complexity_counts.get(c, 0) + 1
                    suitability_counts[s] = suitability_counts.get(s, 0) + 1
                    status_counts[st] = status_counts.get(st, 0) + 1
                print("\n--- Score Distributions (Annotated Dataset - Basic) ---")
                print(f"Quality (-99=missing): {sorted(quality_counts.items())}")
                print(f"Complexity (-99=missing): {sorted(complexity_counts.items())}")
                print(f"Suitability (-99=missing): {sorted(suitability_counts.items())}")
                print(f"Status: {sorted(status_counts.items())}")
                print("--------------------------------------------------")

            # --- Filtering based on LLM scores ---
            logging.info(f"Filtering annotated dataset: Quality >= {MIN_QUALITY_SCORE}, Suitability >= {MIN_SUITABILITY_SCORE}")

            def filter_criteria(example):
                # Keep only rows whose LLM scores meet both thresholds.
                q = example.get('llm_quality')
                s = example.get('llm_suitability')
                # Handle potential None or placeholder (-1) scores before comparing
                if q is None or q == -1 or s is None or s == -1:
                    return False
                passes = q >= MIN_QUALITY_SCORE and s >= MIN_SUITABILITY_SCORE
                # Optional: Add complexity filter
                # c = example.get('llm_complexity')
                # if c is not None and c != -1 and MAX_COMPLEXITY_SCORE is not None:
                #     passes = passes and c <= MAX_COMPLEXITY_SCORE
                return passes

            # Use num_proc=1 if filtering is fast enough or to avoid potential issues
            filtered_llm_dataset = final_annotated_dataset.filter(filter_criteria, num_proc=max(1, os.cpu_count() // 2))
            num_filtered = len(filtered_llm_dataset)
            logging.info(f"LLM-Filtered dataset size: {num_filtered} examples ({num_filtered / num_final_examples:.2%} of annotated)")

            # --- Save LLM-Filtered Dataset ---
            logging.info(f"Saving LLM-filtered dataset to: {FILTERED_OUTPUT_PATH}")
            try:
                os.makedirs(os.path.dirname(FILTERED_OUTPUT_PATH), exist_ok=True)
                if os.path.exists(FILTERED_OUTPUT_PATH):
                    logging.debug(f"Removing existing LLM-filtered directory: {FILTERED_OUTPUT_PATH}")
                    shutil.rmtree(FILTERED_OUTPUT_PATH)
                filtered_llm_dataset.save_to_disk(FILTERED_OUTPUT_PATH)
                logging.info("LLM-Filtered dataset saved successfully.")
            except Exception as e:
                logging.error(f"Failed to save LLM-filtered dataset to {FILTERED_OUTPUT_PATH}: {e}", exc_info=True)

        except Exception as e:
            logging.error(f"Verification/Analysis/Filtering failed on final annotated dataset: {e}", exc_info=True)

    # --- Script End ---
    end_time = time.time()
    logging.info("------------------------------------------------------")
    logging.info(f"Script finished in {end_time - start_time:.2f} seconds.")
    logging.info(f"Final annotated dataset saved at: {FINAL_OUTPUT_PATH}")
    logging.info(f"LLM-Filtered dataset saved at: {FILTERED_OUTPUT_PATH}")
    logging.info("======================================================")
r1-a/dataset/filter/gsm8k.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from datasets import load_from_disk, Dataset
4
+
5
+ # --- 配置参数 ---
6
+ INPUT_BASE = '/root/autodl-tmp/audio-r1/r1-a/dataset/gsm8k_with_audio'
7
+ OUTPUT_BASE = './gsm8k_final_filtered'
8
+
9
+ os.makedirs(OUTPUT_BASE, exist_ok=True)
10
+
11
+ # --- 过滤函数(同之前) ---
12
def is_suitable_for_tts_question(q: str, min_words: int = 5, max_words: int = 100,
                                 max_commas: int = 2) -> bool:
    """Heuristically decide whether a question text is suitable for TTS synthesis.

    A question passes when it is neither too short nor too long, contains no
    symbols that read poorly when spoken aloud, and is not overly list-like.

    Args:
        q: The question text to check.
        min_words: Minimum whitespace-separated word count (default 5).
        max_words: Maximum whitespace-separated word count (default 100).
        max_commas: Maximum number of commas allowed (default 2).

    Returns:
        True if the question is suitable for TTS, False otherwise.
    """
    words = q.split()
    # Reject questions that are too short to be meaningful or too long to speak.
    if not (min_words <= len(words) <= max_words):
        return False
    # Symbols like (), [], /, ^, <, > do not verbalize naturally.
    if re.search(r'[()\[\]/^<>]', q):
        return False
    # Many commas usually indicate an enumeration that is awkward as speech.
    if q.count(',') > max_commas:
        return False
    return True
21
+
22
# --- Process each split: filter for TTS suitability and re-shape records ---
all_samples = []  # accumulates kept records across all splits for the combined set
for split_name in os.listdir(INPUT_BASE):
    # Each split directory is expected to contain a 'final_dataset' subfolder.
    split_dir = os.path.join(INPUT_BASE, split_name, 'final_dataset')
    if not os.path.isdir(split_dir):
        continue
    print(f"→ Loading split '{split_name}'")
    ds = load_from_disk(split_dir)

    filtered = []
    for ex in ds:
        q = ex.get('question_text', '')
        wav = ex.get('audio_filepath', '')
        # Skip examples with no audio path or a missing audio file on disk.
        if not wav or not os.path.exists(wav):
            continue
        # Skip questions unsuitable for TTS (too short/long, odd symbols, listy).
        if not is_suitable_for_tts_question(q):
            continue
        # Normalize into the downstream record schema.
        rec = {
            'query': q,
            'answer': ex.get('answer', ''),
            'source_dataset': "gsm8k",
            'audio': wav,
            'question_type': 'Math',
            'difficulty': ''
        }
        filtered.append(rec)
        all_samples.append(rec)

    print(f" Kept {len(filtered)}/{len(ds)} examples in '{split_name}'")
    # Save this split's filtered subset.
    out_dir = os.path.join(OUTPUT_BASE, split_name)
    os.makedirs(out_dir, exist_ok=True)
    Dataset.from_list(filtered).save_to_disk(out_dir)

# --- Optional: merge all splits into one combined dataset ---
print("→ Saving combined dataset")
combined_dir = os.path.join(OUTPUT_BASE, 'combined')
os.makedirs(combined_dir, exist_ok=True)
Dataset.from_list(all_samples).save_to_disk(combined_dir)
print(f"Total kept examples: {len(all_samples)}")
r1-a/dataset/filter/shp2_final.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from datasets import load_dataset, Dataset, Features, Value
4
+ import logging
5
+ import math
6
+ import shutil
7
+ import time
8
+
9
+ # --- Configuration ---
10
+ # Path to the LLM-filtered dataset created by the previous script
11
+ # !! Make sure this matches the FILTERED_OUTPUT_PATH from the previous script !!
12
+ INPUT_LLM_FILTERED_PATH = "./shp2_filtered_evaluated/train_split_llm_filtered"
13
+
14
+ # Output directory for the final top 20% dataset
15
+ OUTPUT_DIR_FINAL_SELECTION = "./shp2_final_top20_percent"
16
+ FINAL_DATASET_PATH = os.path.join(OUTPUT_DIR_FINAL_SELECTION, "train_split_top20_percent_by_complexity")
17
+
18
+ # Percentage to select from each complexity group
19
+ TOP_PERCENTAGE = 20.0
20
+
21
+ # --- Setup Logging ---
22
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
+ logging.getLogger("datasets").setLevel(logging.WARNING)
24
+ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
25
+ logging.getLogger("filelock").setLevel(logging.WARNING)
26
+ logging.getLogger("pandas").setLevel(logging.WARNING) # Keep pandas less verbose
27
+
28
+ # --- Function to Save Dataset Atomically (Adapted for Dataset object) ---
29
def save_dataset_atomically(dataset_to_save, output_path):
    """Saves a Hugging Face Dataset object atomically.

    Writes the dataset to a sibling ``output_path + "_saving"`` directory first
    and only renames it into place after the write succeeds, so a crash mid-save
    never leaves a half-written dataset at ``output_path``.

    Args:
        dataset_to_save: ``datasets.Dataset`` to persist; skipped when falsy or empty.
        output_path: Final directory where the dataset should live.

    Returns:
        bool: True on success, False when the dataset is empty or any step fails.
    """
    if not dataset_to_save or len(dataset_to_save) == 0:
        logging.warning(f"No data provided or dataset is empty. Skipping save for {output_path}.")
        return False
    temp_output_path = output_path + "_saving"
    final_output_path = output_path
    logging.info(f"Attempting to save {len(dataset_to_save)} examples to temp path {temp_output_path}...")
    try:
        # Ensure output directory exists (parent of the final destination)
        os.makedirs(os.path.dirname(final_output_path), exist_ok=True)
        # Remove existing temp directory if it exists (stale leftover from a crashed run)
        if os.path.exists(temp_output_path):
            logging.warning(f"Removing existing temporary save directory: {temp_output_path}")
            shutil.rmtree(temp_output_path)
        # Save to temporary path
        dataset_to_save.save_to_disk(temp_output_path)
        logging.info(f"Successfully saved dataset to temporary path: {temp_output_path}")
        # Remove final destination if it exists, so the rename replaces it wholesale
        if os.path.exists(final_output_path):
            logging.debug(f"Removing existing final destination directory before rename: {final_output_path}")
            shutil.rmtree(final_output_path)
        # Move temporary to final destination (atomic on the same filesystem)
        os.rename(temp_output_path, final_output_path)
        logging.info(f"Successfully moved temporary save to final path: {final_output_path}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {final_output_path}: {e}", exc_info=True)
        # Cleanup temp directory on failure so retries start clean
        if os.path.exists(temp_output_path):
            try:
                shutil.rmtree(temp_output_path)
                logging.info(f"Cleaned up temporary directory {temp_output_path} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {temp_output_path} after error: {cleanup_e}")
        return False
65
+
66
# --- Main Execution ---
# Pipeline: load the LLM-filtered dataset -> convert to pandas -> per complexity
# level keep the top TOP_PERCENTAGE% by (quality desc, suitability desc) ->
# convert back to a Dataset with the original schema -> save atomically.
if __name__ == "__main__":
    start_time = time.time()
    logging.info("===============================================================")
    logging.info(" Starting Final Selection: Top 20% by Complexity, Quality & Suitability")
    logging.info(f" Input LLM-Filtered Dataset Path: {INPUT_LLM_FILTERED_PATH}")
    logging.info(f" Output Final Dataset Path: {FINAL_DATASET_PATH}")
    logging.info(f" Selection Percentage per Complexity Group: {TOP_PERCENTAGE}%")
    logging.info("===============================================================")

    # --- Load the LLM-Filtered Dataset ---
    if not os.path.exists(INPUT_LLM_FILTERED_PATH):
        logging.error(f"Input dataset not found at '{INPUT_LLM_FILTERED_PATH}'.")
        logging.error("Please ensure the previous script ran successfully and produced the dataset.")
        exit(1)

    try:
        logging.info(f"Loading dataset from {INPUT_LLM_FILTERED_PATH}...")
        llm_filtered_dataset = Dataset.load_from_disk(INPUT_LLM_FILTERED_PATH)
        logging.info(f"Successfully loaded dataset with {len(llm_filtered_dataset)} examples.")
        # Store features for later conversion back to Dataset (keeps schema identical)
        original_features = llm_filtered_dataset.features
        logging.info(f"Original features: {original_features}")
    except Exception as e:
        logging.error(f"Failed to load dataset from {INPUT_LLM_FILTERED_PATH}: {e}", exc_info=True)
        exit(1)

    # --- Convert to Pandas DataFrame ---
    try:
        df = llm_filtered_dataset.to_pandas()
        logging.info("Converted dataset to Pandas DataFrame.")
        # Basic check for required columns
        required_cols = ['llm_complexity', 'llm_quality', 'llm_suitability']
        if not all(col in df.columns for col in required_cols):
            logging.error(f"DataFrame is missing one or more required columns: {required_cols}")
            exit(1)
        # Handle potential placeholder values (-1) if they were used for None during saving
        for col in ['llm_quality', 'llm_complexity', 'llm_suitability']:
            if col in df.columns:
                # Replace -1 with NaN for proper handling if necessary
                # df[col] = df[col].replace(-1, pd.NA) # Use pd.NA for nullable integers
                pass  # Assuming valid scores (>=1) in the filtered dataset from previous step

        # Drop rows with missing essential scores (shouldn't happen if filtered correctly, but good practice)
        initial_count = len(df)
        df.dropna(subset=required_cols, inplace=True)
        if len(df) < initial_count:
            logging.warning(f"Dropped {initial_count - len(df)} rows with missing essential scores (quality, complexity, suitability).")

        # Ensure scores are numeric for sorting/grouping below
        df['llm_quality'] = pd.to_numeric(df['llm_quality'])
        df['llm_complexity'] = pd.to_numeric(df['llm_complexity'])
        df['llm_suitability'] = pd.to_numeric(df['llm_suitability'])

    except ImportError:
        logging.error("Pandas library is required for this script. Please install it (`pip install pandas`).")
        exit(1)
    except Exception as e:
        logging.error(f"Error during DataFrame conversion or preparation: {e}", exc_info=True)
        exit(1)

    if df.empty:
        logging.error("DataFrame is empty after loading and preparation. Cannot proceed.")
        exit(1)

    # --- Group by Complexity and Select Top 20% ---
    logging.info("Grouping by complexity and selecting top 20% based on quality and suitability...")
    all_selected_dfs = []
    total_selected_count = 0

    grouped = df.groupby('llm_complexity')

    complexity_levels_found = sorted(df['llm_complexity'].unique())
    logging.info(f"Found data for complexity levels: {complexity_levels_found}")

    for complexity_level, group_df in grouped:
        group_size = len(group_df)
        logging.info(f"\nProcessing Complexity Level: {complexity_level} (Size: {group_size})")

        if group_size == 0:
            logging.info(" -> Group is empty, skipping.")
            continue

        # Calculate number of items to select (top N)
        # Use math.ceil to ensure at least one item is selected if percentage > 0 and group > 0
        num_to_select = math.ceil(group_size * (TOP_PERCENTAGE / 100.0))
        logging.info(f" -> Target top {TOP_PERCENTAGE}% = {num_to_select} items.")

        # Sort by Quality (desc), then Suitability (desc)
        # Higher quality is better, higher suitability is better
        sorted_group = group_df.sort_values(
            by=['llm_quality', 'llm_suitability'],
            ascending=[False, False]  # Both descending
        )

        # Select the top N rows
        selected_df = sorted_group.head(num_to_select)
        all_selected_dfs.append(selected_df)
        logging.info(f" -> Selected {len(selected_df)} items for complexity {complexity_level}.")
        total_selected_count += len(selected_df)

    # --- Combine Selected DataFrames ---
    if not all_selected_dfs:
        logging.error("No data was selected from any complexity group. Final dataset will be empty.")
        final_df = pd.DataFrame(columns=df.columns)  # Create empty df with same columns
    else:
        logging.info(f"\nCombining selected data from all complexity groups...")
        final_df = pd.concat(all_selected_dfs, ignore_index=True)
        logging.info(f"Combined DataFrame created with {len(final_df)} total selected examples.")
        logging.info(f"Original number of examples in filtered input: {initial_count}")  # Use count before dropna
        logging.info(f"Final number of examples after top 20% selection: {total_selected_count}")

        # Optional: Log distribution in the final dataset
        print("\n--- Complexity Distribution in Final Selected Dataset ---")
        print(final_df['llm_complexity'].value_counts().sort_index())
        print("---------------------------------------------------------")
        print("\n--- Quality Distribution in Final Selected Dataset ---")
        print(final_df['llm_quality'].value_counts().sort_index())
        print("-------------------------------------------------------")
        print("\n--- Suitability Distribution in Final Selected Dataset ---")
        print(final_df['llm_suitability'].value_counts().sort_index())
        print("----------------------------------------------------------")

    # --- Convert back to Hugging Face Dataset using original features ---
    try:
        # Ensure the DataFrame columns match the original features before conversion
        # Select only columns present in the original features schema
        columns_to_keep = list(original_features.keys())
        final_df_aligned = final_df[columns_to_keep]

        final_dataset = Dataset.from_pandas(final_df_aligned, features=original_features, preserve_index=False)
        logging.info("Successfully converted final Pandas DataFrame back to Hugging Face Dataset.")
    except Exception as e:
        logging.error(f"Failed to convert final DataFrame back to Dataset: {e}", exc_info=True)
        logging.warning("Attempting to save the final DataFrame as a CSV as a fallback.")
        fallback_csv_path = FINAL_DATASET_PATH + ".csv"
        try:
            os.makedirs(os.path.dirname(fallback_csv_path), exist_ok=True)
            final_df.to_csv(fallback_csv_path, index=False)
            logging.info(f"Fallback CSV saved to {fallback_csv_path}")
        except Exception as csv_e:
            logging.error(f"Failed to save fallback CSV: {csv_e}", exc_info=True)
        exit(1)  # Exit after attempting fallback save

    # --- Save the Final Dataset ---
    logging.info(f"Saving the final selected dataset ({len(final_dataset)} examples) to: {FINAL_DATASET_PATH}")
    save_successful = save_dataset_atomically(final_dataset, FINAL_DATASET_PATH)

    if save_successful:
        logging.info("Final dataset saved successfully.")
    else:
        logging.error(f"Failed to save the final dataset to {FINAL_DATASET_PATH}.")

    # --- Script End ---
    end_time = time.time()
    logging.info("------------------------------------------------------")
    logging.info(f"Script finished in {end_time - start_time:.2f} seconds.")
    logging.info(f"Final top {TOP_PERCENTAGE}% dataset saved at: {FINAL_DATASET_PATH}" if save_successful else "Final dataset saving failed.")
    logging.info("======================================================")
r1-a/dataset/filter/ultra_final.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ # Make sure necessary types are imported
4
+ from datasets import load_dataset, Dataset, Features, Value
5
+ import logging
6
+ import math
7
+ import shutil
8
+ import time
9
+
10
+ # --- Configuration ---
11
+ # --- !! MODIFIED: Point to the LLM-filtered UltraChat dataset !! ---
12
+ # This should match the FILTERED_OUTPUT_PATH from the UltraChat evaluation script
13
+ INPUT_LLM_FILTERED_PATH = "./ultrachat_evaluated/ultrachat_llm_filtered"
14
+
15
+ # --- !! MODIFIED: Update output directory names for UltraChat !! ---
16
+ OUTPUT_DIR_FINAL_SELECTION = "./ultrachat_final_top20_percent" # New output directory
17
+ FINAL_DATASET_PATH = os.path.join(OUTPUT_DIR_FINAL_SELECTION, "ultrachat_top20_percent_by_complexity") # New output dataset name
18
+
19
+ # Percentage to select from each complexity group (keep at 20% or adjust as needed)
20
+ TOP_PERCENTAGE = 20.0
21
+
22
+ # --- Setup Logging ---
23
+ # Keep logging setup the same
24
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
+ logging.getLogger("datasets").setLevel(logging.WARNING)
26
+ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
27
+ logging.getLogger("filelock").setLevel(logging.WARNING)
28
+ logging.getLogger("pandas").setLevel(logging.WARNING)
29
+
30
+ # --- Function to Save Dataset Atomically ---
31
+ # Keep this function exactly the same
32
def save_dataset_atomically(dataset_to_save, output_path):
    """Atomically persist a Hugging Face ``Dataset`` to ``output_path``.

    The dataset is first written to a sibling ``*_saving`` directory and then
    renamed into place, so readers never observe a partially-written dataset.

    Args:
        dataset_to_save: ``datasets.Dataset`` to persist; skipped when falsy/empty.
        output_path: Final directory for the saved dataset.

    Returns:
        bool: True when the save completed, False when the dataset is empty
        or any step of the save failed.
    """
    if not dataset_to_save or len(dataset_to_save) == 0:
        logging.warning(f"No data provided or dataset is empty. Skipping save for {output_path}.")
        return False

    tmp_dir = output_path + "_saving"
    dest_dir = output_path
    logging.info(f"Attempting to save {len(dataset_to_save)} examples to temp path {tmp_dir}...")
    try:
        # The destination's parent must exist before the final rename.
        os.makedirs(os.path.dirname(dest_dir), exist_ok=True)

        # Discard any stale temp directory left behind by a crashed run.
        if os.path.exists(tmp_dir):
            logging.warning(f"Removing existing temporary save directory: {tmp_dir}")
            shutil.rmtree(tmp_dir)

        dataset_to_save.save_to_disk(tmp_dir)
        logging.info(f"Successfully saved dataset to temporary path: {tmp_dir}")

        # Replace an existing destination wholesale before renaming over it.
        if os.path.exists(dest_dir):
            logging.debug(f"Removing existing final destination directory before rename: {dest_dir}")
            shutil.rmtree(dest_dir)

        os.rename(tmp_dir, dest_dir)
        logging.info(f"Successfully moved temporary save to final path: {dest_dir}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {dest_dir}: {e}", exc_info=True)
        # Best-effort removal of the temp directory so a retry starts clean.
        if os.path.exists(tmp_dir):
            try:
                shutil.rmtree(tmp_dir)
                logging.info(f"Cleaned up temporary directory {tmp_dir} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {tmp_dir} after error: {cleanup_e}")
        return False
68
+
69
# --- Main Execution ---
# Pipeline: load the LLM-filtered UltraChat dataset -> clean placeholder scores
# -> per complexity level keep the top TOP_PERCENTAGE% by (quality desc,
# suitability desc) -> convert back to a Dataset with the original schema ->
# save atomically.
if __name__ == "__main__":
    start_time = time.time()
    logging.info("===============================================================")
    logging.info(" Starting UltraChat Final Selection: Top 20% by Complexity, Quality & Suitability")
    logging.info(f" Input LLM-Filtered Dataset Path: {INPUT_LLM_FILTERED_PATH}")
    logging.info(f" Output Final Dataset Path: {FINAL_DATASET_PATH}")
    logging.info(f" Selection Percentage per Complexity Group: {TOP_PERCENTAGE}%")
    logging.info("===============================================================")

    # --- Load the LLM-Filtered Dataset ---
    if not os.path.exists(INPUT_LLM_FILTERED_PATH):
        logging.error(f"Input dataset not found at '{INPUT_LLM_FILTERED_PATH}'.")
        logging.error("Please ensure the UltraChat LLM evaluation script ran successfully and produced the dataset.")
        exit(1)

    try:
        logging.info(f"Loading dataset from {INPUT_LLM_FILTERED_PATH}...")
        llm_filtered_dataset = Dataset.load_from_disk(INPUT_LLM_FILTERED_PATH)
        logging.info(f"Successfully loaded dataset with {len(llm_filtered_dataset)} examples.")
        # Store features for later conversion back to Dataset (keeps schema identical)
        original_features = llm_filtered_dataset.features
        logging.info(f"Original features: {original_features}")
        # Check if essential score columns exist in the loaded features
        if not all(col in original_features for col in ['llm_complexity', 'llm_quality', 'llm_suitability']):
            logging.error(f"Loaded dataset from '{INPUT_LLM_FILTERED_PATH}' is missing one or more required score columns (llm_quality, llm_complexity, llm_suitability). Cannot proceed.")
            exit(1)
    except Exception as e:
        logging.error(f"Failed to load dataset from {INPUT_LLM_FILTERED_PATH}: {e}", exc_info=True)
        exit(1)

    # --- Convert to Pandas DataFrame ---
    try:
        df = llm_filtered_dataset.to_pandas()
        logging.info("Converted dataset to Pandas DataFrame.")
        required_cols = ['llm_complexity', 'llm_quality', 'llm_suitability']  # These are needed for filtering

        # Handle potential placeholder values (-1) used for None during saving in the previous step
        # Replace them with pd.NA for correct handling by dropna and numeric conversion
        for col in required_cols:
            if col in df.columns:
                df[col] = df[col].replace(-1, pd.NA)

        # Drop rows with missing essential scores AFTER replacing placeholder
        initial_count = len(df)
        df.dropna(subset=required_cols, inplace=True)
        dropped_count = initial_count - len(df)
        if dropped_count > 0:
            logging.warning(f"Dropped {dropped_count} rows with missing essential scores (quality, complexity, suitability) after handling placeholders.")

        # Ensure scores are numeric (should be okay after dropna, but good practice)
        # Using 'integer' dtype allows pd.NA
        df['llm_quality'] = df['llm_quality'].astype('Int64')  # Use nullable integer type
        df['llm_complexity'] = df['llm_complexity'].astype('Int64')
        df['llm_suitability'] = df['llm_suitability'].astype('Int64')

    except ImportError:
        logging.error("Pandas library is required for this script. Please install it (`pip install pandas`).")
        exit(1)
    except Exception as e:
        logging.error(f"Error during DataFrame conversion or preparation: {e}", exc_info=True)
        exit(1)

    if df.empty:
        logging.error("DataFrame is empty after loading and cleaning (dropping NA scores). Cannot proceed.")
        exit(1)

    # --- Group by Complexity and Select Top 20% ---
    # This core logic relies only on the standard score column names
    logging.info("Grouping by complexity and selecting top 20% based on quality and suitability...")
    all_selected_dfs = []
    total_selected_count = 0

    # Ensure complexity column is suitable for grouping (already converted to Int64)
    grouped = df.groupby('llm_complexity')

    # Get unique complexity levels present in the cleaned data
    complexity_levels_found = sorted(df['llm_complexity'].dropna().unique())
    logging.info(f"Found data for complexity levels: {complexity_levels_found}")

    for complexity_level in complexity_levels_found:
        # Need to handle potential NA group if groupby includes NA keys (usually doesn't by default)
        if pd.isna(complexity_level):
            continue

        group_df = grouped.get_group(complexity_level)
        group_size = len(group_df)
        logging.info(f"\nProcessing Complexity Level: {complexity_level} (Size: {group_size})")

        if group_size == 0:
            logging.info(" -> Group is empty, skipping.")  # Should not happen with get_group after unique()
            continue

        # Calculate number of items to select (top N)
        num_to_select = math.ceil(group_size * (TOP_PERCENTAGE / 100.0))
        # Ensure num_to_select is not greater than group_size (can happen with ceil and small groups)
        num_to_select = min(num_to_select, group_size)
        logging.info(f" -> Target top {TOP_PERCENTAGE}% = {num_to_select} items.")

        # Sort by Quality (desc), then Suitability (desc)
        sorted_group = group_df.sort_values(
            by=['llm_quality', 'llm_suitability'],
            ascending=[False, False]  # Both descending
        )

        # Select the top N rows
        selected_df = sorted_group.head(num_to_select)
        all_selected_dfs.append(selected_df)
        logging.info(f" -> Selected {len(selected_df)} items for complexity {complexity_level}.")
        total_selected_count += len(selected_df)

    # --- Combine Selected DataFrames ---
    if not all_selected_dfs:
        logging.error("No data was selected from any complexity group. Final dataset will be empty.")
        final_df = pd.DataFrame(columns=df.columns)  # Create empty df with same columns
    else:
        logging.info(f"\nCombining selected data from all complexity groups...")
        final_df = pd.concat(all_selected_dfs, ignore_index=True)
        logging.info(f"Combined DataFrame created with {len(final_df)} total selected examples.")
        # Use initial_count (before dropna) for comparison basis
        original_valid_score_count = initial_count - dropped_count
        logging.info(f"Original number of examples with valid scores in input: {original_valid_score_count}")
        logging.info(f"Final number of examples after top {TOP_PERCENTAGE}% selection: {total_selected_count}")

        # Log distribution in the final selected dataset
        print("\n--- Complexity Distribution in Final Selected Dataset ---")
        print(final_df['llm_complexity'].value_counts().sort_index())
        print("---------------------------------------------------------")
        print("\n--- Quality Distribution in Final Selected Dataset ---")
        print(final_df['llm_quality'].value_counts().sort_index())
        print("-------------------------------------------------------")
        print("\n--- Suitability Distribution in Final Selected Dataset ---")
        print(final_df['llm_suitability'].value_counts().sort_index())
        print("----------------------------------------------------------")

    # --- Convert back to Hugging Face Dataset using original features ---
    # Crucial to use original_features so the output schema matches the input
    try:
        # Ensure the DataFrame columns match the original features before conversion
        # Select only columns present in the original features schema to avoid errors
        columns_to_keep = list(original_features.keys())
        # Check if all original columns still exist in final_df (they should)
        final_df_aligned = final_df[columns_to_keep]

        # Convert nullable Int64 back to standard int types if necessary for Features definition
        # (HuggingFace handles standard int types well, usually no explicit cast needed here if Features are correct)
        # E.g., if original_features['llm_quality'] was Value('int32'), pandas Int64 is compatible

        # Create the Dataset object using the original features definition
        final_dataset = Dataset.from_pandas(final_df_aligned, features=original_features, preserve_index=False)
        logging.info("Successfully converted final Pandas DataFrame back to Hugging Face Dataset.")
    except Exception as e:
        logging.error(f"Failed to convert final DataFrame back to Dataset: {e}", exc_info=True)
        logging.warning("Attempting to save the final DataFrame as a CSV as a fallback.")
        # Make sure fallback path uses the correct final dataset path base
        fallback_csv_path = FINAL_DATASET_PATH + ".csv"
        try:
            os.makedirs(os.path.dirname(fallback_csv_path), exist_ok=True)
            final_df.to_csv(fallback_csv_path, index=False)
            logging.info(f"Fallback CSV saved to {fallback_csv_path}")
        except Exception as csv_e:
            logging.error(f"Failed to save fallback CSV: {csv_e}", exc_info=True)
        exit(1)  # Exit after attempting fallback save

    # --- Save the Final Dataset ---
    logging.info(f"Saving the final selected UltraChat dataset ({len(final_dataset)} examples) to: {FINAL_DATASET_PATH}")
    save_successful = save_dataset_atomically(final_dataset, FINAL_DATASET_PATH)

    if save_successful:
        logging.info("Final dataset saved successfully.")
    else:
        logging.error(f"Failed to save the final dataset to {FINAL_DATASET_PATH}.")

    # --- Script End ---
    end_time = time.time()
    logging.info("------------------------------------------------------")
    logging.info(f"UltraChat Selection Script finished in {end_time - start_time:.2f} seconds.")
    logging.info(f"Final top {TOP_PERCENTAGE}% UltraChat dataset saved at: {FINAL_DATASET_PATH}" if save_successful else "Final dataset saving failed.")
    logging.info("======================================================")
r1-a/dataset/filter/ultrachat_gpt.py ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import http.client
3
+ import json
4
+ import time
5
+ import random
6
+ import re
7
+ import pandas as pd
8
+ from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Sequence
9
+ from tqdm.auto import tqdm
10
+ import sys
11
+ import logging
12
+ import concurrent.futures
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ import shutil
15
+ import socket
16
+
17
+ # --- Configuration ---
18
+ # --- !! MODIFIED: Point to the pre-filtered UltraChat dataset !! ---
19
+ INPUT_DATA_PATH = "/root/autodl-tmp/audio-r1/r1-a/dataset/ultrachat_filtered_for_tts_preference_v3_nocode" # Path from the UltraChat filtering script's output
20
+
21
+ # --- Keep API configurations ---
22
+ API_HOST = "api2.aigcbest.top"
23
+ API_PATH = "/v1/chat/completions"
24
+ LLM_MODEL = "gpt-4.1-mini-2025-04-14" # Or consider gpt-4-turbo if available and cheaper for long context
25
+ API_KEY = os.environ.get('AIGCBEST_API_KEY', "sk-N8IsyCniMZoVpa0zn0IYQMY0b0Py53WyFxmNag4vtnzCtXeA") # Replace or set env variable
26
+ if not API_KEY or API_KEY == "YOUR_API_KEY_HERE":
27
+ print("API Key is not set correctly. Please set the AIGCBEST_API_KEY environment variable or replace the placeholder.")
28
+ sys.exit(1)
29
+
30
+ # --- !! MODIFIED: Update output directory names for UltraChat !! ---
31
+ OUTPUT_DIR = f"./ultrachat_evaluated" # Base directory for evaluated UltraChat
32
+ PROCESSED_DATA_PATH = os.path.join(OUTPUT_DIR, f"ultrachat_evaluated_intermediate") # Intermediate save file for this run
33
+ FINAL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"ultrachat_evaluated_final") # Final annotated data
34
+ FILTERED_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"ultrachat_llm_filtered") # Final filtered data
35
+
36
+ # --- Keep processing configurations ---
37
+ MAX_WORKERS = 40
38
+ REQUEST_DELAY_SECONDS = 0.1
39
+ MAX_RETRIES = 4
40
+ SAVE_INTERVAL = 1000
41
+
42
+ # --- Filtering Thresholds (LLM scores) - Can be adjusted after seeing distributions ---
43
+ MIN_QUALITY_SCORE = 3
44
+ MIN_SUITABILITY_SCORE = 3
45
+ # Optional: MAX_COMPLEXITY_SCORE = 4
46
+
47
+ # Setup logging (keep as is)
48
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
49
+ # ... (keep other logging level settings) ...
50
+
51
# --- !! MODIFIED: Updated LLM System Prompt for Multi-Turn Context !! ---
# System prompt sent with every evaluation request. It instructs the model to
# score the *current user turn*, given the prior dialogue, on three 1-5 metrics
# (Quality, Complexity, Voice Suitability) and to reply in the single
# fixed-format line that parse_llm_evaluation() expects.
SYSTEM_PROMPT = """
You are an AI Quality Assessor evaluating user queries within multi-turn conversations for AI voice assistants.
Your task is to analyze the **Current User Query** in the context of the preceding **Conversation History**. Assign scores based on three metrics: Overall Quality, Complexity, and Voice Response Suitability. Provide a brief justification.

**Input:** You will receive the conversation history followed by the current user query.

**Output Format:** Respond ONLY with a single string in the following format, replacing bracketed values with your scores and justification. Do NOT include any other text, greetings, or explanations outside this format.

Quality: [1-5], Complexity: [1-5], Suitability: [1-5], Justification: [Your brief justification text here]

**Metric Definitions:**

1. **Overall Quality (Score 1-5):** Clarity, coherence, relevance, and grammatical correctness of the **Current User Query** *considering the Conversation History*.
* 1 (Very Low): Nonsensical, irrelevant to history, ungrammatical, contains corrupted placeholders, abrupt unrelated topic shift without clear transition.
* 2 (Low): Vague, poorly worded, slightly off-topic, requires significant interpretation *even with history*, minor grammatical errors.
* 3 (Medium): Understandable, generally relevant, reasonably phrased. Might be a simple follow-up or a slightly generic query. Acceptable.
* 4 (High): Clear, well-phrased, specific, directly relevant to the history or a natural conversation progression. Good standalone query even if it builds on context.
* 5 (Very High): Exceptionally clear, concise, specific, contextually relevant, and well-formulated. Represents a natural and effective conversational turn.

2. **Complexity (Score 1-5):** Cognitive load required for the AI to understand the *history + current query* and generate the *next appropriate assistant response*.
* 1 (Very Simple): Simple acknowledgement, yes/no confirmation, trivial fact recall based directly on the last turn.
* 2 (Simple): Basic info recall related to history, slight elaboration on previous point, simple instruction.
* 3 (Moderate): Requires synthesizing information from a few turns back, comparing points made earlier, generating a moderately detailed explanation or creative text based on context.
* 4 (Complex): Requires understanding nuanced context across multiple turns, deep reasoning, complex instruction synthesis, detailed analysis based on the dialogue.
* 5 (Very Complex): Needs to track intricate state/details over a long history, highly specialized knowledge synthesis based on context, complex multi-step problem-solving rooted in the conversation.

3. **Voice Response Suitability (Score 1-5):** Is the *expected assistant's answer to the Current User Query* suitable for delivery via voice ONLY? (Focus on the likely *next turn's* content).
* 1 (Very Unsuitable): Expected answer likely requires visuals (graphs, code, tables), complex formatting, UI interaction, or is excessively long/structured even for conversational context (e.g., reading out a large diff).
* 2 (Unsuitable): Expected answer probably very long, has complex structure (nested lists), significantly easier to parse visually. Poor audio UX for the *next* response.
* 3 (Moderate): Expected answer might be slightly long or have simple structure (e.g., short list of steps mentioned earlier), but generally digestible via audio. Upper limit for conversational comfort.
* 4 (Suitable): Expected answer reasonably concise, informational/conversational, flows well in dialogue, easy to understand when spoken.
* 5 (Highly Suitable): Ideal for voice - short confirmation, direct answer based on context, brief explanation, conversational response.

4. **Justification (Brief Text):** 1-2 sentences explaining the scores, especially for low (<3) or unusual scores, referencing context if necessary.

**Example Input Structure (What your 'user' message will contain):**

Conversation History:
[USER]
Tell me about the Eiffel Tower.
[ASSISTANT]
The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.

---

Current User Query:
How tall is it and when was it built?

**Example Output String:**
Quality: 4, Complexity: 2, Suitability: 5, Justification: Clear follow-up query based on the history. Asks for simple facts, suitable for a short voice response.
"""
103
+
104
# --- LLM API Function (evaluate_prompt_with_llm) ---
def evaluate_prompt_with_llm(prompt_text, api_key, host, path, model, retries=MAX_RETRIES):
    """Calls the LLM API to get evaluation scores for a prompt (or query+history).

    Returns the raw model response string on success (possibly malformed --
    the caller is responsible for parsing it), or None when the input is
    invalid, a non-retryable client error occurs, or all retries are spent.
    """
    # Cannot evaluate empty or non-string input.
    if not prompt_text or not isinstance(prompt_text, str) or not prompt_text.strip():
        logging.warning("evaluate_prompt_with_llm received empty or invalid input text.")
        return None

    payload = json.dumps({
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            # The combined history + query is passed as the 'user' content here.
            {"role": "user", "content": prompt_text}
        ],
        "temperature": 0.1,  # Low temperature for consistent evaluation
        "max_tokens": 100    # Should be enough for the scores + justification
    })
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'User-Agent': 'HuggingFace UltraChat Evaluation Script',
        'Content-Type': 'application/json'
    }
    # Small jittered delay before each request to spread load across workers.
    time.sleep(random.uniform(REQUEST_DELAY_SECONDS * 0.8, REQUEST_DELAY_SECONDS * 1.2))

    for attempt in range(retries):
        data = None  # ensure the name is bound for the except handlers below
        try:
            conn = http.client.HTTPSConnection(host, timeout=60)
            try:
                conn.request("POST", path, payload, headers)
                res = conn.getresponse()
                status = res.status
                data = res.read()
                # Read the header while the response object is in hand.
                retry_after_header = res.getheader('Retry-After') if status == 429 else None
            finally:
                # FIX: always release the connection, even when request/read
                # raises, so retry loops do not leak sockets.
                conn.close()

            if status == 200:
                response_json = json.loads(data.decode("utf-8"))
                if response_json.get("choices") and len(response_json["choices"]) > 0:
                    message = response_json["choices"][0].get("message")
                    if message and message.get("content"):
                        raw_response = message["content"].strip()
                        # Basic check for expected format start - parsing function handles details
                        if raw_response.startswith("Quality:") and "Complexity:" in raw_response and "Suitability:" in raw_response:
                            return raw_response
                        logging.warning(f"LLM response format unexpected for input starting with '{prompt_text[:50]}...': {raw_response}")
                        return raw_response  # Return potentially malformed for parsing attempt later
                logging.error(f"Unexpected API response structure (no choices/content): {data.decode('utf-8')}")

            elif status == 429:  # Rate limit
                if retry_after_header is None:
                    retry_after_header = str(int(REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5)))
                try:
                    wait_time = int(retry_after_header)
                except ValueError:
                    wait_time = REQUEST_DELAY_SECONDS * (2 ** attempt) + random.uniform(1, 5)  # Exponential backoff + jitter
                logging.warning(f"Rate limit exceeded (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            elif status >= 500:  # Server error
                wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5)  # Exponential backoff + jitter
                logging.warning(f"Server error (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                # Other client errors (4xx) - likely not recoverable by retry
                # (e.g. bad auth 401 or not found 404).
                logging.error(f"API Client Error: Status {status}, Response: {data.decode('utf-8')} for input: {prompt_text[:60]}")
                return None

        except (http.client.HTTPException, ConnectionError, socket.gaierror, TimeoutError, socket.timeout) as e:
            logging.error(f"Network/HTTP error during API call: {e}. Attempt {attempt + 1}/{retries}")
            if attempt + 1 == retries:
                return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)  # Exponential backoff + jitter
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)
        except json.JSONDecodeError as e:
            # Cannot proceed if the response body isn't JSON.
            logging.error(f"Failed to decode API response: {e}. Response snippet: {data[:200] if data else 'N/A'}")
            return None
        except Exception as e:
            # Catch any other unexpected errors during the API call/processing.
            logging.error(f"An unexpected error occurred during API call processing: {e}", exc_info=True)
            if attempt + 1 == retries:
                return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)

    logging.error(f"API call failed after {retries} retries for input: {prompt_text[:60]}...")
    return None
194
+
195
+
196
# --- Function to Parse LLM Response ---
def parse_llm_evaluation(response_string):
    """Parses the structured string response from the LLM.

    Returns a 5-tuple (quality, complexity, suitability, justification, status)
    where the scores are ints in 1..5 (or None on failure) and status is one of
    'success', 'success_fallback_parse', or an 'error_*' marker.
    """
    if not response_string:
        return None, None, None, None, "error_empty_response"

    # Primary regex targeting the exact requested output format.
    match = re.match(
        r"Quality:\s*([1-5])\s*,\s*Complexity:\s*([1-5])\s*,\s*Suitability:\s*([1-5])\s*,\s*Justification:\s*(.*)",
        response_string.strip(),
        re.IGNORECASE | re.DOTALL  # Ignore case and allow '.' to match newlines in justification
    )

    if match:
        try:
            quality = int(match.group(1))
            complexity = int(match.group(2))
            suitability = int(match.group(3))
            # Handle potential empty justification.
            justification = match.group(4).strip() if match.group(4) else ""
            return quality, complexity, suitability, justification, "success"
        except (ValueError, IndexError) as e:
            # Regex matched the structure but the captured values were unusable.
            logging.warning(f"Parsing failed for matched string (invalid numbers?Groups missing?): {response_string}. Error: {e}")
            return None, None, None, None, "error_parsing_matched"

    logging.warning(f"Regex did not match LLM response format: {response_string}")

    # Fallback: tolerant key-by-key search.
    # FIX: the previous fallback split the whole response on commas, which
    # (a) truncated any justification containing a comma at its first comma and
    # (b) failed whenever junk preceded a key (e.g. "Scores - Quality: 3").
    # Searching for each field independently avoids both problems.
    try:
        scores = {}
        for key in ("quality", "complexity", "suitability"):
            m = re.search(rf"{key}\s*:\s*([1-5])\b", response_string, re.IGNORECASE)
            if m:
                scores[key] = int(m.group(1))
        j_match = re.search(r"justification\s*:\s*(.*)", response_string, re.IGNORECASE | re.DOTALL)
        justification = j_match.group(1).strip() if j_match else ""
        if 'quality' in scores and 'complexity' in scores and 'suitability' in scores:
            logging.info(f"Fallback parsing successful for: {response_string[:50]}...")
            return scores['quality'], scores['complexity'], scores['suitability'], justification, "success_fallback_parse"
    except Exception as e:
        # Errors during the fallback itself: fall through to the error status.
        logging.warning(f"Fallback parsing attempt also failed: {e}")

    # Neither the primary regex nor the fallback worked.
    return None, None, None, None, "error_parsing_no_match"
257
+
258
+
259
# --- !! MODIFIED: Dataset Processing Function for UltraChat !! ---
def evaluate_dataset_entry(example):
    """Processes a single UltraChat filtered entry to get LLM evaluation.

    Returns a copy of ``example`` with the llm_* score/justification columns
    and an ``llm_evaluation_status`` marker filled in; the input dict itself
    is never mutated.
    """
    updated = example.copy()

    # Carry over any evaluation fields from a previous (resumed) run, then
    # mark the entry as pending until this attempt resolves.
    updated['llm_quality'] = example.get('llm_quality', None)
    updated['llm_complexity'] = example.get('llm_complexity', None)
    updated['llm_suitability'] = example.get('llm_suitability', None)
    updated['llm_justification'] = example.get('llm_justification', '')
    updated['llm_evaluation_status'] = 'pending_evaluation'

    current_query = example.get("query")
    dialogue_history = example.get("history", "")  # default to empty history if missing

    # A usable query must be a non-empty, non-whitespace string.
    if not (current_query and isinstance(current_query, str) and current_query.strip()):
        updated['llm_evaluation_status'] = 'skipped_invalid_query'
        logging.debug(f"Skipping entry (Dialogue: {example.get('dialogue_id', 'N/A')}, Turn: {example.get('turn_index', 'N/A')}): Invalid query.")
        return updated

    # Combine history and query into the exact layout the system prompt expects.
    llm_input_text = f"Conversation History:\n{dialogue_history}\n\n---\n\nCurrent User Query:\n{current_query}"

    raw_reply = evaluate_prompt_with_llm(llm_input_text, API_KEY, API_HOST, API_PATH, LLM_MODEL)

    if not raw_reply:
        # The API call failed after all retries.
        updated['llm_evaluation_status'] = 'failed_llm_call'
        logging.error(f"LLM call failed for dialogue {example.get('dialogue_id', 'N/A')}, turn {example.get('turn_index', 'N/A')}.")
        return updated

    quality, complexity, suitability, justification, parse_status = parse_llm_evaluation(raw_reply)
    if parse_status.startswith("success"):
        updated["llm_quality"] = quality
        updated["llm_complexity"] = complexity
        updated["llm_suitability"] = suitability
        updated["llm_justification"] = justification
        updated['llm_evaluation_status'] = 'success'  # Final success state
    else:
        # Record the parse-error kind and keep the raw model output in the
        # justification column for manual review.
        updated['llm_evaluation_status'] = parse_status
        updated['llm_justification'] = f"RAW_RESPONSE: {raw_reply}"
        logging.warning(f"Parsing failed ({parse_status}) for dialogue {example.get('dialogue_id', 'N/A')}, turn {example.get('turn_index', 'N/A')}. Raw response saved.")

    return updated
316
+
317
# --- Function to Save Dataset Atomically ---
# NOTE: The Features object passed in must match the UltraChat + LLM schema
# (see get_ultrachat_features_with_evaluation).
def save_dataset_atomically(data_list, output_path, features):
    """Saves the list of data dictionaries atomically using the correct schema.

    Strategy: materialize to a sibling "<output_path>_saving" directory first,
    then swap it into place with os.rename, so readers never observe a
    half-written dataset directory. On any failure the temp directory is
    removed and a best-effort JSON Lines dump is written as a fallback.
    Returns True on success, False otherwise.
    """
    if not data_list:
        logging.info("No data provided for saving.")
        return False
    temp_output_path = output_path + "_saving"
    final_output_path = output_path
    logging.info(f"Attempting to save {len(data_list)} examples to temp path {temp_output_path}...")
    try:
        processed_data_list = []
        # Arrow int32 columns cannot hold None with this schema, so replace
        # missing scores with the sentinel -1 before building the Dataset
        # (also easier to filter on later with Pandas).
        for item in data_list:
            item_copy = item.copy()  # never mutate the caller's dicts
            for key in ['llm_quality', 'llm_complexity', 'llm_suitability']:
                # Only substitute when the key exists, is None, and the schema
                # really declares it as int32.
                if key in item_copy and item_copy[key] is None and isinstance(features[key], Value) and features[key].dtype == 'int32':
                    item_copy[key] = -1
            processed_data_list.append(item_copy)

        # Build the Dataset under the explicit schema (not inferred).
        processed_dataset = Dataset.from_list(processed_data_list, features=features)

        # Ensure parent directory exists.
        os.makedirs(os.path.dirname(final_output_path), exist_ok=True)

        # Clean up a stale temp directory left by a previous crashed save.
        if os.path.exists(temp_output_path):
            logging.warning(f"Removing existing temporary save directory: {temp_output_path}")
            shutil.rmtree(temp_output_path)

        # Write everything to the temporary location first.
        processed_dataset.save_to_disk(temp_output_path)
        logging.info(f"Successfully saved dataset to temporary path: {temp_output_path}")

        # os.rename requires the destination not to exist, so clear it first.
        if os.path.exists(final_output_path):
            logging.debug(f"Removing existing final destination directory before rename: {final_output_path}")
            shutil.rmtree(final_output_path)

        # Atomically swap the finished temp directory into the final path.
        os.rename(temp_output_path, final_output_path)
        logging.info(f"Successfully moved temporary save to final path: {final_output_path}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {final_output_path}: {e}", exc_info=True)
        # Remove the partial temp directory so the next save starts clean.
        if os.path.exists(temp_output_path):
            try:
                shutil.rmtree(temp_output_path)
                logging.info(f"Cleaned up temporary directory {temp_output_path} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {temp_output_path} after error: {cleanup_e}")

        # Fallback: dump the ORIGINAL (unmodified) list as JSON Lines so no
        # progress is lost even when the Arrow save fails.
        fallback_json_path = final_output_path + ".jsonl.failed_save"
        logging.warning(f"Attempting fallback save to JSON Lines file: {fallback_json_path}")
        try:
            with open(fallback_json_path, 'w', encoding='utf-8') as f:
                for item in data_list:
                    # default=str coerces any non-JSON-serializable values.
                    f.write(json.dumps(dict(item), ensure_ascii=False, default=str) + '\n')
            logging.info(f"Successfully saved fallback JSON Lines file.")
        except Exception as json_e:
            logging.error(f"Fallback JSON save also failed: {json_e}", exc_info=True)

        return False
388
+
389
+
390
# --- Function to Check if Retry is Needed ---
def needs_retry(example):
    """Checks if an example needs evaluation or retry.

    True for anything not finished ('success') and not deliberately skipped
    ('skipped_*') -- i.e. None/missing status, 'pending', 'failed_*',
    'error_*', and so on.
    """
    status_text = str(example.get('llm_evaluation_status'))
    return status_text != 'success' and not status_text.startswith('skipped_')
399
+
400
# --- !! MODIFIED: Get Dataset Features for Filtered UltraChat + Evaluation !! ---
def get_ultrachat_features_with_evaluation():
    """Defines features for the pre-filtered UltraChat dataset + evaluation columns."""
    logging.info(f"Defining features for pre-filtered UltraChat data + LLM evaluation.")

    # Schema = the four columns produced by the UltraChat filtering script,
    # followed by the LLM evaluation columns this script adds. Scores are
    # int32 (the atomic-save helper maps None -> -1); justification and status
    # are free-form strings.
    augmented_features = Features({
        'dialogue_id': Value('string'),
        'turn_index': Value('int64'),  # int64 for potentially large indices
        'query': Value('string'),
        'history': Value('string'),
        'llm_quality': Value('int32'),
        'llm_complexity': Value('int32'),
        'llm_suitability': Value('int32'),
        'llm_justification': Value('string'),
        'llm_evaluation_status': Value('string'),  # 'success', 'failed_*', 'skipped_*', 'error_*' etc.
    })
    logging.info(f"Defined features: {augmented_features}")
    return augmented_features
426
+
427
+ # --- Main Execution ---
428
+ if __name__ == "__main__":
429
+ start_time = time.time()
430
+ logging.info("======================================================")
431
+ logging.info(f" Starting Filtered UltraChat Dataset Evaluation - {LLM_MODEL}") # Updated title
432
+ logging.info(f" Input Data Path (Filtered UltraChat): {INPUT_DATA_PATH}")
433
+ logging.info(f" Output Dir: {OUTPUT_DIR}")
434
+ logging.info(f" Intermediate Save Path: {PROCESSED_DATA_PATH}")
435
+ logging.info(f" Final Annotated Path: {FINAL_OUTPUT_PATH}")
436
+ logging.info(f" LLM-Filtered Output Path: {FILTERED_OUTPUT_PATH}")
437
+ logging.info("======================================================")
438
+
439
+ # --- Define Features for UltraChat + LLM ---
440
+ dataset_features = get_ultrachat_features_with_evaluation() # Use the correct feature function
441
+
442
+ # --- Load or Initialize Dataset ---
443
+ results_list = []
444
+ # Check for intermediate save file from *this* script first
445
+ if os.path.exists(PROCESSED_DATA_PATH):
446
+ logging.info(f"Loading existing intermediate dataset from {PROCESSED_DATA_PATH}...")
447
+ try:
448
+ # Load with trust_remote_code=True if dataset structure might have custom code (less likely here)
449
+ existing_dataset = Dataset.load_from_disk(PROCESSED_DATA_PATH)
450
+
451
+ # Optional: Verify features match exactly if needed (can cause issues if minor changes occur)
452
+ # if existing_dataset.features != dataset_features:
453
+ # logging.warning(f"Loaded intermediate dataset features mismatch expected. Trying to continue...")
454
+ # # Potentially try casting or just proceed carefully
455
+ results_list = existing_dataset.to_list() # Convert loaded dataset to list of dicts
456
+ total_examples = len(results_list)
457
+ logging.info(f"Loaded {total_examples} examples from intermediate save.")
458
+ except Exception as e:
459
+ logging.error(f"Failed to load intermediate dataset from {PROCESSED_DATA_PATH}: {e}", exc_info=True)
460
+ logging.warning("Will attempt to load fresh dataset from input path.")
461
+ results_list = [] # Reset list if loading failed
462
+
463
+ # If no intermediate data loaded, load the initial filtered UltraChat data
464
+ if not results_list:
465
+ logging.info(f"Loading pre-filtered UltraChat dataset from: {INPUT_DATA_PATH}")
466
+ if not os.path.exists(INPUT_DATA_PATH):
467
+ logging.error(f"Input dataset not found at '{INPUT_DATA_PATH}'. Please run the UltraChat filtering script first.")
468
+ sys.exit(1)
469
+ try:
470
+ # Load the dataset generated by the previous UltraChat filtering script
471
+ original_filtered_dataset = Dataset.load_from_disk(INPUT_DATA_PATH)
472
+ total_examples = len(original_filtered_dataset)
473
+ logging.info(f"Loaded {total_examples} original examples from {INPUT_DATA_PATH}.")
474
+
475
+ # Initialize results list with original data + placeholder evaluation fields
476
+ results_list = []
477
+ # Iterate through the loaded dataset and add placeholder fields
478
+ for example in tqdm(original_filtered_dataset, desc="Initializing data structure"):
479
+ init_example = dict(example) # Make a copy
480
+ # Ensure all expected base features are present, provide defaults if necessary
481
+ init_example['dialogue_id'] = init_example.get('dialogue_id', f'missing_id_{len(results_list)}')
482
+ init_example['turn_index'] = init_example.get('turn_index', -1) # Use -1 if missing?
483
+ init_example['query'] = init_example.get('query', '')
484
+ init_example['history'] = init_example.get('history', '')
485
+ # Add evaluation placeholders
486
+ init_example['llm_quality'] = None
487
+ init_example['llm_complexity'] = None
488
+ init_example['llm_suitability'] = None
489
+ init_example['llm_justification'] = ''
490
+ init_example['llm_evaluation_status'] = 'pending' # Initial status before processing
491
+ results_list.append(init_example)
492
+
493
+ # Perform an initial save to the intermediate path for this script run
494
+ logging.info(f"Performing initial save of placeholder data ({len(results_list)} items) to {PROCESSED_DATA_PATH}...")
495
+ # Use the correct features for saving
496
+ if save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features):
497
+ logging.info("Initial data structure saved successfully.")
498
+ else:
499
+ logging.error("Failed to save initial data structure. Exiting.")
500
+ sys.exit(1)
501
+
502
+ except Exception as e:
503
+ logging.error(f"Failed to load or initialize dataset from {INPUT_DATA_PATH}: {e}", exc_info=True)
504
+ sys.exit(1)
505
+
506
+ # --- Identify Indices to Process/Retry ---
507
+ logging.info("Identifying examples needing evaluation/retry...")
508
+ # Use needs_retry to find indices where evaluation hasn't succeeded or been skipped
509
+ indices_to_process = [
510
+ i for i, example in enumerate(tqdm(results_list, desc="Checking examples status")) if needs_retry(example)
511
+ ]
512
+ num_to_process = len(indices_to_process)
513
+ total_examples = len(results_list) # Recalculate total based on loaded list
514
+
515
+ if num_to_process == 0:
516
+ logging.info("No examples found needing evaluation/retry based on current status.")
517
+ # Ensure final data exists even if no processing was needed in this run
518
+ if not os.path.exists(FINAL_OUTPUT_PATH):
519
+ logging.info(f"Copying data from {PROCESSED_DATA_PATH} to final location {FINAL_OUTPUT_PATH} as no retries needed...")
520
+ if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
521
+ logging.info("Dataset copied to final location.")
522
+ else:
523
+ logging.error("Failed to copy dataset to final location.")
524
+ else:
525
+ logging.info(f"Identified {num_to_process} examples to process/retry out of {total_examples}.")
526
+ # --- Concurrent Processing Logic ---
527
+ processed_count_total = 0 # Count processed in this run
528
+ processed_since_last_save = 0
529
+ last_save_time = time.time()
530
+ logging.info(f"Starting concurrent evaluation ({MAX_WORKERS} workers) with periodic saving...")
531
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
532
+ # Submit tasks only for the indices that need processing
533
+ futures = {
534
+ executor.submit(evaluate_dataset_entry, results_list[i]): i
535
+ for i in indices_to_process
536
+ }
537
+ try:
538
+ # Setup progress bar for the number of tasks submitted
539
+ pbar = tqdm(total=num_to_process, desc="Evaluating turns", unit="turn") # Updated description
540
+ for future in concurrent.futures.as_completed(futures):
541
+ original_index = futures[future] # Get the original list index for this future
542
+ try:
543
+ # Get the result (the updated example dictionary)
544
+ updated_example_dict = future.result()
545
+ # Update the main list with the processed data
546
+ results_list[original_index] = updated_example_dict
547
+ # Update progress bar postfix with the status of the completed item
548
+ pbar.set_postfix({"LastStatus": updated_example_dict.get('llm_evaluation_status', 'N/A')}, refresh=False) # Don't refresh too often
549
+ except Exception as exc:
550
+ # Log errors from the future execution itself (should be rare if evaluate_dataset_entry handles errors)
551
+ logging.error(f'Evaluation task for index {original_index} encountered an exception: {exc}', exc_info=True)
552
+ # Update the status in the main list to indicate failure
553
+ error_placeholder = results_list[original_index].copy()
554
+ error_placeholder['llm_evaluation_status'] = f'failed_future_exception_{type(exc).__name__}'
555
+ results_list[original_index] = error_placeholder
556
+ pbar.set_postfix({"LastStatus": error_placeholder['llm_evaluation_status']}, refresh=False)
557
+ finally:
558
+ # Increment counters regardless of success or failure
559
+ processed_count_total += 1
560
+ processed_since_last_save += 1
561
+ pbar.update(1) # Update progress bar
562
+
563
+ # Periodic save logic
564
+ if processed_since_last_save >= SAVE_INTERVAL:
565
+ current_time = time.time()
566
+ time_since_last = current_time - last_save_time
567
+ logging.info(f"\n--- Processed {processed_since_last_save} items (Total this run: {processed_count_total}/{num_to_process}). Time since last save: {time_since_last:.1f}s. Saving progress... ---")
568
+ # Save intermediate progress to PROCESSED_DATA_PATH using the correct features
569
+ if save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features):
570
+ logging.info(f"--- Progress successfully saved to {PROCESSED_DATA_PATH} ---")
571
+ processed_since_last_save = 0 # Reset counter
572
+ last_save_time = current_time
573
+ else:
574
+ # Log error but continue processing, hoping the next save works
575
+ logging.error(f"--- FAILED TO SAVE PROGRESS to {PROCESSED_DATA_PATH}! Check errors. Will retry later. ---")
576
+ except KeyboardInterrupt:
577
+ logging.warning("\nCtrl+C detected! Attempting to shut down executor and save progress...")
578
+ # Gracefully shutdown the executor - wait for currently running tasks to finish (or cancel them)
579
+ # executor.shutdown(wait=False) # Cancel pending futures - results may be incomplete
580
+ # Consider just letting the 'finally' block handle the save
581
+ except Exception as e:
582
+ logging.error(f"An unexpected error occurred during the main processing loop: {e}", exc_info=True)
583
+ logging.error("Attempting final save...")
584
+ finally:
585
+ # Ensure progress bar is closed
586
+ if 'pbar' in locals() and pbar is not None:
587
+ pbar.close()
588
+ logging.info("--- Processing loop finished or interrupted. ---")
589
+ # --- Final Save Attempt (Save the complete results_list) ---
590
+ logging.info(f"Attempting final save of the fully annotated dataset ({len(results_list)} items) to: {FINAL_OUTPUT_PATH}")
591
+ if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
592
+ logging.info("--- Final annotated dataset state saved successfully. ---")
593
+ else:
594
+ # Critical error if final save fails
595
+ logging.error(f">>> FINAL ANNOTATED SAVE FAILED to {FINAL_OUTPUT_PATH}! <<< Check logs. Fallback JSON/Intermediate data might exist at {PROCESSED_DATA_PATH}.")
596
+
597
+ # --- Post-Processing: Verification, Analysis, Filtering (using LLM scores) ---
598
+ # This section remains largely the same, just ensures it loads from FINAL_OUTPUT_PATH
599
+ # and saves the filtered result to FILTERED_OUTPUT_PATH.
600
+ logging.info("======================================================")
601
+ logging.info("Post-Processing: Verification, Analysis, and LLM Filtering")
602
+ logging.info("======================================================")
603
+
604
+ # --- Verification of Final Annotated Data ---
605
+ logging.info(f"Verifying and Analyzing final annotated dataset: {FINAL_OUTPUT_PATH}")
606
+ if not os.path.exists(FINAL_OUTPUT_PATH):
607
+ logging.error(f"Final annotated dataset not found at {FINAL_OUTPUT_PATH}. Cannot perform analysis or filtering.")
608
+ else:
609
+ try:
610
+ # Reload the final dataset to ensure integrity and perform analysis/filtering
611
+ final_annotated_dataset = Dataset.load_from_disk(FINAL_OUTPUT_PATH)
612
+ num_final_examples = len(final_annotated_dataset)
613
+ logging.info(f"Successfully reloaded final annotated dataset with {num_final_examples} examples.")
614
+
615
+ # --- Calculate Score Distributions (using Pandas if available) ---
616
+ logging.info("Calculating score distributions...")
617
+ try:
618
+ df = final_annotated_dataset.to_pandas()
619
+ # Handle the placeholder -1 used for None in integer columns during saving
620
+ df['llm_quality'].replace(-1, pd.NA, inplace=True)
621
+ df['llm_complexity'].replace(-1, pd.NA, inplace=True)
622
+ df['llm_suitability'].replace(-1, pd.NA, inplace=True)
623
+
624
+ # Calculate value counts, including missing/placeholder values (NA)
625
+ quality_dist = df['llm_quality'].value_counts(dropna=False).sort_index()
626
+ complexity_dist = df['llm_complexity'].value_counts(dropna=False).sort_index()
627
+ suitability_dist = df['llm_suitability'].value_counts(dropna=False).sort_index()
628
+ status_dist = df['llm_evaluation_status'].value_counts()
629
+
630
+ print("\n--- Score Distributions (Annotated UltraChat Dataset) ---")
631
+ print("\nOverall Quality Distribution (NA indicates missing/placeholder):")
632
+ print(quality_dist)
633
+ print("\nComplexity Distribution (NA indicates missing/placeholder):")
634
+ print(complexity_dist)
635
+ print("\nVoice Response Suitability Distribution (NA indicates missing/placeholder):")
636
+ print(suitability_dist)
637
+ print("\nEvaluation Status Distribution:")
638
+ print(status_dist)
639
+ print("--------------------------------------------------")
640
+
641
+ except ImportError:
642
+ logging.warning("Pandas not found. Cannot perform detailed distribution analysis.")
643
+ # Fallback: Basic status count
644
+ status_counts = {}
645
+ for ex in final_annotated_dataset:
646
+ st = ex.get('llm_evaluation_status', 'unknown')
647
+ status_counts[st] = status_counts.get(st, 0) + 1
648
+ print("\n--- Evaluation Status Distribution (Basic) ---")
649
+ print(f"Status: {sorted(status_counts.items())}")
650
+ print("--------------------------------------------------")
651
+ except Exception as e:
652
+ logging.error(f"Error during Pandas analysis: {e}", exc_info=True)
653
+
654
+
655
+ # --- Filtering based on LLM scores ---
656
+ logging.info(f"Filtering annotated dataset based on LLM scores: Quality >= {MIN_QUALITY_SCORE}, Suitability >= {MIN_SUITABILITY_SCORE}")
657
+
658
+ # Define the filtering function (same logic, checks scores)
659
+ def filter_criteria(example):
660
+ q = example.get('llm_quality')
661
+ s = example.get('llm_suitability')
662
+ # Check if scores are valid (not None and not the -1 placeholder) before comparing
663
+ if q is None or q == -1 or s is None or s == -1:
664
+ return False # Filter out entries with missing/invalid scores
665
+ # Apply the thresholds
666
+ passes = q >= MIN_QUALITY_SCORE and s >= MIN_SUITABILITY_SCORE
667
+ # Optional: Add complexity filter here if needed
668
+ # c = example.get('llm_complexity')
669
+ # if c is not None and c != -1 and MAX_COMPLEXITY_SCORE is not None:
670
+ # passes = passes and c <= MAX_COMPLEXITY_SCORE
671
+ return passes
672
+
673
+ # Apply the filter using datasets.filter
674
+ # Use multiple processes if beneficial and safe (check memory usage)
675
+ num_proc_filter = max(1, os.cpu_count() // 2 if os.cpu_count() else 1)
676
+ logging.info(f"Applying filter with num_proc={num_proc_filter}...")
677
+ filtered_llm_dataset = final_annotated_dataset.filter(
678
+ filter_criteria,
679
+ num_proc=num_proc_filter # Adjust based on system resources
680
+ )
681
+ num_filtered = len(filtered_llm_dataset)
682
+ filter_percentage = (num_filtered / num_final_examples * 100) if num_final_examples > 0 else 0
683
+ logging.info(f"LLM-Filtered dataset size: {num_filtered} examples ({filter_percentage:.2f}% of annotated)")
684
+
685
+ # --- Save LLM-Filtered Dataset ---
686
+ logging.info(f"Saving LLM-filtered dataset to: {FILTERED_OUTPUT_PATH}")
687
+ try:
688
+ # Ensure parent directory exists
689
+ os.makedirs(os.path.dirname(FILTERED_OUTPUT_PATH), exist_ok=True)
690
+ # Clean up old filtered data if it exists
691
+ if os.path.exists(FILTERED_OUTPUT_PATH):
692
+ logging.debug(f"Removing existing LLM-filtered directory: {FILTERED_OUTPUT_PATH}")
693
+ shutil.rmtree(FILTERED_OUTPUT_PATH)
694
+ # Save the filtered dataset
695
+ filtered_llm_dataset.save_to_disk(FILTERED_OUTPUT_PATH)
696
+ logging.info("LLM-Filtered dataset saved successfully.")
697
+ except Exception as e:
698
+ logging.error(f"Failed to save LLM-filtered dataset to {FILTERED_OUTPUT_PATH}: {e}", exc_info=True)
699
+
700
+ except Exception as e:
701
+ logging.error(f"Verification/Analysis/Filtering failed on final annotated dataset: {e}", exc_info=True)
702
+
703
+ # --- Script End ---
704
+ end_time = time.time()
705
+ logging.info("------------------------------------------------------")
706
+ logging.info(f"Script finished in {end_time - start_time:.2f} seconds.")
707
+ logging.info(f"Final annotated dataset saved at: {FINAL_OUTPUT_PATH}")
708
+ logging.info(f"LLM-Filtered dataset saved at: {FILTERED_OUTPUT_PATH}")
709
+ logging.info("======================================================")
r1-a/dataset/gsm8k_final_filtered/combined/dataset_info.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "answer": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "source_dataset": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "audio": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "question_type": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "difficulty": {
26
+ "dtype": "string",
27
+ "_type": "Value"
28
+ }
29
+ },
30
+ "homepage": "",
31
+ "license": ""
32
+ }
r1-a/dataset/gsm8k_final_filtered/combined/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "3636165bbeb98bf3",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/gsm8k_final_filtered/test/dataset_info.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "answer": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "source_dataset": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "audio": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "question_type": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "difficulty": {
26
+ "dtype": "string",
27
+ "_type": "Value"
28
+ }
29
+ },
30
+ "homepage": "",
31
+ "license": ""
32
+ }
r1-a/dataset/gsm8k_final_filtered/test/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "19de9358ac0cc73a",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/gsm8k_final_filtered/train/dataset_info.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "answer": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "source_dataset": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "audio": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "question_type": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "difficulty": {
26
+ "dtype": "string",
27
+ "_type": "Value"
28
+ }
29
+ },
30
+ "homepage": "",
31
+ "license": ""
32
+ }
r1-a/dataset/gsm8k_final_filtered/train/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "92a03df5b878397b",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/mtcs_verified/get_response_gpt4o.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Sequence
2
+
3
+ dataset = Dataset.load_from_disk("/root/autodl-tmp/audio-r1/r1-a/dataset/Multi-subject-RLVR_rephrased/train_processed")
4
+ breakpoint()
r1-a/dataset/mtcs_verified/mtcs.py ADDED
File without changes
r1-a/dataset/pku_saferlhf_filtered_unsafe_diverse_hf/dataset_info.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "prompt": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "response_0": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "response_1": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "is_safe_0": {
18
+ "dtype": "bool",
19
+ "_type": "Value"
20
+ },
21
+ "is_safe_1": {
22
+ "dtype": "bool",
23
+ "_type": "Value"
24
+ },
25
+ "involved_harm_categories": {
26
+ "feature": {
27
+ "dtype": "string",
28
+ "_type": "Value"
29
+ },
30
+ "_type": "Sequence"
31
+ },
32
+ "better_response_id": {
33
+ "dtype": "int64",
34
+ "_type": "Value"
35
+ },
36
+ "safer_response_id": {
37
+ "dtype": "int64",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "homepage": "",
42
+ "license": ""
43
+ }
r1-a/dataset/pku_saferlhf_filtered_unsafe_diverse_hf/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "86ec040d4b942521",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/shp2_filtered_tts_high_quality_train_only/dataset_info.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "chosen": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "reject": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "domain": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ }
21
+ },
22
+ "homepage": "",
23
+ "license": ""
24
+ }
r1-a/dataset/shp2_filtered_tts_high_quality_train_only/state.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00002.arrow"
5
+ },
6
+ {
7
+ "filename": "data-00001-of-00002.arrow"
8
+ }
9
+ ],
10
+ "_fingerprint": "d339b25f13802884",
11
+ "_format_columns": null,
12
+ "_format_kwargs": {},
13
+ "_format_type": null,
14
+ "_output_all_columns": false,
15
+ "_split": null
16
+ }