1f committed on
Commit
2a79d4e
·
verified ·
1 Parent(s): f65aa03

Add files using upload-large-folder tool

Browse files
r1-a/dataset/filter/gpt_filter_shp2.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import http.client
3
+ import json
4
+ import time
5
+ import random
6
+ import re # Import regex for parsing
7
+ import pandas as pd # For data distribution analysis
8
+ # Make sure necessary types are imported
9
+ from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Sequence
10
+ from tqdm.auto import tqdm
11
+ import sys
12
+ import logging
13
+ import concurrent.futures
14
+ from concurrent.futures import ThreadPoolExecutor
15
+ import shutil
16
+ import socket # Added for potential error catching, though http.client might cover it
17
+
18
# --- Configuration ---
# --- !! MODIFIED: Point to the pre-filtered dataset !! ---
INPUT_DATA_PATH = "./shp2_filtered_tts_high_quality_train_only" # Path from the previous script's output
# --- Keep other configurations ---
API_HOST = "api2.aigcbest.top"
API_PATH = "/v1/chat/completions"
LLM_MODEL = "gemini-2.5-flash-preview-04-17-nothinking"
# SECURITY NOTE(review): a real-looking API key is hard-coded as the fallback below.
# It should be treated as leaked — rotate it and rely on the env variable only.
API_KEY = os.environ.get('AIGCBEST_API_KEY', "sk-U15cDXxI0bboL6iH4Hymzl30ws6oWzazWe1Ndwq9QtiPUEgI") # Replace or set env variable
if not API_KEY or API_KEY == "YOUR_API_KEY_HERE":
    print("API Key is not set correctly. Please set the AIGCBEST_API_KEY environment variable or replace the placeholder.")
    sys.exit(1)

# --- !! MODIFIED: Update output directory name !! ---
OUTPUT_DIR = f"./shp2_filtered_evaluated" # Reflects evaluation applied to filtered data
# Path to the existing, potentially incomplete, processed dataset (LOAD ONLY) - specific to this script's run
PROCESSED_DATA_PATH = os.path.join(OUTPUT_DIR, f"train_split_evaluated_intermediate") # Use descriptive name
# Path where final results will be saved (SAVE ONLY) - specific to this script's run
FINAL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"train_split_evaluated_final")
# Path for the filtered dataset (based on LLM scores)
FILTERED_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"train_split_llm_filtered")

MAX_WORKERS = 40               # concurrent evaluation threads
REQUEST_DELAY_SECONDS = 0.1    # base jittered delay before each API request
MAX_RETRIES = 4                # attempts per API call before giving up
SAVE_INTERVAL = 1000           # persist intermediate progress every N completed items

# --- Filtering Thresholds (LLM scores) ---
MIN_QUALITY_SCORE = 4
MIN_SUITABILITY_SCORE = 3
# Optional: MAX_COMPLEXITY_SCORE = 4

# Setup logging; quiet down chatty third-party loggers.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
logging.getLogger("filelock").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
55
+
56
# --- LLM API Function (evaluate_prompt_with_llm) ---
# (Keep the same SYSTEM_PROMPT and function definition as before)
# System prompt instructing the LLM to emit a single "Quality/Complexity/
# Suitability/Justification" line; parse_llm_evaluation() depends on this format.
SYSTEM_PROMPT = """
You are an AI Quality Assessor specializing in evaluating prompts for AI models, particularly voice-based assistants.
Your task is to analyze the given user prompt and assign scores based on three metrics: Overall Quality, Complexity, and Voice Response Suitability. You must also provide a brief justification.

**Input:** You will receive a single user prompt.



**Metric Definitions:**

1. **Overall Quality (Score 1-5):** Clarity, coherence, and completeness of the prompt itself.
* 1 (Very Low): Nonsensical, ambiguous, ungrammatical, incomplete template/placeholder text.
* 2 (Low): Vague, poorly worded, significant errors, requires excessive interpretation.
* 3 (Medium): Understandable but could be clearer/more specific. Basic, functional.
* 4 (High): Clear, well-phrased, specific, unambiguous, effective.
* 5 (Very High): Exceptionally clear, concise, specific, well-formulated, ideal.

2. **Complexity (Score 1-5):** Cognitive load/intricacy needed to understand the request and generate the *answer*.
* 1 (Very Simple): Single simple fact, definition, common phrase.
* 2 (Simple): Basic info recall, single calculation, short standard text generation.
* 3 (Moderate): Multi-step reasoning, combining info, comparison, moderately complex text/explanation.
* 4 (Complex): Deep analysis, synthesis, advanced reasoning, creative problem-solving, detailed nuanced text.
* 5 (Very Complex): Highly specialized knowledge, intricate multi-stage problems, long-form creative content, detailed technical procedures.

3. **Voice Response Suitability (Score 1-5):** Is the *expected answer's content* suitable for delivery via voice ONLY? And whether it can be responded to by llm and whether it is suitable to be converted into speech as a sample.
* 1 (Very Unsuitable): Answer requires visuals (graphs, tables, code formatting), UI interaction, or is excessively long/structured (e.g., long lists, large code blocks).
* 2 (Unsuitable): Answer likely very long, complex formatting, significantly easier to parse visually. Poor audio UX.
* 3 (Moderate): Answer might be slightly long or have simple structure (e.g., short lists), but generally digestible via audio. Upper limit for comfort.
* 4 (Suitable): Answer reasonably concise, informational/conversational, easy to understand when spoken.
* 5 (Highly Suitable): Ideal for voice - short facts, direct answers, conversational responses, short creative outputs.

4. **Justification (Brief Text):** 1-2 sentences explaining the scores, especially for low (<3) or unusual scores.
**Output Format:** Respond ONLY with a single string in the following format, replacing bracketed values with your scores and justification. Do NOT include any other text, greetings, or explanations outside this format.

Quality: [1-5], Complexity: [1-5], Suitability: [1-5], Justification: [Your brief justification text here]
**Example Input Prompt:**
"Explain the process of photosynthesis in detail, including the chemical equations and the differences between C3 and C4 pathways."

**Example Output String:**
Quality: 4, Complexity: 4, Suitability: 3, Justification: Clear prompt asking for detailed scientific explanation. Complex topic, potentially long answer making voice suitability moderate.
"""
99
+
100
def evaluate_prompt_with_llm(prompt_text, api_key, host, path, model, retries=MAX_RETRIES):
    """Calls the LLM API to get evaluation scores for a prompt.

    Sends `prompt_text` (with SYSTEM_PROMPT as the system message) to the
    chat-completions endpoint and returns the raw response string — expected
    to look like "Quality: .., Complexity: .., Suitability: .., Justification: .."
    — or None on unrecoverable failure.

    Retry policy: HTTP 429 (honouring Retry-After), 5xx and network errors are
    retried with exponential backoff up to `retries` attempts; other 4xx client
    errors and undecodable responses fail immediately.
    """
    # Guard: cannot evaluate a missing/blank prompt.
    if not prompt_text or not isinstance(prompt_text, str) or not prompt_text.strip():
        logging.warning("evaluate_prompt_with_llm received empty or invalid prompt text.")
        return None  # Cannot evaluate an empty prompt

    payload = json.dumps({
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt_text}
        ],
        "temperature": 0.1,
        "max_tokens": 100
    })
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'User-Agent': 'HuggingFace SHP2-Filtered Evaluation Script', # Updated User-Agent
        'Content-Type': 'application/json'
    }
    # Jittered pre-request delay to spread load across the worker threads.
    time.sleep(random.uniform(REQUEST_DELAY_SECONDS * 0.8, REQUEST_DELAY_SECONDS * 1.2))

    for attempt in range(retries):
        data = None  # defined up-front so the JSONDecodeError handler can always reference it
        try:
            conn = http.client.HTTPSConnection(host, timeout=60)
            # BUGFIX: close the connection in a finally block — previously an
            # exception raised by request()/getresponse()/read() leaked the
            # socket on every failed attempt.
            try:
                conn.request("POST", path, payload, headers)
                res = conn.getresponse()
                status = res.status
                data = res.read()
            finally:
                conn.close()

            if status == 200:
                response_json = json.loads(data.decode("utf-8"))
                if response_json.get("choices") and len(response_json["choices"]) > 0:
                    message = response_json["choices"][0].get("message")
                    if message and message.get("content"):
                        raw_response = message["content"].strip()
                        if raw_response.startswith("Quality:") and "Complexity:" in raw_response and "Suitability:" in raw_response:
                            return raw_response
                        else:
                            logging.warning(f"LLM response format unexpected for prompt '{prompt_text[:50]}...': {raw_response}")
                            return raw_response # Return potentially malformed for parsing attempt
                # 200 but no usable choices/content: log and fall through to retry.
                logging.error(f"Unexpected API response structure: {data.decode('utf-8')}")
            elif status == 429:
                # Rate limited: prefer the server-provided Retry-After header.
                # (getheader is safe post-close; headers are parsed by getresponse.)
                retry_after_header = res.getheader('Retry-After', str(int(REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5))))
                try: wait_time = int(retry_after_header)
                except ValueError: wait_time = REQUEST_DELAY_SECONDS * (2 ** attempt) + random.uniform(1, 5)
                logging.warning(f"Rate limit exceeded (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            elif status >= 500:
                # Transient server error: back off and retry.
                wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5)
                logging.warning(f"Server error (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                # Other 4xx: the request itself is bad — do not retry.
                logging.error(f"API Client Error: Status {status}, Response: {data.decode('utf-8')} for prompt: {prompt_text[:60]}")
                return None

        except (http.client.HTTPException, ConnectionError, socket.gaierror, TimeoutError, socket.timeout) as e: # Added socket errors
            logging.error(f"Network/HTTP error during API call: {e}. Attempt {attempt + 1}/{retries}")
            if attempt + 1 == retries: return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)
        except json.JSONDecodeError as e:
            # Response body was not valid JSON — not worth retrying.
            logging.error(f"Failed to decode API response: {e}. Response snippet: {data[:200] if data else 'N/A'}")
            return None
        except Exception as e:
            logging.error(f"An unexpected error occurred during API call: {e}", exc_info=True)
            if attempt + 1 == retries: return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)

    logging.error(f"API call failed after {retries} retries for prompt: {prompt_text[:60]}...")
    return None
177
+
178
+
179
# --- Parse the structured evaluation string returned by the LLM ---
def parse_llm_evaluation(response_string):
    """Parse "Quality: q, Complexity: c, Suitability: s, Justification: ..." text.

    Returns a 5-tuple (quality, complexity, suitability, justification, status):
    the three scores are ints in 1-5 (None on failure), justification is a
    string, and status is a short machine-readable outcome tag.
    """
    if not response_string:
        return None, None, None, None, "error_empty_response"

    # Strict path: the exact format requested from the model.
    pattern = (
        r"Quality:\s*([1-5])\s*,\s*Complexity:\s*([1-5])\s*,"
        r"\s*Suitability:\s*([1-5])\s*,\s*Justification:\s*(.*)"
    )
    match = re.match(pattern, response_string.strip(), re.IGNORECASE | re.DOTALL)
    if match:
        try:
            quality, complexity, suitability = (int(match.group(i)) for i in (1, 2, 3))
            justification = match.group(4).strip() if match.group(4) else ""
            return quality, complexity, suitability, justification, "success"
        except (ValueError, IndexError):
            logging.warning(f"Parsing failed for matched string (invalid numbers?): {response_string}")
            return None, None, None, None, "error_parsing_matched"

    logging.warning(f"Regex did not match LLM response format: {response_string}")
    # Lenient fallback: scan comma-separated "key: value" fragments.
    scores = {}
    justification = ""
    try:
        for fragment in response_string.split(','):
            fragment = fragment.strip()
            if ':' not in fragment:
                continue
            key, _, raw_val = fragment.partition(':')
            key = key.strip().lower()
            raw_val = raw_val.strip()
            if key == 'justification':
                justification = raw_val
            elif key in ('quality', 'complexity', 'suitability'):
                if raw_val.isdigit() and 1 <= int(raw_val) <= 5:
                    scores[key] = int(raw_val)
        if len(scores) == 3:  # all three score keys were found
            logging.info(f"Fallback parsing successful for: {response_string[:50]}...")
            return scores['quality'], scores['complexity'], scores['suitability'], justification, "success_fallback_parse"
    except Exception as e:
        logging.warning(f"Fallback parsing also failed: {e}")
    return None, None, None, None, "error_parsing_no_match"
223
+
224
+
225
# --- Per-row evaluation worker (targets the 'query' column) ---
def evaluate_dataset_entry(example):
    """Evaluate one dataset row's 'query' text via the LLM.

    Returns a copy of `example` with llm_quality / llm_complexity /
    llm_suitability / llm_justification / llm_evaluation_status filled in.
    """
    result = example.copy()
    # Carry over any prior evaluation fields; default the rest.
    result['llm_quality'] = example.get('llm_quality', None)
    result['llm_complexity'] = example.get('llm_complexity', None)
    result['llm_suitability'] = example.get('llm_suitability', None)
    result['llm_justification'] = example.get('llm_justification', '')
    result['llm_evaluation_status'] = 'processing_retry'

    query_text = example.get("query")
    # Skip rows whose query text is missing, non-string, or blank.
    if not (query_text and isinstance(query_text, str) and query_text.strip()):
        result['llm_evaluation_status'] = 'skipped_invalid_query'
        return result

    raw_response = evaluate_prompt_with_llm(query_text, API_KEY, API_HOST, API_PATH, LLM_MODEL)
    if not raw_response:
        result['llm_evaluation_status'] = 'failed_llm_call'
        return result

    q, c, s, j, parse_status = parse_llm_evaluation(raw_response)
    if parse_status.startswith("success"):
        result["llm_quality"] = q
        result["llm_complexity"] = c
        result["llm_suitability"] = s
        result["llm_justification"] = j
        result['llm_evaluation_status'] = 'success'
    else:
        # Keep the raw text so a later pass can attempt re-parsing.
        result['llm_evaluation_status'] = parse_status
        result['llm_justification'] = f"RAW_RESPONSE: {raw_response}"
    return result
262
+
263
# --- Function to Save Dataset Atomically ---
# (Keep the same save_dataset_atomically function as before)
def save_dataset_atomically(data_list, output_path, features):
    """Saves the list of data dictionaries atomically using the correct schema.

    Writes to `output_path + "_saving"` first, then renames over the final
    path, so a crash mid-save never leaves a half-written dataset behind.
    On failure, falls back to dumping a JSON-Lines file next to the target.
    Returns True on success, False otherwise.
    """
    if not data_list:
        logging.info("No data provided for saving.")
        return False
    temp_output_path = output_path + "_saving"
    final_output_path = output_path
    logging.info(f"Attempting to save {len(data_list)} examples to temp path {temp_output_path}...")
    try:
        processed_data_list = []
        # Handle potential None for integer columns before creating Dataset
        for item in data_list:
            item_copy = item.copy() # Work on a copy
            # Replace None with a placeholder like -1 if the Feature type is integer
            # Or ensure the Feature type allows None (e.g., use Value('float32') or check default behavior)
            # For now, assume Value('int32') might require a number, using -1 as placeholder for None
            for key in ['llm_quality', 'llm_complexity', 'llm_suitability']:
                if item_copy.get(key) is None and isinstance(features[key], Value) and features[key].dtype == 'int32':
                    # logging.debug(f"Replacing None with -1 for int32 field '{key}' in item: {item_copy.get('query', '')[:30]}...")
                    item_copy[key] = -1 # Or other suitable placeholder
            processed_data_list.append(item_copy)

        # Create dataset from the list of dictionaries using the defined features
        processed_dataset = Dataset.from_list(processed_data_list, features=features)
        os.makedirs(os.path.dirname(final_output_path), exist_ok=True)
        if os.path.exists(temp_output_path):
            # A leftover temp dir from a previous crashed save — discard it.
            logging.warning(f"Removing existing temporary save directory: {temp_output_path}")
            shutil.rmtree(temp_output_path)
        processed_dataset.save_to_disk(temp_output_path)
        logging.info(f"Successfully saved dataset to temporary path: {temp_output_path}")
        if os.path.exists(final_output_path):
            logging.debug(f"Removing existing final destination directory before rename: {final_output_path}")
            shutil.rmtree(final_output_path)
        # NOTE(review): there is a brief window between rmtree and rename where
        # the final path does not exist; acceptable for a single-writer script.
        os.rename(temp_output_path, final_output_path)
        logging.info(f"Successfully moved temporary save to final path: {final_output_path}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {final_output_path}: {e}", exc_info=True)
        if os.path.exists(temp_output_path):
            try:
                shutil.rmtree(temp_output_path)
                logging.info(f"Cleaned up temporary directory {temp_output_path} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {temp_output_path} after error: {cleanup_e}")
        # Fallback JSON Lines save
        fallback_json_path = final_output_path + ".jsonl.failed_save"
        logging.warning(f"Attempting fallback save to JSON Lines file: {fallback_json_path}")
        try:
            with open(fallback_json_path, 'w', encoding='utf-8') as f:
                for item in data_list: # Use original list for fallback
                    f.write(json.dumps(dict(item), ensure_ascii=False, default=str) + '\n')
            logging.info(f"Successfully saved fallback JSON Lines file.")
        except Exception as json_e:
            logging.error(f"Fallback JSON save also failed: {json_e}", exc_info=True)
        return False
320
+
321
+
322
# --- Predicate: does this row still need an evaluation attempt? ---
def needs_retry(example):
    """Return True when the example has not been successfully evaluated or skipped."""
    status = example.get('llm_evaluation_status')
    if status == 'success':
        return False
    # Skipped rows (e.g. invalid query text) are final — never retried.
    # str() guards against a non-string / missing status value.
    return not str(status).startswith('skipped_')
329
+
330
+
331
# --- Schema for the pre-filtered SHP-2 data plus LLM evaluation columns ---
def get_filtered_shp2_features_with_evaluation():
    """Return the datasets.Features schema used throughout this script."""
    logging.info(f"Defining features for pre-filtered SHP-2 data + LLM evaluation.")

    schema = {
        # Columns produced by the upstream filtering script.
        # Value(..., id=None) keeps compatibility if an 'id' attribute exists.
        'query': Value(dtype='string', id=None),
        'chosen': Value(dtype='string', id=None),
        'reject': Value(dtype='string', id=None),
        'domain': Value(dtype='string', id=None),
        # LLM evaluation columns. Scores are int32; the save routine replaces
        # None with -1 to satisfy the integer dtype.
        'llm_quality': Value('int32'),
        'llm_complexity': Value('int32'),
        'llm_suitability': Value('int32'),
        'llm_justification': Value('string'),
        'llm_evaluation_status': Value('string'),
    }
    augmented_features = Features(schema)
    logging.info(f"Defined features: {augmented_features}")
    return augmented_features
358
+
359
# --- Main Execution ---
if __name__ == "__main__":
    start_time = time.time()
    logging.info("======================================================")
    logging.info(f" Starting Filtered SHP-2 Dataset Evaluation - {LLM_MODEL}")
    logging.info(f" Input Data Path: {INPUT_DATA_PATH}") # Log input path
    logging.info(f" Output Dir: {OUTPUT_DIR}")
    logging.info(f" Intermediate Save Path: {PROCESSED_DATA_PATH}")
    logging.info(f" Final Annotated Path: {FINAL_OUTPUT_PATH}")
    logging.info(f" LLM-Filtered Output Path: {FILTERED_OUTPUT_PATH}")
    logging.info("======================================================")

    # --- Define Features ---
    dataset_features = get_filtered_shp2_features_with_evaluation()

    # --- Load or Initialize Dataset ---
    # Phase 1: resume from an intermediate save if one exists, otherwise
    # initialize every row from the pre-filtered input with 'pending' status.
    results_list = []
    # Check for intermediate save file from *this* script first
    if os.path.exists(PROCESSED_DATA_PATH):
        logging.info(f"Loading existing intermediate dataset from {PROCESSED_DATA_PATH}...")
        try:
            existing_dataset = Dataset.load_from_disk(PROCESSED_DATA_PATH)
            # Optional: verify features match
            if existing_dataset.features.keys() != dataset_features.keys():
                logging.warning(f"Loaded intermediate dataset features mismatch expected. Trying to continue...")
            results_list = existing_dataset.to_list()
            total_examples = len(results_list)
            logging.info(f"Loaded {total_examples} examples from intermediate save.")
        except Exception as e:
            logging.error(f"Failed to load intermediate dataset from {PROCESSED_DATA_PATH}: {e}", exc_info=True)
            logging.warning("Will attempt to load fresh dataset from input path.")
            results_list = []

    if not results_list:
        # --- MODIFIED: Load from the local pre-filtered dataset path ---
        logging.info(f"Loading pre-filtered dataset from: {INPUT_DATA_PATH}")
        if not os.path.exists(INPUT_DATA_PATH):
            logging.error(f"Input dataset not found at '{INPUT_DATA_PATH}'. Please run the initial filtering script first.")
            sys.exit(1)
        try:
            # Load the dataset generated by the previous script
            original_filtered_dataset = Dataset.load_from_disk(INPUT_DATA_PATH)
            total_examples = len(original_filtered_dataset)
            logging.info(f"Loaded {total_examples} original examples from {INPUT_DATA_PATH}.")

            # Initialize results list with original data + placeholder fields
            results_list = []
            for example in tqdm(original_filtered_dataset, desc="Initializing data"):
                init_example = dict(example) # Make a copy
                # Ensure all base features are present, handle potential missing ones if needed
                init_example['query'] = init_example.get('query', '') # Ensure defaults if schema uncertain
                init_example['chosen'] = init_example.get('chosen', '')
                init_example['reject'] = init_example.get('reject', '')
                init_example['domain'] = init_example.get('domain', '')
                # Add evaluation placeholders
                init_example['llm_quality'] = None
                init_example['llm_complexity'] = None
                init_example['llm_suitability'] = None
                init_example['llm_justification'] = ''
                init_example['llm_evaluation_status'] = 'pending'
                results_list.append(init_example)

            # Perform an initial save to the intermediate path for this script run
            logging.info(f"Performing initial save of placeholder data to {PROCESSED_DATA_PATH}...")
            save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features)
        except Exception as e:
            logging.error(f"Failed to load or initialize dataset from {INPUT_DATA_PATH}: {e}", exc_info=True)
            sys.exit(1)

    # --- Identify Indices to Process/Retry ---
    # Phase 2: collect indices of rows whose status says they still need work.
    logging.info("Identifying examples needing evaluation/retry...")
    indices_to_process = [
        i for i, example in enumerate(tqdm(results_list, desc="Checking examples")) if needs_retry(example)
    ]
    num_to_process = len(indices_to_process)

    if num_to_process == 0:
        logging.info("No examples found needing evaluation/retry based on status.")
        # Ensure final data exists even if no retries needed
        if not os.path.exists(FINAL_OUTPUT_PATH):
            logging.info(f"Copying data from {PROCESSED_DATA_PATH} to final location {FINAL_OUTPUT_PATH}...")
            if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
                logging.info("Dataset copied to final location.")
            else:
                logging.error("Failed to copy dataset to final location.")
    else:
        logging.info(f"Identified {num_to_process} examples to process/retry out of {total_examples}.")
        # --- Concurrent Processing Logic (remains the same structure) ---
        # Phase 3: fan out evaluation over a thread pool, checkpointing every
        # SAVE_INTERVAL completions so an interruption loses little work.
        processed_count_total = 0
        processed_since_last_save = 0
        last_save_time = time.time()
        logging.info("Starting concurrent evaluation with periodic saving...")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(evaluate_dataset_entry, results_list[i]): i
                for i in indices_to_process
            }
            try:
                pbar = tqdm(total=num_to_process, desc="Evaluating queries", unit="query") # Updated desc
                for future in concurrent.futures.as_completed(futures):
                    original_index = futures[future]
                    try:
                        updated_example_dict = future.result()
                        results_list[original_index] = updated_example_dict
                        pbar.set_postfix({"LastStatus": updated_example_dict.get('llm_evaluation_status', 'N/A')}, refresh=True)
                    except Exception as exc:
                        # A worker raised: record the failure on the row so a
                        # later run can retry it.
                        logging.error(f'Evaluation task for index {original_index} encountered an exception: {exc}', exc_info=True)
                        error_placeholder = results_list[original_index].copy()
                        error_placeholder['llm_evaluation_status'] = f'failed_future_exception_{type(exc).__name__}'
                        results_list[original_index] = error_placeholder
                        pbar.set_postfix({"LastStatus": error_placeholder['llm_evaluation_status']}, refresh=True)
                    finally:
                        processed_count_total += 1
                        processed_since_last_save += 1
                        pbar.update(1)
                        # Periodic checkpoint of all rows (not just completed ones).
                        if processed_since_last_save >= SAVE_INTERVAL:
                            current_time = time.time()
                            time_since_last = current_time - last_save_time
                            logging.info(f"\n--- Processed {processed_since_last_save} items (Total this run: {processed_count_total}/{num_to_process}). Time since last save: {time_since_last:.1f}s. Saving progress... ---")
                            # Save intermediate progress to PROCESSED_DATA_PATH
                            if save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features):
                                logging.info(f"--- Progress successfully saved to {PROCESSED_DATA_PATH} ---")
                                processed_since_last_save = 0
                                last_save_time = current_time
                            else:
                                logging.error(f"--- FAILED TO SAVE PROGRESS to {PROCESSED_DATA_PATH}! Check errors. Will retry later. ---")
            except KeyboardInterrupt:
                logging.warning("\nCtrl+C detected! Attempting final save...")
            except Exception as e:
                logging.error(f"An unexpected error occurred during the main processing loop: {e}", exc_info=True)
                logging.error("Attempting final save...")
            finally:
                if 'pbar' in locals() and pbar is not None:
                    pbar.close()
                logging.info("--- Processing loop finished or interrupted. ---")
        # --- Final Save Attempt (to FINAL_OUTPUT_PATH) ---
        logging.info(f"Attempting final save of the fully annotated dataset ({len(results_list)} items) to: {FINAL_OUTPUT_PATH}")
        if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
            logging.info("--- Final annotated dataset state saved successfully. ---")
        else:
            logging.error(f">>> FINAL ANNOTATED SAVE FAILED to {FINAL_OUTPUT_PATH}! <<< Check logs. Fallback JSON/Intermediate data might exist.")

    # --- Post-Processing: Verification, Analysis, Filtering ---
    # Phase 4: reload the final annotated dataset, report score distributions,
    # and write out the subset passing the LLM score thresholds.
    logging.info("======================================================")
    logging.info("Post-Processing: Verification, Analysis, and LLM Filtering")
    logging.info("======================================================")

    # --- Verification of Final Annotated Data ---
    logging.info(f"Verifying and Analyzing final annotated dataset: {FINAL_OUTPUT_PATH}")
    if not os.path.exists(FINAL_OUTPUT_PATH):
        logging.error(f"Final annotated dataset not found at {FINAL_OUTPUT_PATH}. Cannot perform analysis or filtering.")
    else:
        try:
            final_annotated_dataset = Dataset.load_from_disk(FINAL_OUTPUT_PATH)
            num_final_examples = len(final_annotated_dataset)
            logging.info(f"Successfully reloaded final annotated dataset with {num_final_examples} examples.")

            # --- Calculate Score Distributions ---
            logging.info("Calculating score distributions...")
            try:
                df = final_annotated_dataset.to_pandas()
                # Handle the placeholder -1 we might have used for None in integer columns
                df['llm_quality'].replace(-1, pd.NA, inplace=True)
                df['llm_complexity'].replace(-1, pd.NA, inplace=True)
                df['llm_suitability'].replace(-1, pd.NA, inplace=True)

                quality_dist = df['llm_quality'].value_counts(dropna=False).sort_index() # Include NA count
                complexity_dist = df['llm_complexity'].value_counts(dropna=False).sort_index()
                suitability_dist = df['llm_suitability'].value_counts(dropna=False).sort_index()
                status_dist = df['llm_evaluation_status'].value_counts()

                print("\n--- Score Distributions (Annotated Dataset) ---")
                print("\nOverall Quality Distribution (NA indicates missing/placeholder):")
                print(quality_dist)
                print("\nComplexity Distribution (NA indicates missing/placeholder):")
                print(complexity_dist)
                print("\nVoice Response Suitability Distribution (NA indicates missing/placeholder):")
                print(suitability_dist)
                print("\nEvaluation Status Distribution:")
                print(status_dist)
                print("--------------------------------------------------")

            except ImportError:
                logging.warning("Pandas not found. Performing basic counts (may not show None correctly).")
                # Basic counting (less informative about None/-1)
                quality_counts, complexity_counts, suitability_counts, status_counts = {}, {}, {}, {}
                for ex in final_annotated_dataset:
                    q = ex.get('llm_quality', -99) # Use distinct value for missing
                    c = ex.get('llm_complexity', -99)
                    s = ex.get('llm_suitability', -99)
                    st = ex.get('llm_evaluation_status', 'unknown')
                    quality_counts[q] = quality_counts.get(q, 0) + 1
                    complexity_counts[c] = complexity_counts.get(c, 0) + 1
                    suitability_counts[s] = suitability_counts.get(s, 0) + 1
                    status_counts[st] = status_counts.get(st, 0) + 1
                print("\n--- Score Distributions (Annotated Dataset - Basic) ---")
                print(f"Quality (-99=missing): {sorted(quality_counts.items())}")
                print(f"Complexity (-99=missing): {sorted(complexity_counts.items())}")
                print(f"Suitability (-99=missing): {sorted(suitability_counts.items())}")
                print(f"Status: {sorted(status_counts.items())}")
                print("--------------------------------------------------")

            # --- Filtering based on LLM scores ---
            logging.info(f"Filtering annotated dataset: Quality >= {MIN_QUALITY_SCORE}, Suitability >= {MIN_SUITABILITY_SCORE}")

            def filter_criteria(example):
                # Keep only rows whose LLM scores meet both thresholds.
                q = example.get('llm_quality')
                s = example.get('llm_suitability')
                # Handle potential None or placeholder (-1) scores before comparing
                if q is None or q == -1 or s is None or s == -1:
                    return False
                passes = q >= MIN_QUALITY_SCORE and s >= MIN_SUITABILITY_SCORE
                # Optional: Add complexity filter
                # c = example.get('llm_complexity')
                # if c is not None and c != -1 and MAX_COMPLEXITY_SCORE is not None:
                #     passes = passes and c <= MAX_COMPLEXITY_SCORE
                return passes

            # Use num_proc=1 if filtering is fast enough or to avoid potential issues
            filtered_llm_dataset = final_annotated_dataset.filter(filter_criteria, num_proc=max(1, os.cpu_count() // 2))
            num_filtered = len(filtered_llm_dataset)
            logging.info(f"LLM-Filtered dataset size: {num_filtered} examples ({num_filtered / num_final_examples:.2%} of annotated)")

            # --- Save LLM-Filtered Dataset ---
            logging.info(f"Saving LLM-filtered dataset to: {FILTERED_OUTPUT_PATH}")
            try:
                os.makedirs(os.path.dirname(FILTERED_OUTPUT_PATH), exist_ok=True)
                if os.path.exists(FILTERED_OUTPUT_PATH):
                    logging.debug(f"Removing existing LLM-filtered directory: {FILTERED_OUTPUT_PATH}")
                    shutil.rmtree(FILTERED_OUTPUT_PATH)
                filtered_llm_dataset.save_to_disk(FILTERED_OUTPUT_PATH)
                logging.info("LLM-Filtered dataset saved successfully.")
            except Exception as e:
                logging.error(f"Failed to save LLM-filtered dataset to {FILTERED_OUTPUT_PATH}: {e}", exc_info=True)

        except Exception as e:
            logging.error(f"Verification/Analysis/Filtering failed on final annotated dataset: {e}", exc_info=True)

    # --- Script End ---
    end_time = time.time()
    logging.info("------------------------------------------------------")
    logging.info(f"Script finished in {end_time - start_time:.2f} seconds.")
    logging.info(f"Final annotated dataset saved at: {FINAL_OUTPUT_PATH}")
    logging.info(f"LLM-Filtered dataset saved at: {FILTERED_OUTPUT_PATH}")
    logging.info("======================================================")
r1-a/dataset/filter/gsm8k.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from datasets import load_from_disk, Dataset
4
+
5
+ # --- 配置参数 ---
6
+ INPUT_BASE = '/root/autodl-tmp/audio-r1/r1-a/dataset/gsm8k_with_audio'
7
+ OUTPUT_BASE = './gsm8k_final_filtered'
8
+
9
+ os.makedirs(OUTPUT_BASE, exist_ok=True)
10
+
11
+ # --- 过滤函数(同之前) ---
12
def is_suitable_for_tts_question(q: str, min_words: int = 5, max_words: int = 100,
                                 max_commas: int = 2) -> bool:
    """Heuristically decide whether a question text is suitable for TTS synthesis.

    A question passes when it is neither too short nor too long, contains no
    symbols that read poorly when spoken aloud, and is not overly list-like.

    Args:
        q: The question text to check.
        min_words: Minimum whitespace-separated word count (default 5).
        max_words: Maximum whitespace-separated word count (default 100).
        max_commas: Maximum number of commas allowed (default 2).

    Returns:
        True if the question is suitable for TTS, False otherwise.
    """
    words = q.split()
    # Reject questions that are too short to be meaningful or too long to speak.
    if not (min_words <= len(words) <= max_words):
        return False
    # Symbols like (), [], /, ^, <, > do not verbalize naturally.
    if re.search(r'[()\[\]/^<>]', q):
        return False
    # Many commas usually indicate an enumeration that is awkward as speech.
    if q.count(',') > max_commas:
        return False
    return True
21
+
22
# --- Process each split: filter for TTS suitability and re-shape records ---
all_samples = []  # accumulates kept records across all splits for the combined set
for split_name in os.listdir(INPUT_BASE):
    # Each split directory is expected to contain a 'final_dataset' subfolder.
    split_dir = os.path.join(INPUT_BASE, split_name, 'final_dataset')
    if not os.path.isdir(split_dir):
        continue
    print(f"→ Loading split '{split_name}'")
    ds = load_from_disk(split_dir)

    filtered = []
    for ex in ds:
        q = ex.get('question_text', '')
        wav = ex.get('audio_filepath', '')
        # Skip examples with no audio path or a missing audio file on disk.
        if not wav or not os.path.exists(wav):
            continue
        # Skip questions unsuitable for TTS (too short/long, odd symbols, listy).
        if not is_suitable_for_tts_question(q):
            continue
        # Normalize into the downstream record schema.
        rec = {
            'query': q,
            'answer': ex.get('answer', ''),
            'source_dataset': "gsm8k",
            'audio': wav,
            'question_type': 'Math',
            'difficulty': ''
        }
        filtered.append(rec)
        all_samples.append(rec)

    print(f" Kept {len(filtered)}/{len(ds)} examples in '{split_name}'")
    # Save this split's filtered subset.
    out_dir = os.path.join(OUTPUT_BASE, split_name)
    os.makedirs(out_dir, exist_ok=True)
    Dataset.from_list(filtered).save_to_disk(out_dir)

# --- Optional: merge all splits into one combined dataset ---
print("→ Saving combined dataset")
combined_dir = os.path.join(OUTPUT_BASE, 'combined')
os.makedirs(combined_dir, exist_ok=True)
Dataset.from_list(all_samples).save_to_disk(combined_dir)
print(f"Total kept examples: {len(all_samples)}")
r1-a/dataset/filter/shp2_final.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from datasets import load_dataset, Dataset, Features, Value
4
+ import logging
5
+ import math
6
+ import shutil
7
+ import time
8
+
9
+ # --- Configuration ---
10
+ # Path to the LLM-filtered dataset created by the previous script
11
+ # !! Make sure this matches the FILTERED_OUTPUT_PATH from the previous script !!
12
+ INPUT_LLM_FILTERED_PATH = "./shp2_filtered_evaluated/train_split_llm_filtered"
13
+
14
+ # Output directory for the final top 20% dataset
15
+ OUTPUT_DIR_FINAL_SELECTION = "./shp2_final_top20_percent"
16
+ FINAL_DATASET_PATH = os.path.join(OUTPUT_DIR_FINAL_SELECTION, "train_split_top20_percent_by_complexity")
17
+
18
+ # Percentage to select from each complexity group
19
+ TOP_PERCENTAGE = 20.0
20
+
21
+ # --- Setup Logging ---
22
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
+ logging.getLogger("datasets").setLevel(logging.WARNING)
24
+ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
25
+ logging.getLogger("filelock").setLevel(logging.WARNING)
26
+ logging.getLogger("pandas").setLevel(logging.WARNING) # Keep pandas less verbose
27
+
28
+ # --- Function to Save Dataset Atomically (Adapted for Dataset object) ---
29
def save_dataset_atomically(dataset_to_save, output_path):
    """Saves a Hugging Face Dataset object atomically.

    Writes the dataset to a sibling ``output_path + "_saving"`` directory first
    and only renames it into place after the write succeeds, so a crash mid-save
    never leaves a half-written dataset at ``output_path``.

    Args:
        dataset_to_save: ``datasets.Dataset`` to persist; skipped when falsy or empty.
        output_path: Final directory where the dataset should live.

    Returns:
        bool: True on success, False when the dataset is empty or any step fails.
    """
    if not dataset_to_save or len(dataset_to_save) == 0:
        logging.warning(f"No data provided or dataset is empty. Skipping save for {output_path}.")
        return False
    temp_output_path = output_path + "_saving"
    final_output_path = output_path
    logging.info(f"Attempting to save {len(dataset_to_save)} examples to temp path {temp_output_path}...")
    try:
        # Ensure output directory exists (parent of the final destination)
        os.makedirs(os.path.dirname(final_output_path), exist_ok=True)
        # Remove existing temp directory if it exists (stale leftover from a crashed run)
        if os.path.exists(temp_output_path):
            logging.warning(f"Removing existing temporary save directory: {temp_output_path}")
            shutil.rmtree(temp_output_path)
        # Save to temporary path
        dataset_to_save.save_to_disk(temp_output_path)
        logging.info(f"Successfully saved dataset to temporary path: {temp_output_path}")
        # Remove final destination if it exists, so the rename replaces it wholesale
        if os.path.exists(final_output_path):
            logging.debug(f"Removing existing final destination directory before rename: {final_output_path}")
            shutil.rmtree(final_output_path)
        # Move temporary to final destination (atomic on the same filesystem)
        os.rename(temp_output_path, final_output_path)
        logging.info(f"Successfully moved temporary save to final path: {final_output_path}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {final_output_path}: {e}", exc_info=True)
        # Cleanup temp directory on failure so retries start clean
        if os.path.exists(temp_output_path):
            try:
                shutil.rmtree(temp_output_path)
                logging.info(f"Cleaned up temporary directory {temp_output_path} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {temp_output_path} after error: {cleanup_e}")
        return False
65
+
66
# --- Main Execution ---
# Pipeline: load the LLM-filtered dataset -> convert to pandas -> per complexity
# level keep the top TOP_PERCENTAGE% by (quality desc, suitability desc) ->
# convert back to a Dataset with the original schema -> save atomically.
if __name__ == "__main__":
    start_time = time.time()
    logging.info("===============================================================")
    logging.info(" Starting Final Selection: Top 20% by Complexity, Quality & Suitability")
    logging.info(f" Input LLM-Filtered Dataset Path: {INPUT_LLM_FILTERED_PATH}")
    logging.info(f" Output Final Dataset Path: {FINAL_DATASET_PATH}")
    logging.info(f" Selection Percentage per Complexity Group: {TOP_PERCENTAGE}%")
    logging.info("===============================================================")

    # --- Load the LLM-Filtered Dataset ---
    if not os.path.exists(INPUT_LLM_FILTERED_PATH):
        logging.error(f"Input dataset not found at '{INPUT_LLM_FILTERED_PATH}'.")
        logging.error("Please ensure the previous script ran successfully and produced the dataset.")
        exit(1)

    try:
        logging.info(f"Loading dataset from {INPUT_LLM_FILTERED_PATH}...")
        llm_filtered_dataset = Dataset.load_from_disk(INPUT_LLM_FILTERED_PATH)
        logging.info(f"Successfully loaded dataset with {len(llm_filtered_dataset)} examples.")
        # Store features for later conversion back to Dataset (keeps schema identical)
        original_features = llm_filtered_dataset.features
        logging.info(f"Original features: {original_features}")
    except Exception as e:
        logging.error(f"Failed to load dataset from {INPUT_LLM_FILTERED_PATH}: {e}", exc_info=True)
        exit(1)

    # --- Convert to Pandas DataFrame ---
    try:
        df = llm_filtered_dataset.to_pandas()
        logging.info("Converted dataset to Pandas DataFrame.")
        # Basic check for required columns
        required_cols = ['llm_complexity', 'llm_quality', 'llm_suitability']
        if not all(col in df.columns for col in required_cols):
            logging.error(f"DataFrame is missing one or more required columns: {required_cols}")
            exit(1)
        # Handle potential placeholder values (-1) if they were used for None during saving
        for col in ['llm_quality', 'llm_complexity', 'llm_suitability']:
            if col in df.columns:
                # Replace -1 with NaN for proper handling if necessary
                # df[col] = df[col].replace(-1, pd.NA) # Use pd.NA for nullable integers
                pass  # Assuming valid scores (>=1) in the filtered dataset from previous step

        # Drop rows with missing essential scores (shouldn't happen if filtered correctly, but good practice)
        initial_count = len(df)
        df.dropna(subset=required_cols, inplace=True)
        if len(df) < initial_count:
            logging.warning(f"Dropped {initial_count - len(df)} rows with missing essential scores (quality, complexity, suitability).")

        # Ensure scores are numeric for sorting/grouping below
        df['llm_quality'] = pd.to_numeric(df['llm_quality'])
        df['llm_complexity'] = pd.to_numeric(df['llm_complexity'])
        df['llm_suitability'] = pd.to_numeric(df['llm_suitability'])

    except ImportError:
        logging.error("Pandas library is required for this script. Please install it (`pip install pandas`).")
        exit(1)
    except Exception as e:
        logging.error(f"Error during DataFrame conversion or preparation: {e}", exc_info=True)
        exit(1)

    if df.empty:
        logging.error("DataFrame is empty after loading and preparation. Cannot proceed.")
        exit(1)

    # --- Group by Complexity and Select Top 20% ---
    logging.info("Grouping by complexity and selecting top 20% based on quality and suitability...")
    all_selected_dfs = []
    total_selected_count = 0

    grouped = df.groupby('llm_complexity')

    complexity_levels_found = sorted(df['llm_complexity'].unique())
    logging.info(f"Found data for complexity levels: {complexity_levels_found}")

    for complexity_level, group_df in grouped:
        group_size = len(group_df)
        logging.info(f"\nProcessing Complexity Level: {complexity_level} (Size: {group_size})")

        if group_size == 0:
            logging.info(" -> Group is empty, skipping.")
            continue

        # Calculate number of items to select (top N)
        # Use math.ceil to ensure at least one item is selected if percentage > 0 and group > 0
        num_to_select = math.ceil(group_size * (TOP_PERCENTAGE / 100.0))
        logging.info(f" -> Target top {TOP_PERCENTAGE}% = {num_to_select} items.")

        # Sort by Quality (desc), then Suitability (desc)
        # Higher quality is better, higher suitability is better
        sorted_group = group_df.sort_values(
            by=['llm_quality', 'llm_suitability'],
            ascending=[False, False]  # Both descending
        )

        # Select the top N rows
        selected_df = sorted_group.head(num_to_select)
        all_selected_dfs.append(selected_df)
        logging.info(f" -> Selected {len(selected_df)} items for complexity {complexity_level}.")
        total_selected_count += len(selected_df)

    # --- Combine Selected DataFrames ---
    if not all_selected_dfs:
        logging.error("No data was selected from any complexity group. Final dataset will be empty.")
        final_df = pd.DataFrame(columns=df.columns)  # Create empty df with same columns
    else:
        logging.info(f"\nCombining selected data from all complexity groups...")
        final_df = pd.concat(all_selected_dfs, ignore_index=True)
        logging.info(f"Combined DataFrame created with {len(final_df)} total selected examples.")
        logging.info(f"Original number of examples in filtered input: {initial_count}")  # Use count before dropna
        logging.info(f"Final number of examples after top 20% selection: {total_selected_count}")

        # Optional: Log distribution in the final dataset
        print("\n--- Complexity Distribution in Final Selected Dataset ---")
        print(final_df['llm_complexity'].value_counts().sort_index())
        print("---------------------------------------------------------")
        print("\n--- Quality Distribution in Final Selected Dataset ---")
        print(final_df['llm_quality'].value_counts().sort_index())
        print("-------------------------------------------------------")
        print("\n--- Suitability Distribution in Final Selected Dataset ---")
        print(final_df['llm_suitability'].value_counts().sort_index())
        print("----------------------------------------------------------")

    # --- Convert back to Hugging Face Dataset using original features ---
    try:
        # Ensure the DataFrame columns match the original features before conversion
        # Select only columns present in the original features schema
        columns_to_keep = list(original_features.keys())
        final_df_aligned = final_df[columns_to_keep]

        final_dataset = Dataset.from_pandas(final_df_aligned, features=original_features, preserve_index=False)
        logging.info("Successfully converted final Pandas DataFrame back to Hugging Face Dataset.")
    except Exception as e:
        logging.error(f"Failed to convert final DataFrame back to Dataset: {e}", exc_info=True)
        logging.warning("Attempting to save the final DataFrame as a CSV as a fallback.")
        fallback_csv_path = FINAL_DATASET_PATH + ".csv"
        try:
            os.makedirs(os.path.dirname(fallback_csv_path), exist_ok=True)
            final_df.to_csv(fallback_csv_path, index=False)
            logging.info(f"Fallback CSV saved to {fallback_csv_path}")
        except Exception as csv_e:
            logging.error(f"Failed to save fallback CSV: {csv_e}", exc_info=True)
        exit(1)  # Exit after attempting fallback save

    # --- Save the Final Dataset ---
    logging.info(f"Saving the final selected dataset ({len(final_dataset)} examples) to: {FINAL_DATASET_PATH}")
    save_successful = save_dataset_atomically(final_dataset, FINAL_DATASET_PATH)

    if save_successful:
        logging.info("Final dataset saved successfully.")
    else:
        logging.error(f"Failed to save the final dataset to {FINAL_DATASET_PATH}.")

    # --- Script End ---
    end_time = time.time()
    logging.info("------------------------------------------------------")
    logging.info(f"Script finished in {end_time - start_time:.2f} seconds.")
    logging.info(f"Final top {TOP_PERCENTAGE}% dataset saved at: {FINAL_DATASET_PATH}" if save_successful else "Final dataset saving failed.")
    logging.info("======================================================")
r1-a/dataset/filter/ultra_final.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ # Make sure necessary types are imported
4
+ from datasets import load_dataset, Dataset, Features, Value
5
+ import logging
6
+ import math
7
+ import shutil
8
+ import time
9
+
10
+ # --- Configuration ---
11
+ # --- !! MODIFIED: Point to the LLM-filtered UltraChat dataset !! ---
12
+ # This should match the FILTERED_OUTPUT_PATH from the UltraChat evaluation script
13
+ INPUT_LLM_FILTERED_PATH = "./ultrachat_evaluated/ultrachat_llm_filtered"
14
+
15
+ # --- !! MODIFIED: Update output directory names for UltraChat !! ---
16
+ OUTPUT_DIR_FINAL_SELECTION = "./ultrachat_final_top20_percent" # New output directory
17
+ FINAL_DATASET_PATH = os.path.join(OUTPUT_DIR_FINAL_SELECTION, "ultrachat_top20_percent_by_complexity") # New output dataset name
18
+
19
+ # Percentage to select from each complexity group (keep at 20% or adjust as needed)
20
+ TOP_PERCENTAGE = 20.0
21
+
22
+ # --- Setup Logging ---
23
+ # Keep logging setup the same
24
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
25
+ logging.getLogger("datasets").setLevel(logging.WARNING)
26
+ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
27
+ logging.getLogger("filelock").setLevel(logging.WARNING)
28
+ logging.getLogger("pandas").setLevel(logging.WARNING)
29
+
30
+ # --- Function to Save Dataset Atomically ---
31
+ # Keep this function exactly the same
32
def save_dataset_atomically(dataset_to_save, output_path):
    """Atomically persist a Hugging Face ``Dataset`` to ``output_path``.

    The dataset is first written to a sibling ``*_saving`` directory and then
    renamed into place, so readers never observe a partially-written dataset.

    Args:
        dataset_to_save: ``datasets.Dataset`` to persist; skipped when falsy/empty.
        output_path: Final directory for the saved dataset.

    Returns:
        bool: True when the save completed, False when the dataset is empty
        or any step of the save failed.
    """
    if not dataset_to_save or len(dataset_to_save) == 0:
        logging.warning(f"No data provided or dataset is empty. Skipping save for {output_path}.")
        return False

    tmp_dir = output_path + "_saving"
    dest_dir = output_path
    logging.info(f"Attempting to save {len(dataset_to_save)} examples to temp path {tmp_dir}...")
    try:
        # The destination's parent must exist before the final rename.
        os.makedirs(os.path.dirname(dest_dir), exist_ok=True)

        # Discard any stale temp directory left behind by a crashed run.
        if os.path.exists(tmp_dir):
            logging.warning(f"Removing existing temporary save directory: {tmp_dir}")
            shutil.rmtree(tmp_dir)

        dataset_to_save.save_to_disk(tmp_dir)
        logging.info(f"Successfully saved dataset to temporary path: {tmp_dir}")

        # Replace an existing destination wholesale before renaming over it.
        if os.path.exists(dest_dir):
            logging.debug(f"Removing existing final destination directory before rename: {dest_dir}")
            shutil.rmtree(dest_dir)

        os.rename(tmp_dir, dest_dir)
        logging.info(f"Successfully moved temporary save to final path: {dest_dir}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {dest_dir}: {e}", exc_info=True)
        # Best-effort removal of the temp directory so a retry starts clean.
        if os.path.exists(tmp_dir):
            try:
                shutil.rmtree(tmp_dir)
                logging.info(f"Cleaned up temporary directory {tmp_dir} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {tmp_dir} after error: {cleanup_e}")
        return False
68
+
69
# --- Main Execution ---
# Pipeline: load the LLM-filtered UltraChat dataset -> clean placeholder scores
# -> per complexity level keep the top TOP_PERCENTAGE% by (quality desc,
# suitability desc) -> convert back to a Dataset with the original schema ->
# save atomically.
if __name__ == "__main__":
    start_time = time.time()
    logging.info("===============================================================")
    logging.info(" Starting UltraChat Final Selection: Top 20% by Complexity, Quality & Suitability")
    logging.info(f" Input LLM-Filtered Dataset Path: {INPUT_LLM_FILTERED_PATH}")
    logging.info(f" Output Final Dataset Path: {FINAL_DATASET_PATH}")
    logging.info(f" Selection Percentage per Complexity Group: {TOP_PERCENTAGE}%")
    logging.info("===============================================================")

    # --- Load the LLM-Filtered Dataset ---
    if not os.path.exists(INPUT_LLM_FILTERED_PATH):
        logging.error(f"Input dataset not found at '{INPUT_LLM_FILTERED_PATH}'.")
        logging.error("Please ensure the UltraChat LLM evaluation script ran successfully and produced the dataset.")
        exit(1)

    try:
        logging.info(f"Loading dataset from {INPUT_LLM_FILTERED_PATH}...")
        llm_filtered_dataset = Dataset.load_from_disk(INPUT_LLM_FILTERED_PATH)
        logging.info(f"Successfully loaded dataset with {len(llm_filtered_dataset)} examples.")
        # Store features for later conversion back to Dataset (keeps schema identical)
        original_features = llm_filtered_dataset.features
        logging.info(f"Original features: {original_features}")
        # Check if essential score columns exist in the loaded features
        if not all(col in original_features for col in ['llm_complexity', 'llm_quality', 'llm_suitability']):
            logging.error(f"Loaded dataset from '{INPUT_LLM_FILTERED_PATH}' is missing one or more required score columns (llm_quality, llm_complexity, llm_suitability). Cannot proceed.")
            exit(1)
    except Exception as e:
        logging.error(f"Failed to load dataset from {INPUT_LLM_FILTERED_PATH}: {e}", exc_info=True)
        exit(1)

    # --- Convert to Pandas DataFrame ---
    try:
        df = llm_filtered_dataset.to_pandas()
        logging.info("Converted dataset to Pandas DataFrame.")
        required_cols = ['llm_complexity', 'llm_quality', 'llm_suitability']  # These are needed for filtering

        # Handle potential placeholder values (-1) used for None during saving in the previous step
        # Replace them with pd.NA for correct handling by dropna and numeric conversion
        for col in required_cols:
            if col in df.columns:
                df[col] = df[col].replace(-1, pd.NA)

        # Drop rows with missing essential scores AFTER replacing placeholder
        initial_count = len(df)
        df.dropna(subset=required_cols, inplace=True)
        dropped_count = initial_count - len(df)
        if dropped_count > 0:
            logging.warning(f"Dropped {dropped_count} rows with missing essential scores (quality, complexity, suitability) after handling placeholders.")

        # Ensure scores are numeric (should be okay after dropna, but good practice)
        # Using 'integer' dtype allows pd.NA
        df['llm_quality'] = df['llm_quality'].astype('Int64')  # Use nullable integer type
        df['llm_complexity'] = df['llm_complexity'].astype('Int64')
        df['llm_suitability'] = df['llm_suitability'].astype('Int64')

    except ImportError:
        logging.error("Pandas library is required for this script. Please install it (`pip install pandas`).")
        exit(1)
    except Exception as e:
        logging.error(f"Error during DataFrame conversion or preparation: {e}", exc_info=True)
        exit(1)

    if df.empty:
        logging.error("DataFrame is empty after loading and cleaning (dropping NA scores). Cannot proceed.")
        exit(1)

    # --- Group by Complexity and Select Top 20% ---
    # This core logic relies only on the standard score column names
    logging.info("Grouping by complexity and selecting top 20% based on quality and suitability...")
    all_selected_dfs = []
    total_selected_count = 0

    # Ensure complexity column is suitable for grouping (already converted to Int64)
    grouped = df.groupby('llm_complexity')

    # Get unique complexity levels present in the cleaned data
    complexity_levels_found = sorted(df['llm_complexity'].dropna().unique())
    logging.info(f"Found data for complexity levels: {complexity_levels_found}")

    for complexity_level in complexity_levels_found:
        # Need to handle potential NA group if groupby includes NA keys (usually doesn't by default)
        if pd.isna(complexity_level):
            continue

        group_df = grouped.get_group(complexity_level)
        group_size = len(group_df)
        logging.info(f"\nProcessing Complexity Level: {complexity_level} (Size: {group_size})")

        if group_size == 0:
            logging.info(" -> Group is empty, skipping.")  # Should not happen with get_group after unique()
            continue

        # Calculate number of items to select (top N)
        num_to_select = math.ceil(group_size * (TOP_PERCENTAGE / 100.0))
        # Ensure num_to_select is not greater than group_size (can happen with ceil and small groups)
        num_to_select = min(num_to_select, group_size)
        logging.info(f" -> Target top {TOP_PERCENTAGE}% = {num_to_select} items.")

        # Sort by Quality (desc), then Suitability (desc)
        sorted_group = group_df.sort_values(
            by=['llm_quality', 'llm_suitability'],
            ascending=[False, False]  # Both descending
        )

        # Select the top N rows
        selected_df = sorted_group.head(num_to_select)
        all_selected_dfs.append(selected_df)
        logging.info(f" -> Selected {len(selected_df)} items for complexity {complexity_level}.")
        total_selected_count += len(selected_df)

    # --- Combine Selected DataFrames ---
    if not all_selected_dfs:
        logging.error("No data was selected from any complexity group. Final dataset will be empty.")
        final_df = pd.DataFrame(columns=df.columns)  # Create empty df with same columns
    else:
        logging.info(f"\nCombining selected data from all complexity groups...")
        final_df = pd.concat(all_selected_dfs, ignore_index=True)
        logging.info(f"Combined DataFrame created with {len(final_df)} total selected examples.")
        # Use initial_count (before dropna) for comparison basis
        original_valid_score_count = initial_count - dropped_count
        logging.info(f"Original number of examples with valid scores in input: {original_valid_score_count}")
        logging.info(f"Final number of examples after top {TOP_PERCENTAGE}% selection: {total_selected_count}")

        # Log distribution in the final selected dataset
        print("\n--- Complexity Distribution in Final Selected Dataset ---")
        print(final_df['llm_complexity'].value_counts().sort_index())
        print("---------------------------------------------------------")
        print("\n--- Quality Distribution in Final Selected Dataset ---")
        print(final_df['llm_quality'].value_counts().sort_index())
        print("-------------------------------------------------------")
        print("\n--- Suitability Distribution in Final Selected Dataset ---")
        print(final_df['llm_suitability'].value_counts().sort_index())
        print("----------------------------------------------------------")

    # --- Convert back to Hugging Face Dataset using original features ---
    # Crucial to use original_features so the output schema matches the input
    try:
        # Ensure the DataFrame columns match the original features before conversion
        # Select only columns present in the original features schema to avoid errors
        columns_to_keep = list(original_features.keys())
        # Check if all original columns still exist in final_df (they should)
        final_df_aligned = final_df[columns_to_keep]

        # Convert nullable Int64 back to standard int types if necessary for Features definition
        # (HuggingFace handles standard int types well, usually no explicit cast needed here if Features are correct)
        # E.g., if original_features['llm_quality'] was Value('int32'), pandas Int64 is compatible

        # Create the Dataset object using the original features definition
        final_dataset = Dataset.from_pandas(final_df_aligned, features=original_features, preserve_index=False)
        logging.info("Successfully converted final Pandas DataFrame back to Hugging Face Dataset.")
    except Exception as e:
        logging.error(f"Failed to convert final DataFrame back to Dataset: {e}", exc_info=True)
        logging.warning("Attempting to save the final DataFrame as a CSV as a fallback.")
        # Make sure fallback path uses the correct final dataset path base
        fallback_csv_path = FINAL_DATASET_PATH + ".csv"
        try:
            os.makedirs(os.path.dirname(fallback_csv_path), exist_ok=True)
            final_df.to_csv(fallback_csv_path, index=False)
            logging.info(f"Fallback CSV saved to {fallback_csv_path}")
        except Exception as csv_e:
            logging.error(f"Failed to save fallback CSV: {csv_e}", exc_info=True)
        exit(1)  # Exit after attempting fallback save

    # --- Save the Final Dataset ---
    logging.info(f"Saving the final selected UltraChat dataset ({len(final_dataset)} examples) to: {FINAL_DATASET_PATH}")
    save_successful = save_dataset_atomically(final_dataset, FINAL_DATASET_PATH)

    if save_successful:
        logging.info("Final dataset saved successfully.")
    else:
        logging.error(f"Failed to save the final dataset to {FINAL_DATASET_PATH}.")

    # --- Script End ---
    end_time = time.time()
    logging.info("------------------------------------------------------")
    logging.info(f"UltraChat Selection Script finished in {end_time - start_time:.2f} seconds.")
    logging.info(f"Final top {TOP_PERCENTAGE}% UltraChat dataset saved at: {FINAL_DATASET_PATH}" if save_successful else "Final dataset saving failed.")
    logging.info("======================================================")
r1-a/dataset/filter/ultrachat_gpt.py ADDED
@@ -0,0 +1,709 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import http.client
3
+ import json
4
+ import time
5
+ import random
6
+ import re
7
+ import pandas as pd
8
+ from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Sequence
9
+ from tqdm.auto import tqdm
10
+ import sys
11
+ import logging
12
+ import concurrent.futures
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ import shutil
15
+ import socket
16
+
17
+ # --- Configuration ---
18
+ # --- !! MODIFIED: Point to the pre-filtered UltraChat dataset !! ---
19
+ INPUT_DATA_PATH = "/root/autodl-tmp/audio-r1/r1-a/dataset/ultrachat_filtered_for_tts_preference_v3_nocode" # Path from the UltraChat filtering script's output
20
+
21
+ # --- Keep API configurations ---
22
+ API_HOST = "api2.aigcbest.top"
23
+ API_PATH = "/v1/chat/completions"
24
+ LLM_MODEL = "gpt-4.1-mini-2025-04-14" # Or consider gpt-4-turbo if available and cheaper for long context
25
+ API_KEY = os.environ.get('AIGCBEST_API_KEY', "sk-N8IsyCniMZoVpa0zn0IYQMY0b0Py53WyFxmNag4vtnzCtXeA") # Replace or set env variable
26
+ if not API_KEY or API_KEY == "YOUR_API_KEY_HERE":
27
+ print("API Key is not set correctly. Please set the AIGCBEST_API_KEY environment variable or replace the placeholder.")
28
+ sys.exit(1)
29
+
30
+ # --- !! MODIFIED: Update output directory names for UltraChat !! ---
31
+ OUTPUT_DIR = f"./ultrachat_evaluated" # Base directory for evaluated UltraChat
32
+ PROCESSED_DATA_PATH = os.path.join(OUTPUT_DIR, f"ultrachat_evaluated_intermediate") # Intermediate save file for this run
33
+ FINAL_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"ultrachat_evaluated_final") # Final annotated data
34
+ FILTERED_OUTPUT_PATH = os.path.join(OUTPUT_DIR, f"ultrachat_llm_filtered") # Final filtered data
35
+
36
+ # --- Keep processing configurations ---
37
+ MAX_WORKERS = 40
38
+ REQUEST_DELAY_SECONDS = 0.1
39
+ MAX_RETRIES = 4
40
+ SAVE_INTERVAL = 1000
41
+
42
+ # --- Filtering Thresholds (LLM scores) - Can be adjusted after seeing distributions ---
43
+ MIN_QUALITY_SCORE = 3
44
+ MIN_SUITABILITY_SCORE = 3
45
+ # Optional: MAX_COMPLEXITY_SCORE = 4
46
+
47
+ # Setup logging (keep as is)
48
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
49
+ # ... (keep other logging level settings) ...
50
+
51
# --- !! MODIFIED: Updated LLM System Prompt for Multi-Turn Context !! ---
# System prompt sent with every evaluation request. It instructs the model to
# score the *current user turn*, given the prior dialogue, on three 1-5 metrics
# (Quality, Complexity, Voice Suitability) and to reply in the single
# fixed-format line that parse_llm_evaluation() expects.
SYSTEM_PROMPT = """
You are an AI Quality Assessor evaluating user queries within multi-turn conversations for AI voice assistants.
Your task is to analyze the **Current User Query** in the context of the preceding **Conversation History**. Assign scores based on three metrics: Overall Quality, Complexity, and Voice Response Suitability. Provide a brief justification.

**Input:** You will receive the conversation history followed by the current user query.

**Output Format:** Respond ONLY with a single string in the following format, replacing bracketed values with your scores and justification. Do NOT include any other text, greetings, or explanations outside this format.

Quality: [1-5], Complexity: [1-5], Suitability: [1-5], Justification: [Your brief justification text here]

**Metric Definitions:**

1. **Overall Quality (Score 1-5):** Clarity, coherence, relevance, and grammatical correctness of the **Current User Query** *considering the Conversation History*.
* 1 (Very Low): Nonsensical, irrelevant to history, ungrammatical, contains corrupted placeholders, abrupt unrelated topic shift without clear transition.
* 2 (Low): Vague, poorly worded, slightly off-topic, requires significant interpretation *even with history*, minor grammatical errors.
* 3 (Medium): Understandable, generally relevant, reasonably phrased. Might be a simple follow-up or a slightly generic query. Acceptable.
* 4 (High): Clear, well-phrased, specific, directly relevant to the history or a natural conversation progression. Good standalone query even if it builds on context.
* 5 (Very High): Exceptionally clear, concise, specific, contextually relevant, and well-formulated. Represents a natural and effective conversational turn.

2. **Complexity (Score 1-5):** Cognitive load required for the AI to understand the *history + current query* and generate the *next appropriate assistant response*.
* 1 (Very Simple): Simple acknowledgement, yes/no confirmation, trivial fact recall based directly on the last turn.
* 2 (Simple): Basic info recall related to history, slight elaboration on previous point, simple instruction.
* 3 (Moderate): Requires synthesizing information from a few turns back, comparing points made earlier, generating a moderately detailed explanation or creative text based on context.
* 4 (Complex): Requires understanding nuanced context across multiple turns, deep reasoning, complex instruction synthesis, detailed analysis based on the dialogue.
* 5 (Very Complex): Needs to track intricate state/details over a long history, highly specialized knowledge synthesis based on context, complex multi-step problem-solving rooted in the conversation.

3. **Voice Response Suitability (Score 1-5):** Is the *expected assistant's answer to the Current User Query* suitable for delivery via voice ONLY? (Focus on the likely *next turn's* content).
* 1 (Very Unsuitable): Expected answer likely requires visuals (graphs, code, tables), complex formatting, UI interaction, or is excessively long/structured even for conversational context (e.g., reading out a large diff).
* 2 (Unsuitable): Expected answer probably very long, has complex structure (nested lists), significantly easier to parse visually. Poor audio UX for the *next* response.
* 3 (Moderate): Expected answer might be slightly long or have simple structure (e.g., short list of steps mentioned earlier), but generally digestible via audio. Upper limit for conversational comfort.
* 4 (Suitable): Expected answer reasonably concise, informational/conversational, flows well in dialogue, easy to understand when spoken.
* 5 (Highly Suitable): Ideal for voice - short confirmation, direct answer based on context, brief explanation, conversational response.

4. **Justification (Brief Text):** 1-2 sentences explaining the scores, especially for low (<3) or unusual scores, referencing context if necessary.

**Example Input Structure (What your 'user' message will contain):**

Conversation History:
[USER]
Tell me about the Eiffel Tower.
[ASSISTANT]
The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower.

---

Current User Query:
How tall is it and when was it built?

**Example Output String:**
Quality: 4, Complexity: 2, Suitability: 5, Justification: Clear follow-up query based on the history. Asks for simple facts, suitable for a short voice response.
"""
103
+
104
# --- LLM API Function (evaluate_prompt_with_llm) ---
def evaluate_prompt_with_llm(prompt_text, api_key, host, path, model, retries=MAX_RETRIES):
    """Calls the LLM API to get evaluation scores for a prompt (or query+history).

    Returns the raw model response string on success (possibly malformed --
    the caller is responsible for parsing it), or None when the input is
    invalid, a non-retryable client error occurs, or all retries are spent.
    """
    # Cannot evaluate empty or non-string input.
    if not prompt_text or not isinstance(prompt_text, str) or not prompt_text.strip():
        logging.warning("evaluate_prompt_with_llm received empty or invalid input text.")
        return None

    payload = json.dumps({
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            # The combined history + query is passed as the 'user' content here.
            {"role": "user", "content": prompt_text}
        ],
        "temperature": 0.1,  # Low temperature for consistent evaluation
        "max_tokens": 100    # Should be enough for the scores + justification
    })
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'User-Agent': 'HuggingFace UltraChat Evaluation Script',
        'Content-Type': 'application/json'
    }
    # Small jittered delay before each request to spread load across workers.
    time.sleep(random.uniform(REQUEST_DELAY_SECONDS * 0.8, REQUEST_DELAY_SECONDS * 1.2))

    for attempt in range(retries):
        data = None  # ensure the name is bound for the except handlers below
        try:
            conn = http.client.HTTPSConnection(host, timeout=60)
            try:
                conn.request("POST", path, payload, headers)
                res = conn.getresponse()
                status = res.status
                data = res.read()
                # Read the header while the response object is in hand.
                retry_after_header = res.getheader('Retry-After') if status == 429 else None
            finally:
                # FIX: always release the connection, even when request/read
                # raises, so retry loops do not leak sockets.
                conn.close()

            if status == 200:
                response_json = json.loads(data.decode("utf-8"))
                if response_json.get("choices") and len(response_json["choices"]) > 0:
                    message = response_json["choices"][0].get("message")
                    if message and message.get("content"):
                        raw_response = message["content"].strip()
                        # Basic check for expected format start - parsing function handles details
                        if raw_response.startswith("Quality:") and "Complexity:" in raw_response and "Suitability:" in raw_response:
                            return raw_response
                        logging.warning(f"LLM response format unexpected for input starting with '{prompt_text[:50]}...': {raw_response}")
                        return raw_response  # Return potentially malformed for parsing attempt later
                logging.error(f"Unexpected API response structure (no choices/content): {data.decode('utf-8')}")

            elif status == 429:  # Rate limit
                if retry_after_header is None:
                    retry_after_header = str(int(REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5)))
                try:
                    wait_time = int(retry_after_header)
                except ValueError:
                    wait_time = REQUEST_DELAY_SECONDS * (2 ** attempt) + random.uniform(1, 5)  # Exponential backoff + jitter
                logging.warning(f"Rate limit exceeded (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            elif status >= 500:  # Server error
                wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 5)  # Exponential backoff + jitter
                logging.warning(f"Server error (HTTP {status}). Retrying after {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                # Other client errors (4xx) - likely not recoverable by retry
                # (e.g. bad auth 401 or not found 404).
                logging.error(f"API Client Error: Status {status}, Response: {data.decode('utf-8')} for input: {prompt_text[:60]}")
                return None

        except (http.client.HTTPException, ConnectionError, socket.gaierror, TimeoutError, socket.timeout) as e:
            logging.error(f"Network/HTTP error during API call: {e}. Attempt {attempt + 1}/{retries}")
            if attempt + 1 == retries:
                return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)  # Exponential backoff + jitter
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)
        except json.JSONDecodeError as e:
            # Cannot proceed if the response body isn't JSON.
            logging.error(f"Failed to decode API response: {e}. Response snippet: {data[:200] if data else 'N/A'}")
            return None
        except Exception as e:
            # Catch any other unexpected errors during the API call/processing.
            logging.error(f"An unexpected error occurred during API call processing: {e}", exc_info=True)
            if attempt + 1 == retries:
                return None
            wait_time = REQUEST_DELAY_SECONDS * (1.5 ** attempt) + random.uniform(1, 3)
            logging.warning(f"Waiting {wait_time:.2f} seconds before retry...")
            time.sleep(wait_time)

    logging.error(f"API call failed after {retries} retries for input: {prompt_text[:60]}...")
    return None
194
+
195
+
196
# --- Function to Parse LLM Response ---
def parse_llm_evaluation(response_string):
    """Parses the structured string response from the LLM.

    Returns a 5-tuple (quality, complexity, suitability, justification, status)
    where the scores are ints in 1..5 (or None on failure) and status is one of
    'success', 'success_fallback_parse', or an 'error_*' marker.
    """
    if not response_string:
        return None, None, None, None, "error_empty_response"

    # Primary regex targeting the exact requested output format.
    match = re.match(
        r"Quality:\s*([1-5])\s*,\s*Complexity:\s*([1-5])\s*,\s*Suitability:\s*([1-5])\s*,\s*Justification:\s*(.*)",
        response_string.strip(),
        re.IGNORECASE | re.DOTALL  # Ignore case and allow '.' to match newlines in justification
    )

    if match:
        try:
            quality = int(match.group(1))
            complexity = int(match.group(2))
            suitability = int(match.group(3))
            # Handle potential empty justification.
            justification = match.group(4).strip() if match.group(4) else ""
            return quality, complexity, suitability, justification, "success"
        except (ValueError, IndexError) as e:
            # Regex matched the structure but the captured values were unusable.
            logging.warning(f"Parsing failed for matched string (invalid numbers?Groups missing?): {response_string}. Error: {e}")
            return None, None, None, None, "error_parsing_matched"

    logging.warning(f"Regex did not match LLM response format: {response_string}")

    # Fallback: tolerant key-by-key search.
    # FIX: the previous fallback split the whole response on commas, which
    # (a) truncated any justification containing a comma at its first comma and
    # (b) failed whenever junk preceded a key (e.g. "Scores - Quality: 3").
    # Searching for each field independently avoids both problems.
    try:
        scores = {}
        for key in ("quality", "complexity", "suitability"):
            m = re.search(rf"{key}\s*:\s*([1-5])\b", response_string, re.IGNORECASE)
            if m:
                scores[key] = int(m.group(1))
        j_match = re.search(r"justification\s*:\s*(.*)", response_string, re.IGNORECASE | re.DOTALL)
        justification = j_match.group(1).strip() if j_match else ""
        if 'quality' in scores and 'complexity' in scores and 'suitability' in scores:
            logging.info(f"Fallback parsing successful for: {response_string[:50]}...")
            return scores['quality'], scores['complexity'], scores['suitability'], justification, "success_fallback_parse"
    except Exception as e:
        # Errors during the fallback itself: fall through to the error status.
        logging.warning(f"Fallback parsing attempt also failed: {e}")

    # Neither the primary regex nor the fallback worked.
    return None, None, None, None, "error_parsing_no_match"
257
+
258
+
259
# --- !! MODIFIED: Dataset Processing Function for UltraChat !! ---
def evaluate_dataset_entry(example):
    """Processes a single UltraChat filtered entry to get LLM evaluation.

    Returns a copy of ``example`` with the llm_* score/justification columns
    and an ``llm_evaluation_status`` marker filled in; the input dict itself
    is never mutated.
    """
    updated = example.copy()

    # Carry over any evaluation fields from a previous (resumed) run, then
    # mark the entry as pending until this attempt resolves.
    updated['llm_quality'] = example.get('llm_quality', None)
    updated['llm_complexity'] = example.get('llm_complexity', None)
    updated['llm_suitability'] = example.get('llm_suitability', None)
    updated['llm_justification'] = example.get('llm_justification', '')
    updated['llm_evaluation_status'] = 'pending_evaluation'

    current_query = example.get("query")
    dialogue_history = example.get("history", "")  # default to empty history if missing

    # A usable query must be a non-empty, non-whitespace string.
    if not (current_query and isinstance(current_query, str) and current_query.strip()):
        updated['llm_evaluation_status'] = 'skipped_invalid_query'
        logging.debug(f"Skipping entry (Dialogue: {example.get('dialogue_id', 'N/A')}, Turn: {example.get('turn_index', 'N/A')}): Invalid query.")
        return updated

    # Combine history and query into the exact layout the system prompt expects.
    llm_input_text = f"Conversation History:\n{dialogue_history}\n\n---\n\nCurrent User Query:\n{current_query}"

    raw_reply = evaluate_prompt_with_llm(llm_input_text, API_KEY, API_HOST, API_PATH, LLM_MODEL)

    if not raw_reply:
        # The API call failed after all retries.
        updated['llm_evaluation_status'] = 'failed_llm_call'
        logging.error(f"LLM call failed for dialogue {example.get('dialogue_id', 'N/A')}, turn {example.get('turn_index', 'N/A')}.")
        return updated

    quality, complexity, suitability, justification, parse_status = parse_llm_evaluation(raw_reply)
    if parse_status.startswith("success"):
        updated["llm_quality"] = quality
        updated["llm_complexity"] = complexity
        updated["llm_suitability"] = suitability
        updated["llm_justification"] = justification
        updated['llm_evaluation_status'] = 'success'  # Final success state
    else:
        # Record the parse-error kind and keep the raw model output in the
        # justification column for manual review.
        updated['llm_evaluation_status'] = parse_status
        updated['llm_justification'] = f"RAW_RESPONSE: {raw_reply}"
        logging.warning(f"Parsing failed ({parse_status}) for dialogue {example.get('dialogue_id', 'N/A')}, turn {example.get('turn_index', 'N/A')}. Raw response saved.")

    return updated
316
+
317
# --- Function to Save Dataset Atomically ---
# NOTE: The Features object passed in must match the UltraChat + LLM schema
# (see get_ultrachat_features_with_evaluation).
def save_dataset_atomically(data_list, output_path, features):
    """Saves the list of data dictionaries atomically using the correct schema.

    Strategy: materialize to a sibling "<output_path>_saving" directory first,
    then swap it into place with os.rename, so readers never observe a
    half-written dataset directory. On any failure the temp directory is
    removed and a best-effort JSON Lines dump is written as a fallback.
    Returns True on success, False otherwise.
    """
    if not data_list:
        logging.info("No data provided for saving.")
        return False
    temp_output_path = output_path + "_saving"
    final_output_path = output_path
    logging.info(f"Attempting to save {len(data_list)} examples to temp path {temp_output_path}...")
    try:
        processed_data_list = []
        # Arrow int32 columns cannot hold None with this schema, so replace
        # missing scores with the sentinel -1 before building the Dataset
        # (also easier to filter on later with Pandas).
        for item in data_list:
            item_copy = item.copy()  # never mutate the caller's dicts
            for key in ['llm_quality', 'llm_complexity', 'llm_suitability']:
                # Only substitute when the key exists, is None, and the schema
                # really declares it as int32.
                if key in item_copy and item_copy[key] is None and isinstance(features[key], Value) and features[key].dtype == 'int32':
                    item_copy[key] = -1
            processed_data_list.append(item_copy)

        # Build the Dataset under the explicit schema (not inferred).
        processed_dataset = Dataset.from_list(processed_data_list, features=features)

        # Ensure parent directory exists.
        os.makedirs(os.path.dirname(final_output_path), exist_ok=True)

        # Clean up a stale temp directory left by a previous crashed save.
        if os.path.exists(temp_output_path):
            logging.warning(f"Removing existing temporary save directory: {temp_output_path}")
            shutil.rmtree(temp_output_path)

        # Write everything to the temporary location first.
        processed_dataset.save_to_disk(temp_output_path)
        logging.info(f"Successfully saved dataset to temporary path: {temp_output_path}")

        # os.rename requires the destination not to exist, so clear it first.
        if os.path.exists(final_output_path):
            logging.debug(f"Removing existing final destination directory before rename: {final_output_path}")
            shutil.rmtree(final_output_path)

        # Atomically swap the finished temp directory into the final path.
        os.rename(temp_output_path, final_output_path)
        logging.info(f"Successfully moved temporary save to final path: {final_output_path}")
        return True
    except Exception as e:
        logging.error(f"Failed during atomic save process to {final_output_path}: {e}", exc_info=True)
        # Remove the partial temp directory so the next save starts clean.
        if os.path.exists(temp_output_path):
            try:
                shutil.rmtree(temp_output_path)
                logging.info(f"Cleaned up temporary directory {temp_output_path} after error.")
            except Exception as cleanup_e:
                logging.error(f"Could not clean up temporary directory {temp_output_path} after error: {cleanup_e}")

        # Fallback: dump the ORIGINAL (unmodified) list as JSON Lines so no
        # progress is lost even when the Arrow save fails.
        fallback_json_path = final_output_path + ".jsonl.failed_save"
        logging.warning(f"Attempting fallback save to JSON Lines file: {fallback_json_path}")
        try:
            with open(fallback_json_path, 'w', encoding='utf-8') as f:
                for item in data_list:
                    # default=str coerces any non-JSON-serializable values.
                    f.write(json.dumps(dict(item), ensure_ascii=False, default=str) + '\n')
            logging.info(f"Successfully saved fallback JSON Lines file.")
        except Exception as json_e:
            logging.error(f"Fallback JSON save also failed: {json_e}", exc_info=True)

        return False
388
+
389
+
390
# --- Function to Check if Retry is Needed ---
def needs_retry(example):
    """Checks if an example needs evaluation or retry.

    True for anything not finished ('success') and not deliberately skipped
    ('skipped_*') -- i.e. None/missing status, 'pending', 'failed_*',
    'error_*', and so on.
    """
    status_text = str(example.get('llm_evaluation_status'))
    return status_text != 'success' and not status_text.startswith('skipped_')
399
+
400
# --- !! MODIFIED: Get Dataset Features for Filtered UltraChat + Evaluation !! ---
def get_ultrachat_features_with_evaluation():
    """Defines features for the pre-filtered UltraChat dataset + evaluation columns."""
    logging.info(f"Defining features for pre-filtered UltraChat data + LLM evaluation.")

    # Schema = the four columns produced by the UltraChat filtering script,
    # followed by the LLM evaluation columns this script adds. Scores are
    # int32 (the atomic-save helper maps None -> -1); justification and status
    # are free-form strings.
    augmented_features = Features({
        'dialogue_id': Value('string'),
        'turn_index': Value('int64'),  # int64 for potentially large indices
        'query': Value('string'),
        'history': Value('string'),
        'llm_quality': Value('int32'),
        'llm_complexity': Value('int32'),
        'llm_suitability': Value('int32'),
        'llm_justification': Value('string'),
        'llm_evaluation_status': Value('string'),  # 'success', 'failed_*', 'skipped_*', 'error_*' etc.
    })
    logging.info(f"Defined features: {augmented_features}")
    return augmented_features
426
+
427
+ # --- Main Execution ---
428
+ if __name__ == "__main__":
429
+ start_time = time.time()
430
+ logging.info("======================================================")
431
+ logging.info(f" Starting Filtered UltraChat Dataset Evaluation - {LLM_MODEL}") # Updated title
432
+ logging.info(f" Input Data Path (Filtered UltraChat): {INPUT_DATA_PATH}")
433
+ logging.info(f" Output Dir: {OUTPUT_DIR}")
434
+ logging.info(f" Intermediate Save Path: {PROCESSED_DATA_PATH}")
435
+ logging.info(f" Final Annotated Path: {FINAL_OUTPUT_PATH}")
436
+ logging.info(f" LLM-Filtered Output Path: {FILTERED_OUTPUT_PATH}")
437
+ logging.info("======================================================")
438
+
439
+ # --- Define Features for UltraChat + LLM ---
440
+ dataset_features = get_ultrachat_features_with_evaluation() # Use the correct feature function
441
+
442
+ # --- Load or Initialize Dataset ---
443
+ results_list = []
444
+ # Check for intermediate save file from *this* script first
445
+ if os.path.exists(PROCESSED_DATA_PATH):
446
+ logging.info(f"Loading existing intermediate dataset from {PROCESSED_DATA_PATH}...")
447
+ try:
448
+ # Load with trust_remote_code=True if dataset structure might have custom code (less likely here)
449
+ existing_dataset = Dataset.load_from_disk(PROCESSED_DATA_PATH)
450
+
451
+ # Optional: Verify features match exactly if needed (can cause issues if minor changes occur)
452
+ # if existing_dataset.features != dataset_features:
453
+ # logging.warning(f"Loaded intermediate dataset features mismatch expected. Trying to continue...")
454
+ # # Potentially try casting or just proceed carefully
455
+ results_list = existing_dataset.to_list() # Convert loaded dataset to list of dicts
456
+ total_examples = len(results_list)
457
+ logging.info(f"Loaded {total_examples} examples from intermediate save.")
458
+ except Exception as e:
459
+ logging.error(f"Failed to load intermediate dataset from {PROCESSED_DATA_PATH}: {e}", exc_info=True)
460
+ logging.warning("Will attempt to load fresh dataset from input path.")
461
+ results_list = [] # Reset list if loading failed
462
+
463
+ # If no intermediate data loaded, load the initial filtered UltraChat data
464
+ if not results_list:
465
+ logging.info(f"Loading pre-filtered UltraChat dataset from: {INPUT_DATA_PATH}")
466
+ if not os.path.exists(INPUT_DATA_PATH):
467
+ logging.error(f"Input dataset not found at '{INPUT_DATA_PATH}'. Please run the UltraChat filtering script first.")
468
+ sys.exit(1)
469
+ try:
470
+ # Load the dataset generated by the previous UltraChat filtering script
471
+ original_filtered_dataset = Dataset.load_from_disk(INPUT_DATA_PATH)
472
+ total_examples = len(original_filtered_dataset)
473
+ logging.info(f"Loaded {total_examples} original examples from {INPUT_DATA_PATH}.")
474
+
475
+ # Initialize results list with original data + placeholder evaluation fields
476
+ results_list = []
477
+ # Iterate through the loaded dataset and add placeholder fields
478
+ for example in tqdm(original_filtered_dataset, desc="Initializing data structure"):
479
+ init_example = dict(example) # Make a copy
480
+ # Ensure all expected base features are present, provide defaults if necessary
481
+ init_example['dialogue_id'] = init_example.get('dialogue_id', f'missing_id_{len(results_list)}')
482
+ init_example['turn_index'] = init_example.get('turn_index', -1) # Use -1 if missing?
483
+ init_example['query'] = init_example.get('query', '')
484
+ init_example['history'] = init_example.get('history', '')
485
+ # Add evaluation placeholders
486
+ init_example['llm_quality'] = None
487
+ init_example['llm_complexity'] = None
488
+ init_example['llm_suitability'] = None
489
+ init_example['llm_justification'] = ''
490
+ init_example['llm_evaluation_status'] = 'pending' # Initial status before processing
491
+ results_list.append(init_example)
492
+
493
+ # Perform an initial save to the intermediate path for this script run
494
+ logging.info(f"Performing initial save of placeholder data ({len(results_list)} items) to {PROCESSED_DATA_PATH}...")
495
+ # Use the correct features for saving
496
+ if save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features):
497
+ logging.info("Initial data structure saved successfully.")
498
+ else:
499
+ logging.error("Failed to save initial data structure. Exiting.")
500
+ sys.exit(1)
501
+
502
+ except Exception as e:
503
+ logging.error(f"Failed to load or initialize dataset from {INPUT_DATA_PATH}: {e}", exc_info=True)
504
+ sys.exit(1)
505
+
506
+ # --- Identify Indices to Process/Retry ---
507
+ logging.info("Identifying examples needing evaluation/retry...")
508
+ # Use needs_retry to find indices where evaluation hasn't succeeded or been skipped
509
+ indices_to_process = [
510
+ i for i, example in enumerate(tqdm(results_list, desc="Checking examples status")) if needs_retry(example)
511
+ ]
512
+ num_to_process = len(indices_to_process)
513
+ total_examples = len(results_list) # Recalculate total based on loaded list
514
+
515
+ if num_to_process == 0:
516
+ logging.info("No examples found needing evaluation/retry based on current status.")
517
+ # Ensure final data exists even if no processing was needed in this run
518
+ if not os.path.exists(FINAL_OUTPUT_PATH):
519
+ logging.info(f"Copying data from {PROCESSED_DATA_PATH} to final location {FINAL_OUTPUT_PATH} as no retries needed...")
520
+ if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
521
+ logging.info("Dataset copied to final location.")
522
+ else:
523
+ logging.error("Failed to copy dataset to final location.")
524
+ else:
525
+ logging.info(f"Identified {num_to_process} examples to process/retry out of {total_examples}.")
526
+ # --- Concurrent Processing Logic ---
527
+ processed_count_total = 0 # Count processed in this run
528
+ processed_since_last_save = 0
529
+ last_save_time = time.time()
530
+ logging.info(f"Starting concurrent evaluation ({MAX_WORKERS} workers) with periodic saving...")
531
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
532
+ # Submit tasks only for the indices that need processing
533
+ futures = {
534
+ executor.submit(evaluate_dataset_entry, results_list[i]): i
535
+ for i in indices_to_process
536
+ }
537
+ try:
538
+ # Setup progress bar for the number of tasks submitted
539
+ pbar = tqdm(total=num_to_process, desc="Evaluating turns", unit="turn") # Updated description
540
+ for future in concurrent.futures.as_completed(futures):
541
+ original_index = futures[future] # Get the original list index for this future
542
+ try:
543
+ # Get the result (the updated example dictionary)
544
+ updated_example_dict = future.result()
545
+ # Update the main list with the processed data
546
+ results_list[original_index] = updated_example_dict
547
+ # Update progress bar postfix with the status of the completed item
548
+ pbar.set_postfix({"LastStatus": updated_example_dict.get('llm_evaluation_status', 'N/A')}, refresh=False) # Don't refresh too often
549
+ except Exception as exc:
550
+ # Log errors from the future execution itself (should be rare if evaluate_dataset_entry handles errors)
551
+ logging.error(f'Evaluation task for index {original_index} encountered an exception: {exc}', exc_info=True)
552
+ # Update the status in the main list to indicate failure
553
+ error_placeholder = results_list[original_index].copy()
554
+ error_placeholder['llm_evaluation_status'] = f'failed_future_exception_{type(exc).__name__}'
555
+ results_list[original_index] = error_placeholder
556
+ pbar.set_postfix({"LastStatus": error_placeholder['llm_evaluation_status']}, refresh=False)
557
+ finally:
558
+ # Increment counters regardless of success or failure
559
+ processed_count_total += 1
560
+ processed_since_last_save += 1
561
+ pbar.update(1) # Update progress bar
562
+
563
+ # Periodic save logic
564
+ if processed_since_last_save >= SAVE_INTERVAL:
565
+ current_time = time.time()
566
+ time_since_last = current_time - last_save_time
567
+ logging.info(f"\n--- Processed {processed_since_last_save} items (Total this run: {processed_count_total}/{num_to_process}). Time since last save: {time_since_last:.1f}s. Saving progress... ---")
568
+ # Save intermediate progress to PROCESSED_DATA_PATH using the correct features
569
+ if save_dataset_atomically(results_list, PROCESSED_DATA_PATH, dataset_features):
570
+ logging.info(f"--- Progress successfully saved to {PROCESSED_DATA_PATH} ---")
571
+ processed_since_last_save = 0 # Reset counter
572
+ last_save_time = current_time
573
+ else:
574
+ # Log error but continue processing, hoping the next save works
575
+ logging.error(f"--- FAILED TO SAVE PROGRESS to {PROCESSED_DATA_PATH}! Check errors. Will retry later. ---")
576
+ except KeyboardInterrupt:
577
+ logging.warning("\nCtrl+C detected! Attempting to shut down executor and save progress...")
578
+ # Gracefully shutdown the executor - wait for currently running tasks to finish (or cancel them)
579
+ # executor.shutdown(wait=False) # Cancel pending futures - results may be incomplete
580
+ # Consider just letting the 'finally' block handle the save
581
+ except Exception as e:
582
+ logging.error(f"An unexpected error occurred during the main processing loop: {e}", exc_info=True)
583
+ logging.error("Attempting final save...")
584
+ finally:
585
+ # Ensure progress bar is closed
586
+ if 'pbar' in locals() and pbar is not None:
587
+ pbar.close()
588
+ logging.info("--- Processing loop finished or interrupted. ---")
589
+ # --- Final Save Attempt (Save the complete results_list) ---
590
+ logging.info(f"Attempting final save of the fully annotated dataset ({len(results_list)} items) to: {FINAL_OUTPUT_PATH}")
591
+ if save_dataset_atomically(results_list, FINAL_OUTPUT_PATH, dataset_features):
592
+ logging.info("--- Final annotated dataset state saved successfully. ---")
593
+ else:
594
+ # Critical error if final save fails
595
+ logging.error(f">>> FINAL ANNOTATED SAVE FAILED to {FINAL_OUTPUT_PATH}! <<< Check logs. Fallback JSON/Intermediate data might exist at {PROCESSED_DATA_PATH}.")
596
+
597
+ # --- Post-Processing: Verification, Analysis, Filtering (using LLM scores) ---
598
+ # This section remains largely the same, just ensures it loads from FINAL_OUTPUT_PATH
599
+ # and saves the filtered result to FILTERED_OUTPUT_PATH.
600
+ logging.info("======================================================")
601
+ logging.info("Post-Processing: Verification, Analysis, and LLM Filtering")
602
+ logging.info("======================================================")
603
+
604
+ # --- Verification of Final Annotated Data ---
605
+ logging.info(f"Verifying and Analyzing final annotated dataset: {FINAL_OUTPUT_PATH}")
606
+ if not os.path.exists(FINAL_OUTPUT_PATH):
607
+ logging.error(f"Final annotated dataset not found at {FINAL_OUTPUT_PATH}. Cannot perform analysis or filtering.")
608
+ else:
609
+ try:
610
+ # Reload the final dataset to ensure integrity and perform analysis/filtering
611
+ final_annotated_dataset = Dataset.load_from_disk(FINAL_OUTPUT_PATH)
612
+ num_final_examples = len(final_annotated_dataset)
613
+ logging.info(f"Successfully reloaded final annotated dataset with {num_final_examples} examples.")
614
+
615
+ # --- Calculate Score Distributions (using Pandas if available) ---
616
+ logging.info("Calculating score distributions...")
617
+ try:
618
+ df = final_annotated_dataset.to_pandas()
619
+ # Handle the placeholder -1 used for None in integer columns during saving
620
+ df['llm_quality'].replace(-1, pd.NA, inplace=True)
621
+ df['llm_complexity'].replace(-1, pd.NA, inplace=True)
622
+ df['llm_suitability'].replace(-1, pd.NA, inplace=True)
623
+
624
+ # Calculate value counts, including missing/placeholder values (NA)
625
+ quality_dist = df['llm_quality'].value_counts(dropna=False).sort_index()
626
+ complexity_dist = df['llm_complexity'].value_counts(dropna=False).sort_index()
627
+ suitability_dist = df['llm_suitability'].value_counts(dropna=False).sort_index()
628
+ status_dist = df['llm_evaluation_status'].value_counts()
629
+
630
+ print("\n--- Score Distributions (Annotated UltraChat Dataset) ---")
631
+ print("\nOverall Quality Distribution (NA indicates missing/placeholder):")
632
+ print(quality_dist)
633
+ print("\nComplexity Distribution (NA indicates missing/placeholder):")
634
+ print(complexity_dist)
635
+ print("\nVoice Response Suitability Distribution (NA indicates missing/placeholder):")
636
+ print(suitability_dist)
637
+ print("\nEvaluation Status Distribution:")
638
+ print(status_dist)
639
+ print("--------------------------------------------------")
640
+
641
+ except ImportError:
642
+ logging.warning("Pandas not found. Cannot perform detailed distribution analysis.")
643
+ # Fallback: Basic status count
644
+ status_counts = {}
645
+ for ex in final_annotated_dataset:
646
+ st = ex.get('llm_evaluation_status', 'unknown')
647
+ status_counts[st] = status_counts.get(st, 0) + 1
648
+ print("\n--- Evaluation Status Distribution (Basic) ---")
649
+ print(f"Status: {sorted(status_counts.items())}")
650
+ print("--------------------------------------------------")
651
+ except Exception as e:
652
+ logging.error(f"Error during Pandas analysis: {e}", exc_info=True)
653
+
654
+
655
+ # --- Filtering based on LLM scores ---
656
+ logging.info(f"Filtering annotated dataset based on LLM scores: Quality >= {MIN_QUALITY_SCORE}, Suitability >= {MIN_SUITABILITY_SCORE}")
657
+
658
+ # Define the filtering function (same logic, checks scores)
659
+ def filter_criteria(example):
660
+ q = example.get('llm_quality')
661
+ s = example.get('llm_suitability')
662
+ # Check if scores are valid (not None and not the -1 placeholder) before comparing
663
+ if q is None or q == -1 or s is None or s == -1:
664
+ return False # Filter out entries with missing/invalid scores
665
+ # Apply the thresholds
666
+ passes = q >= MIN_QUALITY_SCORE and s >= MIN_SUITABILITY_SCORE
667
+ # Optional: Add complexity filter here if needed
668
+ # c = example.get('llm_complexity')
669
+ # if c is not None and c != -1 and MAX_COMPLEXITY_SCORE is not None:
670
+ # passes = passes and c <= MAX_COMPLEXITY_SCORE
671
+ return passes
672
+
673
+ # Apply the filter using datasets.filter
674
+ # Use multiple processes if beneficial and safe (check memory usage)
675
+ num_proc_filter = max(1, os.cpu_count() // 2 if os.cpu_count() else 1)
676
+ logging.info(f"Applying filter with num_proc={num_proc_filter}...")
677
+ filtered_llm_dataset = final_annotated_dataset.filter(
678
+ filter_criteria,
679
+ num_proc=num_proc_filter # Adjust based on system resources
680
+ )
681
+ num_filtered = len(filtered_llm_dataset)
682
+ filter_percentage = (num_filtered / num_final_examples * 100) if num_final_examples > 0 else 0
683
+ logging.info(f"LLM-Filtered dataset size: {num_filtered} examples ({filter_percentage:.2f}% of annotated)")
684
+
685
+ # --- Save LLM-Filtered Dataset ---
686
+ logging.info(f"Saving LLM-filtered dataset to: {FILTERED_OUTPUT_PATH}")
687
+ try:
688
+ # Ensure parent directory exists
689
+ os.makedirs(os.path.dirname(FILTERED_OUTPUT_PATH), exist_ok=True)
690
+ # Clean up old filtered data if it exists
691
+ if os.path.exists(FILTERED_OUTPUT_PATH):
692
+ logging.debug(f"Removing existing LLM-filtered directory: {FILTERED_OUTPUT_PATH}")
693
+ shutil.rmtree(FILTERED_OUTPUT_PATH)
694
+ # Save the filtered dataset
695
+ filtered_llm_dataset.save_to_disk(FILTERED_OUTPUT_PATH)
696
+ logging.info("LLM-Filtered dataset saved successfully.")
697
+ except Exception as e:
698
+ logging.error(f"Failed to save LLM-filtered dataset to {FILTERED_OUTPUT_PATH}: {e}", exc_info=True)
699
+
700
+ except Exception as e:
701
+ logging.error(f"Verification/Analysis/Filtering failed on final annotated dataset: {e}", exc_info=True)
702
+
703
+ # --- Script End ---
704
+ end_time = time.time()
705
+ logging.info("------------------------------------------------------")
706
+ logging.info(f"Script finished in {end_time - start_time:.2f} seconds.")
707
+ logging.info(f"Final annotated dataset saved at: {FINAL_OUTPUT_PATH}")
708
+ logging.info(f"LLM-Filtered dataset saved at: {FILTERED_OUTPUT_PATH}")
709
+ logging.info("======================================================")
r1-a/dataset/gsm8k_final_filtered/combined/dataset_info.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "answer": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "source_dataset": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "audio": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "question_type": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "difficulty": {
26
+ "dtype": "string",
27
+ "_type": "Value"
28
+ }
29
+ },
30
+ "homepage": "",
31
+ "license": ""
32
+ }
r1-a/dataset/gsm8k_final_filtered/combined/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "3636165bbeb98bf3",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/gsm8k_final_filtered/test/dataset_info.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "answer": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "source_dataset": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "audio": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "question_type": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "difficulty": {
26
+ "dtype": "string",
27
+ "_type": "Value"
28
+ }
29
+ },
30
+ "homepage": "",
31
+ "license": ""
32
+ }
r1-a/dataset/gsm8k_final_filtered/test/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "19de9358ac0cc73a",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/gsm8k_final_filtered/train/dataset_info.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "answer": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "source_dataset": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "audio": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "question_type": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "difficulty": {
26
+ "dtype": "string",
27
+ "_type": "Value"
28
+ }
29
+ },
30
+ "homepage": "",
31
+ "license": ""
32
+ }
r1-a/dataset/gsm8k_final_filtered/train/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "92a03df5b878397b",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/mtcs_verified/get_response_gpt4o.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from datasets import load_dataset, Dataset, DatasetDict, Features, Value, Sequence
2
+
3
+ dataset = Dataset.load_from_disk("/root/autodl-tmp/audio-r1/r1-a/dataset/Multi-subject-RLVR_rephrased/train_processed")
4
+ breakpoint()
r1-a/dataset/mtcs_verified/mtcs.py ADDED
File without changes
r1-a/dataset/pku_saferlhf_filtered_unsafe_diverse_hf/dataset_info.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "prompt": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "response_0": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "response_1": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "is_safe_0": {
18
+ "dtype": "bool",
19
+ "_type": "Value"
20
+ },
21
+ "is_safe_1": {
22
+ "dtype": "bool",
23
+ "_type": "Value"
24
+ },
25
+ "involved_harm_categories": {
26
+ "feature": {
27
+ "dtype": "string",
28
+ "_type": "Value"
29
+ },
30
+ "_type": "Sequence"
31
+ },
32
+ "better_response_id": {
33
+ "dtype": "int64",
34
+ "_type": "Value"
35
+ },
36
+ "safer_response_id": {
37
+ "dtype": "int64",
38
+ "_type": "Value"
39
+ }
40
+ },
41
+ "homepage": "",
42
+ "license": ""
43
+ }
r1-a/dataset/pku_saferlhf_filtered_unsafe_diverse_hf/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "86ec040d4b942521",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
r1-a/dataset/shp2_filtered_tts_high_quality_train_only/dataset_info.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "query": {
6
+ "dtype": "string",
7
+ "_type": "Value"
8
+ },
9
+ "chosen": {
10
+ "dtype": "string",
11
+ "_type": "Value"
12
+ },
13
+ "reject": {
14
+ "dtype": "string",
15
+ "_type": "Value"
16
+ },
17
+ "domain": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ }
21
+ },
22
+ "homepage": "",
23
+ "license": ""
24
+ }
r1-a/dataset/shp2_filtered_tts_high_quality_train_only/state.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00002.arrow"
5
+ },
6
+ {
7
+ "filename": "data-00001-of-00002.arrow"
8
+ }
9
+ ],
10
+ "_fingerprint": "d339b25f13802884",
11
+ "_format_columns": null,
12
+ "_format_kwargs": {},
13
+ "_format_type": null,
14
+ "_output_all_columns": false,
15
+ "_split": null
16
+ }