intersteller2887 committed on
Commit
2d4a3ed
·
verified ·
1 Parent(s): 655f8d4

进一步修改了初始化试题时一系列文件锁相关逻辑,添加了后端相关函数的详细注释,调整了整体结构(还未调整提交试题时的文件锁相关逻辑)

Browse files
Files changed (1) hide show
  1. app.py +94 -59
app.py CHANGED
@@ -13,6 +13,7 @@ from huggingface_hub import HfApi, hf_hub_download
13
  from multiprocessing import TimeoutError
14
  from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
15
 
 
16
  dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
17
  dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent calling 'torchcodec' from newer version of 'datasets'
18
 
@@ -41,7 +42,7 @@ sample1_audio_path = local_audio_paths[0]
41
  print(sample1_audio_path)
42
 
43
  # ==============================================================================
44
- # 数据定义 (Data Definition)
45
  # ==============================================================================
46
 
47
  DIMENSIONS_DATA = [
@@ -98,11 +99,9 @@ DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
98
  MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
99
 
100
  # ==============================================================================
101
- # Function Definitions
102
  # ==============================================================================
103
 
104
- # Function that load or initialize count.json
105
-
106
  # This version did not place file reading into filelock, concurrent read could happen
107
  """def load_or_initialize_count_json(audio_paths):
108
  try:
@@ -154,28 +153,39 @@ MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
154
 
155
  return count_data"""
156
 
 
 
 
 
 
 
 
157
  # This version also places file reading into filelock, and modified
158
  def load_or_initialize_count_json(audio_paths):
159
  # Add filelock to /workspace/count.json
160
  lock_path = COUNT_JSON_PATH + ".lock"
161
  with FileLock(lock_path, timeout=10):
162
- # Only try downloading if file doesn't exist yet
163
  if not os.path.exists(COUNT_JSON_PATH):
164
  try:
 
165
  downloaded_path = hf_hub_download(
166
  repo_id="intersteller2887/Turing-test-dataset",
167
  repo_type="dataset",
168
  filename=COUNT_JSON_REPO_PATH,
169
  token=os.getenv("HF_TOKEN")
170
  )
 
 
171
  except Exception:
172
  pass
173
 
174
- # If count.json exists: load into count_data
175
  if os.path.exists(COUNT_JSON_PATH):
176
  with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
177
  count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
178
  # Else initialize count_data with orderedDict
 
179
  else:
180
  count_data = collections.OrderedDict()
181
 
@@ -193,65 +203,86 @@ def load_or_initialize_count_json(audio_paths):
193
  count_data[filename] = 0
194
  updated = True
195
 
 
196
  if updated or not os.path.exists(COUNT_JSON_PATH):
197
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
198
  json.dump(count_data, f, indent=4, ensure_ascii=False)
199
 
200
- return count_data
201
 
202
  # Shorten the time of playing previous audio when reached next question
203
  def append_cache_buster(audio_path):
204
  return f"{audio_path}?t={int(time.time() * 1000)}"
205
 
206
- # Shorten the time of playing previous audio when reached next question
207
- def append_cache_buster(audio_path):
208
- return f"{audio_path}?t={int(time.time() * 1000)}"
209
 
210
- """def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
 
211
  eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
212
-
213
  if len(eligible_paths) < k:
214
  raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
215
 
216
- eligible_paths_copy = eligible_paths.copy()
217
-
218
- random.seed(int(time.time()))
219
-
220
- selected = random.sample(eligible_paths_copy, k)
221
 
 
222
  for path in selected:
223
  filename = os.path.basename(path)
224
  count_data[filename] = count_data.get(filename, 0) + 1
225
 
226
- with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
227
- json.dump(count_data, f, indent=4, ensure_ascii=False)
 
 
 
228
 
229
  return selected, count_data"""
230
 
231
- def sample_audio_paths(audio_paths, count_data, k=5, max_count=1): # k for questions per test; max_count for question limit in total
232
- eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
 
 
233
 
234
- if len(eligible_paths) < k:
235
- raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
 
 
 
 
 
 
 
236
 
237
- selected = random.sample(eligible_paths, k)
 
238
 
239
- for path in selected:
240
- filename = os.path.basename(path)
241
- count_data[filename] = count_data.get(filename, 0) + 1
242
 
243
- lock_path = COUNT_JSON_PATH + ".lock"
244
- with FileLock(lock_path, timeout=10):
 
 
 
 
245
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
246
  json.dump(count_data, f, indent=4, ensure_ascii=False)
247
 
248
- return selected, count_data
 
 
 
 
 
 
249
 
250
  # Save question_set in each user_data_state, preventing global sharing
251
  def start_challenge(user_data_state):
252
 
253
- count_data = load_or_initialize_count_json(all_data_audio_paths)
254
- selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
 
 
255
 
256
  question_set = [
257
  {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
@@ -259,13 +290,18 @@ def start_challenge(user_data_state):
259
  ]
260
 
261
  user_data_state["question_set"] = question_set
262
- user_data_state["updated_count_data"] = updated_count_data
263
- return gr.update(visible=False), gr.update(visible=True), user_data_state
264
 
 
 
 
 
 
 
265
  def toggle_education_other(choice):
266
  is_other = (choice == "其他(请注明)")
267
  return gr.update(visible=is_other, interactive=is_other, value="")
268
 
 
269
  def check_info_complete(username, age, gender, education, education_other, ai_experience):
270
  if username.strip() and age and gender and education and ai_experience:
271
  if education == "其他(请注明)" and not education_other.strip():
@@ -273,6 +309,7 @@ def check_info_complete(username, age, gender, education, education_other, ai_ex
273
  return gr.update(interactive=True)
274
  return gr.update(interactive=False)
275
 
 
276
  def show_sample_page_and_init(username, age, gender, education, education_other, ai_experience, user_data):
277
  final_edu = education_other if education == "其他(请注明)" else education
278
  user_data.update({
@@ -341,8 +378,6 @@ def update_test_dimension_view(d_idx, selections):
341
 
342
  def init_test_question(user_data, q_idx):
343
  d_idx = 0
344
- # question = QUESTION_SET[q_idx]
345
- # progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
346
  question = user_data["question_set"][q_idx]
347
  progress_q = f"第 {q_idx + 1} / {len(user_data['question_set'])} 题"
348
 
@@ -405,8 +440,30 @@ def navigate_dimensions(direction, q_idx, d_idx, selections, *slider_values):
405
  next_btn_update,
406
  ) + tuple(slider_updates)
407
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  # ==============================================================================
409
- # 重连函数定义 (Retry Function Definitions)
410
  # ==============================================================================
411
 
412
  # Function for handling connection error
@@ -732,28 +789,6 @@ def save_all_results_to_file(all_results, user_data, count_data=None):
732
  commit_message=f"Update count.json after submission by {username}"
733
  )
734
 
735
- def toggle_reference_view(current):
736
- if current == "参考":
737
- return gr.update(visible=False), gr.update(visible=True), gr.update(value="返回")
738
- else:
739
- return gr.update(visible=True), gr.update(visible=False), gr.update(value="参考")
740
-
741
- def back_to_welcome():
742
- return (
743
- gr.update(visible=True), # welcome_page
744
- gr.update(visible=False), # info_page
745
- gr.update(visible=False), # sample_page
746
- gr.update(visible=False), # pretest_page
747
- gr.update(visible=False), # test_page
748
- gr.update(visible=False), # final_judgment_page
749
- gr.update(visible=False), # result_page
750
- {}, # user_data_state
751
- 0, # current_question_index
752
- 0, # current_test_dimension_index
753
- {}, # current_question_selections
754
- [] # test_results
755
- )
756
-
757
  # ==============================================================================
758
  # Gradio 界面定义 (Gradio UI Definition)
759
  # ==============================================================================
 
13
  from multiprocessing import TimeoutError
14
  from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
15
 
16
+ # Load dataset from HuggingFace
17
  dataset = load_dataset("intersteller2887/Turing-test-dataset", split="train")
18
  dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent calling 'torchcodec' from newer version of 'datasets'
19
 
 
42
  print(sample1_audio_path)
43
 
44
  # ==============================================================================
45
+ # Data Definition
46
  # ==============================================================================
47
 
48
  DIMENSIONS_DATA = [
 
99
  MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
100
 
101
  # ==============================================================================
102
+ # Backend Function Definitions
103
  # ==============================================================================
104
 
 
 
105
  # This version did not place file reading into filelock, concurrent read could happen
106
  """def load_or_initialize_count_json(audio_paths):
107
  try:
 
153
 
154
  return count_data"""
155
 
156
+ # Function that loads or initializes count.json
157
+ # Function is called when user start a challenge, and this will load or initialize count.json to working directory
158
+ # Initialize happens when count.json does not exist in the working directory as well as HuggingFace dataset
159
+ # Load happens when count.json exists in HuggingFace dataset, and it's not loaded to the working directory yet
160
+ # After load/initialize, all newly added audio files will be added to count.json with initial value of 0
161
+ # Load/Initialize will generate count.json in the working directory for all users under this space
162
+
163
  # This version also places file reading into filelock, and modified
164
  def load_or_initialize_count_json(audio_paths):
165
  # Add filelock to /workspace/count.json
166
  lock_path = COUNT_JSON_PATH + ".lock"
167
  with FileLock(lock_path, timeout=10):
168
+ # If count.json does not exist in the working directory, try to download it from HuggingFace dataset
169
  if not os.path.exists(COUNT_JSON_PATH):
170
  try:
171
+ # Save latest count.json to working directory
172
  downloaded_path = hf_hub_download(
173
  repo_id="intersteller2887/Turing-test-dataset",
174
  repo_type="dataset",
175
  filename=COUNT_JSON_REPO_PATH,
176
  token=os.getenv("HF_TOKEN")
177
  )
178
+ with open(downloaded_path, "rb") as src, open(COUNT_JSON_PATH, "wb") as dst:
179
+ dst.write(src.read())
180
  except Exception:
181
  pass
182
 
183
+ # If count.json exists in the working directory: load into count_data for potential update
184
  if os.path.exists(COUNT_JSON_PATH):
185
  with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
186
  count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
187
  # Else initialize count_data with orderedDict
188
+ # This happens when there is no count.json (both working directory and HuggingFace dataset)
189
  else:
190
  count_data = collections.OrderedDict()
191
 
 
203
  count_data[filename] = 0
204
  updated = True
205
 
206
+ # Write updated count_data to /home/user/app/count.json
207
  if updated or not os.path.exists(COUNT_JSON_PATH):
208
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
209
  json.dump(count_data, f, indent=4, ensure_ascii=False)
210
 
211
+ return
212
 
213
  # Shorten the time of playing previous audio when reached next question
214
  def append_cache_buster(audio_path):
215
  return f"{audio_path}?t={int(time.time() * 1000)}"
216
 
217
+ # Function that samples questions from available question set
 
 
218
 
219
+ # This version utilizes a given count_data to sample audio paths
220
+ """def sample_audio_paths(audio_paths, count_data, k=5, max_count=1): # k for questions per test; max_count for question limit in total
221
  eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
222
+
223
  if len(eligible_paths) < k:
224
  raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
225
 
226
+ # Shuffle to avoid fixed selections resulting from directory structure
227
+ selected = random.sample(eligible_paths, k)
 
 
 
228
 
229
+ # Once sampled a test, update these questions immediately
230
  for path in selected:
231
  filename = os.path.basename(path)
232
  count_data[filename] = count_data.get(filename, 0) + 1
233
 
234
+ # Add filelock to /workspace/count.json
235
+ lock_path = COUNT_JSON_PATH + ".lock"
236
+ with FileLock(lock_path, timeout=10):
237
+ with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
238
+ json.dump(count_data, f, indent=4, ensure_ascii=False)
239
 
240
  return selected, count_data"""
241
 
242
+ # This version places file reading into filelock to guarantee correct update of count.json
243
+ def sample_audio_paths(audio_paths, k=5, max_count=1):
244
+ # Add filelock to /workspace/count.json
245
+ lock_path = COUNT_JSON_PATH + ".lock"
246
 
247
+ # Load newest count.json
248
+ with FileLock(lock_path, timeout=10):
249
+ with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
250
+ count_data = json.load(f)
251
+
252
+ eligible_paths = [
253
+ p for p in audio_paths
254
+ if count_data.get(os.path.basename(p), 0) < max_count
255
+ ]
256
 
257
+ if len(eligible_paths) < k:
258
+ raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
259
 
260
+ selected = random.sample(eligible_paths, k)
 
 
261
 
262
+ # Update count_data
263
+ for path in selected:
264
+ filename = os.path.basename(path)
265
+ count_data[filename] = count_data.get(filename, 0) + 1
266
+
267
+ # Update count.json
268
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
269
  json.dump(count_data, f, indent=4, ensure_ascii=False)
270
 
271
+ # return selected, count_data
272
+ # Keep count_data atomic
273
+ return selected
274
+
275
+ # ==============================================================================
276
+ # Frontend Function Definitions
277
+ # ==============================================================================
278
 
279
  # Save question_set in each user_data_state, preventing global sharing
280
  def start_challenge(user_data_state):
281
 
282
+ load_or_initialize_count_json(all_data_audio_paths)
283
+ # selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, k=5)
284
+ # Keep count_data atomic
285
+ selected_audio_paths = sample_audio_paths(all_data_audio_paths, k=5)
286
 
287
  question_set = [
288
  {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
 
290
  ]
291
 
292
  user_data_state["question_set"] = question_set
 
 
293
 
294
+ # count_data is not needed in the user data
295
+ # user_data_state["updated_count_data"] = updated_count_data
296
+
297
+ return gr.update(visible=False), gr.update(visible=True), user_data_state
298
+
299
+ # This function toggles the visibility of the "其他(请注明)" input field based on the selected education choice
300
  def toggle_education_other(choice):
301
  is_other = (choice == "其他(请注明)")
302
  return gr.update(visible=is_other, interactive=is_other, value="")
303
 
304
+ # This function checks if the user information is complete
305
  def check_info_complete(username, age, gender, education, education_other, ai_experience):
306
  if username.strip() and age and gender and education and ai_experience:
307
  if education == "其他(请注明)" and not education_other.strip():
 
309
  return gr.update(interactive=True)
310
  return gr.update(interactive=False)
311
 
312
+ # This function updates user_data and initializes the sample page (called when user submits their info)
313
  def show_sample_page_and_init(username, age, gender, education, education_other, ai_experience, user_data):
314
  final_edu = education_other if education == "其他(请注明)" else education
315
  user_data.update({
 
378
 
379
  def init_test_question(user_data, q_idx):
380
  d_idx = 0
 
 
381
  question = user_data["question_set"][q_idx]
382
  progress_q = f"第 {q_idx + 1} / {len(user_data['question_set'])} 题"
383
 
 
440
  next_btn_update,
441
  ) + tuple(slider_updates)
442
 
443
+ def toggle_reference_view(current):
444
+ if current == "参考":
445
+ return gr.update(visible=False), gr.update(visible=True), gr.update(value="返回")
446
+ else:
447
+ return gr.update(visible=True), gr.update(visible=False), gr.update(value="参考")
448
+
449
+ def back_to_welcome():
450
+ return (
451
+ gr.update(visible=True), # welcome_page
452
+ gr.update(visible=False), # info_page
453
+ gr.update(visible=False), # sample_page
454
+ gr.update(visible=False), # pretest_page
455
+ gr.update(visible=False), # test_page
456
+ gr.update(visible=False), # final_judgment_page
457
+ gr.update(visible=False), # result_page
458
+ {}, # user_data_state
459
+ 0, # current_question_index
460
+ 0, # current_test_dimension_index
461
+ {}, # current_question_selections
462
+ [] # test_results
463
+ )
464
+
465
  # ==============================================================================
466
+ # Retry Function Definitions
467
  # ==============================================================================
468
 
469
  # Function for handling connection error
 
789
  commit_message=f"Update count.json after submission by {username}"
790
  )
791
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
  # ==============================================================================
793
  # Gradio 界面定义 (Gradio UI Definition)
794
  # ==============================================================================