Spaces:

Coco-18
/

Kapamtalk

Sleeping

App Files Files Community

Coco-18 committed on Apr 2, 2025

Commit

ff4f467

verified ·

1 Parent(s): 22eafa7

Update evaluate.py

Browse files

Files changed (1) hide show

evaluate.py +255 -183

evaluate.py CHANGED Viewed

@@ -6,6 +6,8 @@ import logging
 import traceback
 import tempfile
 import shutil
 from difflib import SequenceMatcher
 import torch
 import torchaudio
@@ -30,10 +32,17 @@ REFERENCE_CACHE = {}
 # Traditional evaluation cache for quick responses to identical requests
 EVALUATION_CACHE = {}
-# A flag to indicate if preprocessing is complete
 PREPROCESSING_COMPLETE = False
 PREPROCESSING_LOCK = threading.Lock()
 PREPROCESSING_THREAD = None
 def calculate_similarity(text1, text2):
     """Calculate text similarity percentage."""
@@ -46,6 +55,92 @@ def calculate_similarity(text1, text2):
     matcher = SequenceMatcher(None, clean1, clean2)
     return matcher.ratio() * 100
 def setup_reference_patterns(reference_dir, sample_rate=16000):
     """Create standard reference pattern directories without dummy files"""
     reference_patterns = [
@@ -159,76 +254,134 @@ def preprocess_reference_file(ref_file, sample_rate, asr_model, asr_processor):
 def preprocess_all_references(reference_dir, sample_rate=16000):
     """Preprocess all reference audio files at startup"""
-    global PREPROCESSING_COMPLETE, REFERENCE_CACHE
-    logger.info("🚀 Starting preprocessing of all reference audio files...")
-    # Get ASR model and processor
-    asr_model = get_asr_model()
-    asr_processor = get_asr_processor()
-    if asr_model is None or asr_processor is None:
-        logger.error("❌ Cannot preprocess reference audio - ASR models not loaded")
         return False
     try:
-        pattern_dirs = [d for d in os.listdir(reference_dir)
-                     if os.path.isdir(os.path.join(reference_dir, d))]
-        total_processed = 0
-        start_time = time.time()
-        # Process each reference pattern directory
-        for pattern in pattern_dirs:
-            pattern_path = os.path.join(reference_dir, pattern)
-            reference_files = glob.glob(os.path.join(pattern_path, "*.wav"))
-            reference_files = [f for f in reference_files if "dummy_reference" not in f]
-            if not reference_files:
-                continue
-            # Initialize cache for this pattern if needed
-            if pattern not in REFERENCE_CACHE:
-                REFERENCE_CACHE[pattern] = {}
-            logger.info(f"🔄 Preprocessing {len(reference_files)} references for pattern: {pattern}")
-            # Determine optimal number of workers
-            max_workers = min(os.cpu_count() or 4, len(reference_files), 5)
-            # Process files in parallel
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                tasks = {
-                    executor.submit(preprocess_reference_file, ref_file, sample_rate, asr_model, asr_processor):
-                    ref_file for ref_file in reference_files
-                }
-                for future in tasks:
-                    ref_file = tasks[future]
-                    try:
-                        result = future.result()
-                        if result:
-                            REFERENCE_CACHE[pattern][os.path.basename(ref_file)] = result
-                            total_processed += 1
-                    except Exception as e:
-                        logger.error(f"❌ Failed to process {ref_file}: {str(e)}")
-        elapsed_time = time.time() - start_time
-        logger.info(f"✅ Preprocessing complete! Processed {total_processed} reference files in {elapsed_time:.2f} seconds")
-        with PREPROCESSING_LOCK:
-            PREPROCESSING_COMPLETE = True
-        return True
     except Exception as e:
-        logger.error(f"❌ Error during reference preprocessing: {str(e)}")
-        logger.debug(f"Stack trace: {traceback.format_exc()}")
         return False
 def start_preprocessing_thread(reference_dir, sample_rate=16000):
     """Start preprocessing in a background thread"""
-    global PREPROCESSING_THREAD
     def preprocessing_worker():
         preprocess_all_references(reference_dir, sample_rate)
@@ -238,6 +391,7 @@ def start_preprocessing_thread(reference_dir, sample_rate=16000):
     PREPROCESSING_THREAD.start()
     logger.info("🧵 Started reference audio preprocessing in background thread")
 def init_reference_audio(reference_dir, output_dir):
     """Initialize reference audio directories and start preprocessing"""
@@ -359,114 +513,6 @@ def init_reference_audio(reference_dir, output_dir):
             logger.critical("💥 CRITICAL: Failed to create even a fallback directory")
             return reference_dir
-def handle_upload_reference(request, reference_dir, sample_rate):
-    """Handle upload of reference audio files and preprocess immediately"""
-    global REFERENCE_CACHE
-    try:
-        if "audio" not in request.files:
-            logger.warning("⚠️ Reference upload missing audio file")
-            return jsonify({"error": "No audio file uploaded"}), 400
-        reference_word = request.form.get("reference_word", "").strip()
-        if not reference_word:
-            logger.warning("⚠️ Reference upload missing reference word")
-            return jsonify({"error": "No reference word provided"}), 400
-        # Validate reference word
-        reference_patterns = [
-            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun", "mayap_a_bengi",
-            "komusta_ka", "malaus_ko_pu", "malaus_kayu", "agaganaka_da_ka",
-            "pagdulapan_da_ka", "kaluguran_da_ka", "dakal_a_salamat", "panapaya_mu_ku",
-            "wa", "ali", "tuknang", "lagwa", "galo", "buri_ke_ini", "tara_na",
-            "nokarin_ka_ibat", "nokarin_ka_munta", "atiu_na_ku", "nanung_panayan_mu",
-            "mako_na_ka", "muli_ta_na", "nanu_ing_pengan_mu", "mekeni", "mengan_na_ka",
-            "munta_ka_karin", "magkanu_ini", "mimingat_ka", "mangan_ta_na", "lakwan_da_ka",
-            "nanu_maliari_kung_daptan_keka", "pilan_na_ka_banwa", "saliwan_ke_ini",
-            "makananu_munta_king", "adwa", "anam", "apat", "apulu", "atlu", "dinalan", "libu", "lima",
-            "metung", "pitu", "siyam", "walu", "masala", "madalumdum", "maragul", "marimla", "malagu", "marok", "mababa", "malapit", "matuling", "maputi",
-            "arung", "asbuk", "balugbug", "bitis", "buntuk", "butit", "gamat", "kuku", "salu", "tud",
-            "pisan", "dara", "achi", "apu", "ima", "tatang", "pengari", "koya", "kapatad", "wali",
-            "pasbul", "awang", "dagis", "bale", "ulas", "sambra", "sulu", "pitudturan", "luklukan", "ulnan"
-        ]
-        if reference_word not in reference_patterns:
-            logger.warning(f"⚠️ Invalid reference word: {reference_word}")
-            return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400
-        # Make sure we have a writable reference directory
-        if not os.path.exists(reference_dir):
-            reference_dir = os.path.join('/tmp', 'reference_audios')
-            os.makedirs(reference_dir, exist_ok=True)
-            logger.warning(f"⚠️ Using alternate reference directory for upload: {reference_dir}")
-        # Create directory for reference pattern if it doesn't exist
-        pattern_dir = os.path.join(reference_dir, reference_word)
-        os.makedirs(pattern_dir, exist_ok=True)
-        # Save the reference audio file
-        audio_file = request.files["audio"]
-        filename = secure_filename(audio_file.filename)
-        # Ensure filename has .wav extension
-        if not filename.lower().endswith('.wav'):
-            base_name = os.path.splitext(filename)[0]
-            filename = f"{base_name}.wav"
-        file_path = os.path.join(pattern_dir, filename)
-        # Create a temporary file first, then convert to WAV
-        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
-            audio_file.save(temp_file.name)
-            temp_path = temp_file.name
-        try:
-            # Process the audio file
-            audio = AudioSegment.from_file(temp_path)
-            audio = audio.set_frame_rate(sample_rate).set_channels(1)
-            audio.export(file_path, format="wav")
-            logger.info(f"✅ Reference audio saved successfully for {reference_word}: {file_path}")
-            # Clean up temp file
-            try:
-                os.unlink(temp_path)
-            except:
-                pass
-            # Immediately preprocess this new reference file and add to cache
-            asr_model = get_asr_model()
-            asr_processor = get_asr_processor()
-            if asr_model and asr_processor:
-                # Initialize cache for this pattern if needed
-                if reference_word not in REFERENCE_CACHE:
-                    REFERENCE_CACHE[reference_word] = {}
-                # Preprocess and add to cache
-                result = preprocess_reference_file(file_path, sample_rate, asr_model, asr_processor)
-                if result:
-                    REFERENCE_CACHE[reference_word][filename] = result
-                    logger.info(f"✅ New reference audio preprocessed and added to cache: {filename}")
-        except Exception as e:
-            logger.error(f"❌ Reference audio processing failed: {str(e)}")
-            return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
-        # Count how many references we have now
-        references = glob.glob(os.path.join(pattern_dir, "*.wav"))
-        return jsonify({
-            "message": "Reference audio uploaded successfully",
-            "reference_word": reference_word,
-            "file": filename,
-            "total_references": len(references),
-            "preprocessed": True
-        })
-    except Exception as e:
-        logger.error(f"❌ Unhandled exception in reference upload: {str(e)}")
-        logger.debug(f"Stack trace: {traceback.format_exc()}")
-        return jsonify({"error": f"Internal server error: {str(e)}"}), 500
 def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
     """Handle pronunciation evaluation requests with preprocessing optimization"""
     global REFERENCE_CACHE, PREPROCESSING_COMPLETE
@@ -474,6 +520,9 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
     request_id = f"req-{id(request)}"
     logger.info(f"[{request_id}] 🆕 Starting pronunciation evaluation request")
     temp_dir = None
     # Get the ASR model and processor using the getter functions
@@ -482,12 +531,16 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
     if asr_model is None or asr_processor is None:
         logger.error(f"[{request_id}] ❌ Evaluation endpoint called but ASR models aren't loaded")
         return jsonify({"error": "ASR model not available"}), 503
     try:
         # Check for basic request requirements
         if "audio" not in request.files:
             logger.warning(f"[{request_id}] ⚠️ Evaluation request missing audio file")
             return jsonify({"error": "No audio file uploaded"}), 400
         audio_file = request.files["audio"]
@@ -497,6 +550,8 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
         # Validate reference locator
         if not reference_locator:
             logger.warning(f"[{request_id}] ⚠️ No reference locator provided")
             return jsonify({"error": "Reference locator is required"}), 400
         # OPTIMIZATION: Simple caching based on audio content hash + reference_locator
@@ -509,6 +564,8 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
         # Check in-memory cache using the module-level cache
         if cache_key in EVALUATION_CACHE:
             logger.info(f"[{request_id}] ✅ Using cached evaluation result")
             return EVALUATION_CACHE[cache_key]
         # Construct full reference directory path
@@ -522,6 +579,8 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
                 logger.warning(f"[{request_id}] ⚠️ Created missing reference directory: {reference_dir_path}")
             except Exception as e:
                 logger.error(f"[{request_id}] ❌ Failed to create reference directory: {str(e)}")
                 return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404
         # Check for reference files
@@ -533,6 +592,8 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
         # If no reference files exist, return a more detailed error message
         if not reference_files:
             logger.warning(f"[{request_id}] ⚠️ No valid reference audio files found in {reference_dir_path}")
             return jsonify({
                 "error": f"No reference audio found for {reference_locator}",
                 "message": "Please upload a reference audio file before evaluation.",
@@ -566,6 +627,8 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
             user_audio_path = processed_path
         except Exception as e:
             logger.error(f"[{request_id}] ❌ Audio processing failed: {str(e)}")
             return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
         # Transcribe user audio
@@ -575,6 +638,8 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
             logger.info(f"[{request_id}] ✅ User transcription: '{user_transcription}'")
         except Exception as e:
             logger.error(f"[{request_id}] ❌ ASR inference failed: {str(e)}")
             return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
         # Check if we have preprocessed data for this reference locator
@@ -683,9 +748,6 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
                         additional_files = remaining_files[:5]  # Process max 5 more
                         # Process remaining files
-                        additional_results = list(executor.map(process_reference_file, additional_files))
-                        all_results.extend(additional_results)
         # Clean up temp files
         try:
             if temp_dir and os.path.exists(temp_dir):
@@ -738,7 +800,7 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
             "total_references_compared": len(all_results),
             "total_available_references": len(reference_files),
             "used_preprocessed_data": using_preprocessed,
-            "preprocessing_complete": PREPROCESSING_COMPLETE
         })
         # Cache the result for future identical requests
@@ -748,40 +810,50 @@ def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
             # Remove oldest entry (simplified approach)
             EVALUATION_CACHE.pop(next(iter(EVALUATION_CACHE)))
         return response
-    except Exception as e:
-        logger.error(f"[{request_id}] ❌ Unhandled exception in evaluation endpoint: {str(e)}")
-        logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
-        # Clean up on error
-        try:
-            if temp_dir and os.path.exists(temp_dir):
-                shutil.rmtree(temp_dir)
-        except:
-            pass
-        return jsonify({"error": f"Internal server error: {str(e)}"}), 500
-# Add a new function to get preprocessing status
 def get_preprocessing_status():
     """Get the current status of reference audio preprocessing"""
-    global PREPROCESSING_COMPLETE, REFERENCE_CACHE
     with PREPROCESSING_LOCK:
         is_complete = PREPROCESSING_COMPLETE
     # Count total preprocessed references
     preprocessed_count = 0
     for pattern, files in REFERENCE_CACHE.items():
         preprocessed_count += len(files)
     # Check if preprocessing thread is alive
     thread_running = PREPROCESSING_THREAD is not None and PREPROCESSING_THREAD.is_alive()
     return {
         "complete": is_complete,
         "preprocessed_files": preprocessed_count,
         "patterns_cached": len(REFERENCE_CACHE),
-        "thread_running": thread_running
-    }

 import traceback
 import tempfile
 import shutil
+import json
+import fcntl
 from difflib import SequenceMatcher
 import torch
 import torchaudio
 # Traditional evaluation cache for quick responses to identical requests
 EVALUATION_CACHE = {}
+# Flags to manage preprocessing state
 PREPROCESSING_COMPLETE = False
+PREPROCESSING_ACTIVE = False
 PREPROCESSING_LOCK = threading.Lock()
 PREPROCESSING_THREAD = None
+PREPROCESSING_PAUSE = threading.Event()  # Event for pausing/resuming preprocessing
+PREPROCESSING_PAUSE.set()  # Start in "resumed" state
+# Lock file for ensuring only one preprocessing thread runs system-wide
+LOCK_FILE = "/tmp/speech_api_preprocessing.lock"
+_lock_file_handle = None  # Global variable to hold the lock file handle
 def calculate_similarity(text1, text2):
     """Calculate text similarity percentage."""
     matcher = SequenceMatcher(None, clean1, clean2)
     return matcher.ratio() * 100
+def acquire_preprocessing_lock():
+    """Attempt to acquire the system-wide preprocessing lock using a lock file.
+    Returns True if lock was acquired, False otherwise"""
+    try:
+        # Check if lock file exists, create it if not
+        if not os.path.exists(LOCK_FILE):
+            with open(LOCK_FILE, 'w') as f:
+                f.write(str(os.getpid()))
+        # Try to get an exclusive lock on the file
+        lock_file = open(LOCK_FILE, 'r+')
+        try:
+            fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            # If we get here, we have the lock
+            # Update with current PID
+            lock_file.seek(0)
+            lock_file.write(str(os.getpid()))
+            lock_file.truncate()
+            lock_file.flush()
+            # Store the file handle to maintain the lock
+            global _lock_file_handle
+            _lock_file_handle = lock_file
+            logger.info("🔒 Acquired preprocessing lock")
+            return True
+        except IOError:
+            # Another process has the lock
+            lock_file.close()
+            logger.info("⚠️ Another process is already running preprocessing")
+            return False
+    except Exception as e:
+        logger.error(f"❌ Error acquiring preprocessing lock: {str(e)}")
+        return False
+def release_preprocessing_lock():
+    """Release the preprocessing lock if we have it"""
+    global _lock_file_handle
+    if '_lock_file_handle' in globals() and _lock_file_handle:
+        try:
+            fcntl.flock(_lock_file_handle, fcntl.LOCK_UN)
+            _lock_file_handle.close()
+            logger.info("🔓 Released preprocessing lock")
+        except Exception as e:
+            logger.error(f"❌ Error releasing preprocessing lock: {str(e)}")
+def save_preprocessing_state(reference_dir, state=None):
+    """Save the current preprocessing state to a file"""
+    state_file = os.path.join(reference_dir, ".preprocessing_state.json")
+    if state is None:
+        # Generate current state
+        state = {
+            "complete": PREPROCESSING_COMPLETE,
+            "active": PREPROCESSING_ACTIVE,
+            "patterns_cached": list(REFERENCE_CACHE.keys()),
+            "timestamp": time.time(),
+            "pid": os.getpid()
+        }
+    try:
+        with open(state_file, 'w') as f:
+            json.dump(state, f)
+    except Exception as e:
+        logger.error(f"❌ Error saving preprocessing state: {str(e)}")
+def load_preprocessing_state(reference_dir):
+    """Load preprocessing state from a file"""
+    state_file = os.path.join(reference_dir, ".preprocessing_state.json")
+    if not os.path.exists(state_file):
+        return None
+    try:
+        with open(state_file, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        logger.error(f"❌ Error loading preprocessing state: {str(e)}")
+        return None
+def pause_preprocessing():
+    """Pause preprocessing temporarily"""
+    PREPROCESSING_PAUSE.clear()
+def resume_preprocessing():
+    """Resume preprocessing after pause"""
+    PREPROCESSING_PAUSE.set()
 def setup_reference_patterns(reference_dir, sample_rate=16000):
     """Create standard reference pattern directories without dummy files"""
     reference_patterns = [
 def preprocess_all_references(reference_dir, sample_rate=16000):
     """Preprocess all reference audio files at startup"""
+    global PREPROCESSING_COMPLETE, REFERENCE_CACHE, PREPROCESSING_ACTIVE
+    # Check if another process already has the lock
+    if not acquire_preprocessing_lock():
+        logger.info("⏩ Skipping preprocessing as another process is already handling it")
         return False
     try:
+        logger.info("🚀 Starting preprocessing of all reference audio files...")
+        with PREPROCESSING_LOCK:
+            PREPROCESSING_ACTIVE = True
+        # Save initial state
+        save_preprocessing_state(reference_dir)
+        # Get ASR model and processor
+        asr_model = get_asr_model()
+        asr_processor = get_asr_processor()
+        if asr_model is None or asr_processor is None:
+            logger.error("❌ Cannot preprocess reference audio - ASR models not loaded")
+            with PREPROCESSING_LOCK:
+                PREPROCESSING_ACTIVE = False
+            save_preprocessing_state(reference_dir)
+            release_preprocessing_lock()
+            return False
+        try:
+            pattern_dirs = [d for d in os.listdir(reference_dir)
+                         if os.path.isdir(os.path.join(reference_dir, d))]
+            total_processed = 0
+            start_time = time.time()
+            # Process each reference pattern directory
+            for pattern in pattern_dirs:
+                # Wait if processing is paused
+                PREPROCESSING_PAUSE.wait()
+                pattern_path = os.path.join(reference_dir, pattern)
+                reference_files = glob.glob(os.path.join(pattern_path, "*.wav"))
+                reference_files = [f for f in reference_files if "dummy_reference" not in f]
+                if not reference_files:
+                    continue
+                # Initialize cache for this pattern if needed
+                if pattern not in REFERENCE_CACHE:
+                    REFERENCE_CACHE[pattern] = {}
+                logger.info(f"🔄 Preprocessing {len(reference_files)} references for pattern: {pattern}")
+                pattern_start_time = time.time()
+                # Determine optimal number of workers
+                max_workers = min(os.cpu_count() or 4, len(reference_files), 5)
+                processed_in_pattern = 0
+                # Process files in parallel
+                with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    tasks = {
+                        executor.submit(preprocess_reference_file, ref_file, sample_rate, asr_model, asr_processor):
+                        ref_file for ref_file in reference_files
+                    }
+                    for future in tasks:
+                        ref_file = tasks[future]
+                        try:
+                            result = future.result()
+                            if result:
+                                REFERENCE_CACHE[pattern][os.path.basename(ref_file)] = result
+                                total_processed += 1
+                                processed_in_pattern += 1
+                        except Exception as e:
+                            logger.error(f"❌ Failed to process {ref_file}: {str(e)}")
+                # Log completion of pattern processing
+                pattern_time = time.time() - pattern_start_time
+                logger.info(f"✅ Completed preprocessing pattern '{pattern}' - {processed_in_pattern}/{len(reference_files)} files in {pattern_time:.2f}s")
+                # Update state after each pattern
+                save_preprocessing_state(reference_dir)
+            elapsed_time = time.time() - start_time
+            logger.info(f"✅ Preprocessing complete! Processed {total_processed} reference files in {elapsed_time:.2f} seconds")
+            with PREPROCESSING_LOCK:
+                PREPROCESSING_COMPLETE = True
+                PREPROCESSING_ACTIVE = False
+            # Save final state
+            save_preprocessing_state(reference_dir)
+            release_preprocessing_lock()
+            return True
+        except Exception as e:
+            logger.error(f"❌ Error during reference preprocessing: {str(e)}")
+            logger.debug(f"Stack trace: {traceback.format_exc()}")
+            with PREPROCESSING_LOCK:
+                PREPROCESSING_ACTIVE = False
+            save_preprocessing_state(reference_dir)
+            release_preprocessing_lock()
+            return False
     except Exception as e:
+        logger.error(f"❌ Unhandled exception in preprocessing: {str(e)}")
+        with PREPROCESSING_LOCK:
+            PREPROCESSING_ACTIVE = False
+        save_preprocessing_state(reference_dir)
+        release_preprocessing_lock()
         return False
 def start_preprocessing_thread(reference_dir, sample_rate=16000):
     """Start preprocessing in a background thread"""
+    global PREPROCESSING_THREAD, PREPROCESSING_ACTIVE
+    # Check if we're already preprocessing
+    with PREPROCESSING_LOCK:
+        if PREPROCESSING_ACTIVE:
+            logger.info("⏩ Skipping preprocessing start as it's already active")
+            return False
+    # Load previous state if available
+    state = load_preprocessing_state(reference_dir)
+    if state and state.get("complete", False):
+        logger.info("⏩ Skipping preprocessing as previous run was completed")
+        with PREPROCESSING_LOCK:
+            PREPROCESSING_COMPLETE = True
+        return False
     def preprocessing_worker():
         preprocess_all_references(reference_dir, sample_rate)
     PREPROCESSING_THREAD.start()
     logger.info("🧵 Started reference audio preprocessing in background thread")
+    return True
 def init_reference_audio(reference_dir, output_dir):
     """Initialize reference audio directories and start preprocessing"""
             logger.critical("💥 CRITICAL: Failed to create even a fallback directory")
             return reference_dir
 def handle_evaluation_request(request, reference_dir, output_dir, sample_rate):
     """Handle pronunciation evaluation requests with preprocessing optimization"""
     global REFERENCE_CACHE, PREPROCESSING_COMPLETE
     request_id = f"req-{id(request)}"
     logger.info(f"[{request_id}] 🆕 Starting pronunciation evaluation request")
+    # Pause preprocessing while handling user request
+    pause_preprocessing()
     temp_dir = None
     # Get the ASR model and processor using the getter functions
     if asr_model is None or asr_processor is None:
         logger.error(f"[{request_id}] ❌ Evaluation endpoint called but ASR models aren't loaded")
+        # Resume preprocessing before returning
+        resume_preprocessing()
         return jsonify({"error": "ASR model not available"}), 503
     try:
         # Check for basic request requirements
         if "audio" not in request.files:
             logger.warning(f"[{request_id}] ⚠️ Evaluation request missing audio file")
+            # Resume preprocessing before returning
+            resume_preprocessing()
             return jsonify({"error": "No audio file uploaded"}), 400
         audio_file = request.files["audio"]
         # Validate reference locator
         if not reference_locator:
             logger.warning(f"[{request_id}] ⚠️ No reference locator provided")
+            # Resume preprocessing before returning
+            resume_preprocessing()
             return jsonify({"error": "Reference locator is required"}), 400
         # OPTIMIZATION: Simple caching based on audio content hash + reference_locator
         # Check in-memory cache using the module-level cache
         if cache_key in EVALUATION_CACHE:
             logger.info(f"[{request_id}] ✅ Using cached evaluation result")
+            # Resume preprocessing before returning
+            resume_preprocessing()
             return EVALUATION_CACHE[cache_key]
         # Construct full reference directory path
                 logger.warning(f"[{request_id}] ⚠️ Created missing reference directory: {reference_dir_path}")
             except Exception as e:
                 logger.error(f"[{request_id}] ❌ Failed to create reference directory: {str(e)}")
+                # Resume preprocessing before returning
+                resume_preprocessing()
                 return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404
         # Check for reference files
         # If no reference files exist, return a more detailed error message
         if not reference_files:
             logger.warning(f"[{request_id}] ⚠️ No valid reference audio files found in {reference_dir_path}")
+            # Resume preprocessing before returning
+            resume_preprocessing()
             return jsonify({
                 "error": f"No reference audio found for {reference_locator}",
                 "message": "Please upload a reference audio file before evaluation.",
             user_audio_path = processed_path
         except Exception as e:
             logger.error(f"[{request_id}] ❌ Audio processing failed: {str(e)}")
+            # Resume preprocessing before returning
+            resume_preprocessing()
             return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
         # Transcribe user audio
             logger.info(f"[{request_id}] ✅ User transcription: '{user_transcription}'")
         except Exception as e:
             logger.error(f"[{request_id}] ❌ ASR inference failed: {str(e)}")
+            # Resume preprocessing before returning
+            resume_preprocessing()
             return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
         # Check if we have preprocessed data for this reference locator
                         additional_files = remaining_files[:5]  # Process max 5 more
                         # Process remaining files
         # Clean up temp files
         try:
             if temp_dir and os.path.exists(temp_dir):
             "total_references_compared": len(all_results),
             "total_available_references": len(reference_files),
             "used_preprocessed_data": using_preprocessed,
+            "preprocessing_status": get_preprocessing_status()
         })
         # Cache the result for future identical requests
             # Remove oldest entry (simplified approach)
             EVALUATION_CACHE.pop(next(iter(EVALUATION_CACHE)))
+        # Resume preprocessing before returning
+        resume_preprocessing()
         return response
+# Report the current preprocessing status
 def get_preprocessing_status():
     """Get the current status of reference audio preprocessing"""
+    global PREPROCESSING_COMPLETE, REFERENCE_CACHE, PREPROCESSING_ACTIVE, PREPROCESSING_PAUSE
     with PREPROCESSING_LOCK:
         is_complete = PREPROCESSING_COMPLETE
+        is_active = PREPROCESSING_ACTIVE
     # Count total preprocessed references
     preprocessed_count = 0
+    reference_patterns_count = 0
     for pattern, files in REFERENCE_CACHE.items():
         preprocessed_count += len(files)
+        if len(files) > 0:
+            reference_patterns_count += 1
     # Check if preprocessing thread is alive
     thread_running = PREPROCESSING_THREAD is not None and PREPROCESSING_THREAD.is_alive()
+    # Check if preprocessing is currently paused
+    is_paused = not PREPROCESSING_PAUSE.is_set()
     return {
         "complete": is_complete,
+        "active": is_active,
+        "paused": is_paused,
         "preprocessed_files": preprocessed_count,
         "patterns_cached": len(REFERENCE_CACHE),
+        "completed_patterns": reference_patterns_count,
+        "thread_running": thread_running,
+        "pid": os.getpid()
+    }
+# Clean up resources when the module is unloaded
+def cleanup_resources():
+    """Clean up any resources when the module is unloaded/restarted"""
+    release_preprocessing_lock()
+# Register cleanup handler
+import atexit
+atexit.register(cleanup_resources)