Spaces:

Lesterchia1
/

FPOC2_AI-Tutor_Chatbot

Running

App Files Files Community

Chia Woon Yap committed on Nov 21, 2025

Commit

93ff155

verified ·

1 Parent(s): 1376d34

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -87

app.py CHANGED Viewed

@@ -31,7 +31,6 @@ import re
 import torch
 import torchaudio
-from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
 # Set API Key
 groq.api_key = os.getenv("GROQ_API_KEY")
@@ -87,133 +86,143 @@ Answer: d) 0.4
 Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
 """
-# Enhanced Whisper Transcriber with Chunked Processing
-class EnhancedWhisperTranscriber:
-    def __init__(self, model_name=None):
-        # Auto-select optimal model based on hardware
-        if model_name is None:
-            model_name = self.get_optimal_model()
         self.device = 0 if torch.cuda.is_available() else "cpu"
         self.model_name = model_name
         print(f"Initializing Whisper model: {model_name} on {self.device}")
-        self.pipe = pipeline(
-            task="automatic-speech-recognition",
-            model=model_name,
-            chunk_length_s=30,  # Process in 30-second chunks
-            stride_length_s=5,  # 5-second overlap between chunks
-            device=self.device,
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        )
-    def get_optimal_model(self):
-        """Automatically select the best model for available hardware"""
-        if torch.cuda.is_available():
-            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
-            if gpu_memory > 8:  # 8GB+ VRAM
-                return "openai/whisper-small.en"
-            else:  # Limited VRAM
-                return "openai/whisper-base.en"
-        else:  # CPU only
-            return "openai/whisper-base.en"  # Balanced choice for CPU
-    def transcribe_numpy(self, sr, y, return_timestamps=False):
-        """Transcribe numpy array audio with chunked processing"""
         try:
-            # Enhanced audio preprocessing
             if y.ndim > 1:
                 y = y.mean(axis=1)  # Convert to mono
             y = y.astype(np.float32)
-            # Normalize audio
             max_val = np.max(np.abs(y))
             if max_val > 0:
                 y = y / max_val
-            # Remove silence (simple threshold-based)
-            silence_threshold = 0.01
-            non_silent_indices = np.where(np.abs(y) > silence_threshold)[0]
-            if len(non_silent_indices) == 0:
-                return "No speech detected. Please speak louder or check your microphone."
-            # Trim silence from beginning and end
-            start_idx = non_silent_indices[0]
-            end_idx = non_silent_indices[-1]
-            y_trimmed = y[start_idx:end_idx+1]
-            # Check if audio is too short
-            if len(y_trimmed) / sr < 0.5:  # Less than 0.5 seconds
-                return "Audio too short. Please speak for at least 1-2 seconds."
-            # Create audio dict for pipeline
-            inputs = {"array": y_trimmed, "sampling_rate": sr}
-            # Enhanced transcription with chunked processing
-            result = self.pipe(
-                inputs,
-                batch_size=4,  # Optimal batch size for chunked processing
-                generate_kwargs={"task": "transcribe"},
-                return_timestamps=return_timestamps
-            )
             text = result["text"].strip()
             if not text:
-                return "No clear speech detected. Try speaking more clearly or in a quieter environment."
             return text
         except Exception as e:
             error_msg = f"Transcription error: {str(e)}"
-            print(error_msg)
-            return f"Sorry, I couldn't process the audio. Please try again or type your message instead."
-# Initialize the enhanced transcriber
-transcriber = EnhancedWhisperTranscriber()
 def get_transcription_status(audio):
     """Provide status feedback for transcription"""
     if audio is None:
         return "Ready to record audio"
-    sr, y = audio
-    duration = len(y) / sr if sr > 0 else 0
-    if duration < 0.5:
-        return "Audio too short - please record at least 1 second"
-    elif duration > 60 and not torch.cuda.is_available():
-        return "Long audio detected on CPU - this may take a while..."
-    else:
-        device = "GPU" if torch.cuda.is_available() else "CPU"
-        return f"Processing {duration:.1f}s audio on {device}..."
 def transcribe_audio(audio):
-    """Main transcription function with progress feedback"""
     if audio is None:
         return "Please record audio first"
-    # Show device info for debugging
-    device_type = "GPU" if torch.cuda.is_available() else "CPU"
-    print(f"Transcribing on {device_type} using {transcriber.model_name}")
-    sr, y = audio
-    # For CPU users, we might want to show a warning for long audio
-    audio_duration = len(y) / sr if sr > 0 else 0
-    if not torch.cuda.is_available() and audio_duration > 30:  # Longer than 30 seconds on CPU
-        print("Warning: Long audio on CPU - transcription may take a while...")
-    # Use the enhanced transcriber
-    result = transcriber.transcribe_numpy(sr, y)
-    # Log transcription result for debugging
-    print(f"Transcription result: {result[:100]}...")
-    return result
 # Function to clean AI response by removing unwanted formatting
 def clean_response(response):

 import torch
 import torchaudio
 # Set API Key
 groq.api_key = os.getenv("GROQ_API_KEY")
 Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
 """
# Simplified and Robust Whisper Transcriber
class SimpleWhisperTranscriber:
    """Speech-to-text helper built on a Hugging Face ASR ``pipeline``.

    The constructor loads ``model_name`` and falls back to a smaller model if
    that fails.  ``transcribe_numpy`` never raises: every outcome, including
    failures, is reported as a user-facing string.
    """

    # Model tried when the requested one cannot be loaded.
    FALLBACK_MODEL = "openai/whisper-tiny.en"
    # Whisper works on 30-second windows; chunking lets the pipeline handle
    # longer recordings instead of failing on them.
    CHUNK_LENGTH_S = 30
    # Recordings shorter than this are rejected before inference.
    MIN_DURATION_S = 0.3
    # Short phrases the model tends to emit for silence or noise.
    FALSE_POSITIVES = frozenset(
        {"", "you", "thank you", "thanks for watching", "hello", "hi"}
    )
    # Characters ignored at both ends of a transcript when checking
    # FALSE_POSITIVES, so that "Thank you." still matches "thank you".
    _EDGE_CHARS = " \t\r\n.,!?;:'\"-"

    def __init__(self, model_name="openai/whisper-base.en"):
        """Load the ASR pipeline for ``model_name``.

        Args:
            model_name: Hugging Face model id to load.

        Raises:
            Exception: whatever the import or the fallback load raises when
                neither the requested nor the fallback model can be created.
        """
        # Imported locally: the visible module-level imports no longer bring
        # ``pipeline`` into scope, which would make every call below a
        # NameError.
        from transformers import pipeline

        self.device = 0 if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        print(f"Initializing Whisper model: {model_name} on {self.device}")
        try:
            self.pipe = pipeline(
                task="automatic-speech-recognition",
                model=model_name,
                chunk_length_s=self.CHUNK_LENGTH_S,
                device=self.device,
            )
            print("✅ Whisper model loaded successfully")
        except Exception as e:
            print(f"❌ Error loading Whisper model: {e}")
            # Fallback to tiny model if the requested one fails; keep
            # ``model_name`` truthful about what is actually loaded.
            self.model_name = self.FALLBACK_MODEL
            self.pipe = pipeline(
                task="automatic-speech-recognition",
                model=self.FALLBACK_MODEL,
                chunk_length_s=self.CHUNK_LENGTH_S,
                device=self.device,
            )

    def transcribe_numpy(self, sr, y):
        """Transcribe raw audio samples and return text or a status message.

        Args:
            sr: Sample rate in Hz; must be a positive number.
            y: Audio samples.  Arrays with more than one dimension are
                averaged over axis 1 to produce mono
                (assumes a ``(samples, channels)`` layout - TODO confirm
                against the caller).

        Returns:
            The stripped transcript, or a human-readable message when the
            input is invalid, too short, yields no meaningful speech, or
            processing fails.
        """
        try:
            if sr is None or y is None or sr <= 0:
                return "Invalid audio data. Please try recording again."

            y = np.asarray(y)
            print(f"Audio shape: {y.shape}, Sample rate: {sr}")

            if y.ndim > 1:
                y = y.mean(axis=1)  # Convert to mono
            y = y.astype(np.float32)

            # Check the length before normalising: np.max() raises on an
            # empty array, and short clips need no further work anyway.
            audio_duration = len(y) / sr
            print(f"Audio duration: {audio_duration:.2f} seconds")
            if audio_duration < self.MIN_DURATION_S:
                return "Audio too short. Please speak for at least 1 second."

            # Peak-normalise to [-1, 1]; all-zero audio is left untouched.
            max_val = np.max(np.abs(y))
            if max_val > 0:
                y = y / max_val
            print(
                f"After preprocessing - shape: {y.shape}, "
                f"max: {np.max(y)}, min: {np.min(y)}"
            )

            audio_input = {"array": y, "sampling_rate": sr}

            print("Starting transcription...")
            result = self.pipe(audio_input)
            print("Transcription completed")

            text = result["text"].strip()
            print(f"Raw transcription: '{text}'")
            if not text:
                return "No speech detected. Please try speaking more clearly."

            # Compare without surrounding punctuation, otherwise "Thank you."
            # would slip past the filter.
            if text.lower().strip(self._EDGE_CHARS) in self.FALSE_POSITIVES:
                return "No meaningful speech detected. Please try again with clearer audio."
            return text
        except Exception as e:
            error_msg = f"Transcription error: {str(e)}"
            print(f"❌ {error_msg}")
            # Return more specific error message
            return f"Audio processing failed: {str(e)}"
# Initialize the transcriber.
# Loading is best-effort: if the model cannot be created the app keeps
# running with ``transcriber = None``, which transcribe_audio() checks before
# use.
try:
    transcriber = SimpleWhisperTranscriber()
    print("✅ Transcriber initialized successfully")
except Exception as e:
    print(f"❌ Failed to initialize transcriber: {e}")
    transcriber = None
 def get_transcription_status(audio):
     """Provide status feedback for transcription"""
     if audio is None:
         return "Ready to record audio"
+    try:
+        sr, y = audio
+        duration = len(y) / sr if sr > 0 else 0
+        if duration < 0.5:
+            return "Audio too short - please record at least 1 second"
+        elif duration > 60 and not torch.cuda.is_available():
+            return "Long audio detected on CPU - this may take a while..."
+        else:
+            device = "GPU" if torch.cuda.is_available() else "CPU"
+            return f"Processing {duration:.1f}s audio on {device}..."
+    except Exception as e:
+        return f"Error analyzing audio: {str(e)}"
 def transcribe_audio(audio):
+    """Main transcription function with better error handling"""
     if audio is None:
         return "Please record audio first"
+    if transcriber is None:
+        return "Transcription service not available. Please type your message."
+    try:
+        sr, y = audio
+        # Basic validation
+        if sr is None or y is None or len(y) == 0:
+            return "Invalid audio data. Please try recording again."
+        print(f"=== Starting Transcription ===")
+        print(f"Sample rate: {sr}, Audio length: {len(y)}")
+        result = transcriber.transcribe_numpy(sr, y)
+        print(f"=== Transcription Result ===")
+        print(f"Result: '{result}'")
+        return result
+    except Exception as e:
+        error_msg = f"Unexpected error: {str(e)}"
+        print(f"❌ {error_msg}")
+        return "Failed to process audio. Please try typing your message instead."
 # Function to clean AI response by removing unwanted formatting
 def clean_response(response):