Spaces:

Lesterchia1
/

FPOC2_AI-Tutor_Chatbot

Running

App Files Files Community

Chia Woon Yap committed on Nov 21, 2025

Commit

fb80bae

verified ·

1 Parent(s): f8542fd

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -116

app.py CHANGED Viewed

@@ -12,6 +12,7 @@ import os
 import time
 import groq
 import uuid
 # LangChain imports
 from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
@@ -27,10 +28,8 @@ import fitz  # PyMuPDF for PDFs
 import docx  # python-docx for Word files
 import gtts  # Google Text-to-Speech library
 from pptx import Presentation  # python-pptx for PowerPoint files
-import re
 import torch
-import torchaudio
 # Set API Key
 groq.api_key = os.getenv("GROQ_API_KEY")
@@ -86,143 +85,151 @@ Answer: d) 0.4
 Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
 """
-# Simplified and Robust Whisper Transcriber
-class SimpleWhisperTranscriber:
-    def __init__(self, model_name="openai/whisper-base.en"):
         self.device = 0 if torch.cuda.is_available() else "cpu"
-        self.model_name = model_name
-        print(f"Initializing Whisper model: {model_name} on {self.device}")
-        try:
-            # Simplified pipeline with minimal parameters
-            self.pipe = pipeline(
-                task="automatic-speech-recognition",
-                model=model_name,
-                device=self.device,
-            )
-            print("✅ Whisper model loaded successfully")
-        except Exception as e:
-            print(f"❌ Error loading Whisper model: {e}")
-            # Fallback to tiny model if base fails
-            self.pipe = pipeline(
-                task="automatic-speech-recognition",
-                model="openai/whisper-tiny.en",
-                device=self.device,
-            )
-    def transcribe_numpy(self, sr, y):
-        """Simplified and robust transcription"""
         try:
-            print(f"Audio shape: {y.shape}, Sample rate: {sr}")
-            # Basic preprocessing - keep it simple
-            if y.ndim > 1:
-                y = y.mean(axis=1)  # Convert to mono
-            # Convert to proper data type
-            y = y.astype(np.float32)
-            # Simple normalization
-            max_val = np.max(np.abs(y))
-            if max_val > 0:
-                y = y / max_val
-            print(f"After preprocessing - shape: {y.shape}, max: {np.max(y)}, min: {np.min(y)}")
-            # Check audio length
             audio_duration = len(y) / sr
             print(f"Audio duration: {audio_duration:.2f} seconds")
-            if audio_duration < 0.3:
-                return "Audio too short. Please speak for at least 1 second."
-            # Create audio input - SIMPLIFIED
-            audio_input = {
-                "array": y,
-                "sampling_rate": sr
-            }
-            # Simple transcription call
             print("Starting transcription...")
-            result = self.pipe(audio_input)
-            print("Transcription completed")
-            text = result["text"].strip()
-            print(f"Raw transcription: '{text}'")
-            if not text:
-                return "No speech detected. Please try speaking more clearly."
-            # Check for common false positives
-            false_positives = ["", "you", "thank you", "thanks for watching", "hello", "hi"]
-            if text.lower() in false_positives:
-                return "No meaningful speech detected. Please try again with clearer audio."
-            return text
         except Exception as e:
-            error_msg = f"Transcription error: {str(e)}"
-            print(f"❌ {error_msg}")
-            # Return more specific error message
-            return f"Audio processing failed: {str(e)}"
-# Initialize the transcriber
 try:
-    transcriber = SimpleWhisperTranscriber()
-    print("✅ Transcriber initialized successfully")
 except Exception as e:
-    print(f"❌ Failed to initialize transcriber: {e}")
     transcriber = None
 def get_transcription_status(audio):
-    """Provide status feedback for transcription"""
     if audio is None:
-        return "Ready to record audio"
     try:
         sr, y = audio
         duration = len(y) / sr if sr > 0 else 0
         if duration < 0.5:
-            return "Audio too short - please record at least 1 second"
-        elif duration > 60 and not torch.cuda.is_available():
-            return "Long audio detected on CPU - this may take a while..."
         else:
-            device = "GPU" if torch.cuda.is_available() else "CPU"
-            return f"Processing {duration:.1f}s audio on {device}..."
-    except Exception as e:
-        return f"Error analyzing audio: {str(e)}"
-def transcribe_audio(audio):
-    """Main transcription function with better error handling"""
-    if audio is None:
-        return "Please record audio first"
-    if transcriber is None:
-        return "Transcription service not available. Please type your message."
-    try:
-        sr, y = audio
-        # Basic validation
-        if sr is None or y is None or len(y) == 0:
-            return "Invalid audio data. Please try recording again."
-        print(f"=== Starting Transcription ===")
-        print(f"Sample rate: {sr}, Audio length: {len(y)}")
-        result = transcriber.transcribe_numpy(sr, y)
-        print(f"=== Transcription Result ===")
-        print(f"Result: '{result}'")
-        return result
-    except Exception as e:
-        error_msg = f"Unexpected error: {str(e)}"
-        print(f"❌ {error_msg}")
-        return "Failed to process audio. Please try typing your message instead."
 # Function to clean AI response by removing unwanted formatting
 def clean_response(response):
@@ -424,25 +431,25 @@ def tutor_ai_chatbot():
             transcription_status = gr.Textbox(
                 label="Transcription Status",
                 interactive=False,
-                value="Record audio to see status here",
                 max_lines=2
             )
             # Voice recording tips - ONLY in AI Chatbot tab
             with gr.Accordion("Voice Recording Tips", open=False):
                 gr.Markdown("""
-                **For better speech recognition accuracy:**
-                - Speak clearly and at a moderate pace
-                - Record in a quiet environment
-                - Keep the microphone close to your mouth (10-15 cm)
-                - Use a good quality microphone if possible
-                - Review the transcribed text before sending
-                - If transcription is poor, try recording again or type manually
-                **Performance Info:**
-                - GPU: Fast transcription (2-5 seconds)
-                - CPU: Slower but functional (10-30 seconds for longer audio)
-                - Using model: whisper-base.en (optimized for accuracy/speed balance)
                 """)
             # Clear chat history button
@@ -492,7 +499,7 @@ def tutor_ai_chatbot():
                 inputs=audio_input,
                 outputs=msg
             ).then(
-                fn=lambda x: "Transcription completed!" if x and x != "Please record audio first" else "Ready for new recording",
                 inputs=msg,
                 outputs=transcription_status
             )

 import time
 import groq
 import uuid
+import re
 # LangChain imports
 from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
 import docx  # python-docx for Word files
 import gtts  # Google Text-to-Speech library
 from pptx import Presentation  # python-pptx for PowerPoint files
 import torch
 # Set API Key
 groq.api_key = os.getenv("GROQ_API_KEY")
 Feedback: This question tests understanding of Bayes' Theorem by requiring the calculation of conditional probability using the given values.
 """
+# Fixed Whisper Implementation
class FixedWhisperTranscriber:
    """Speech-to-text helper wrapping an automatic-speech-recognition pipeline.

    On construction the first loadable Whisper checkpoint from
    ``MODELS_TO_TRY`` is kept in ``self.model``. ``transcribe_audio`` then
    validates and normalises a ``(sample_rate, samples)`` tuple, runs the
    pipeline, and filters out obviously meaningless results.

    NOTE(review): ``pipeline`` and ``np`` are expected to be imported elsewhere
    in the file (presumably ``transformers.pipeline`` and ``numpy``) - confirm.
    """

    # Checkpoints to try, in order, until one loads.
    MODELS_TO_TRY = (
        "openai/whisper-base",
        "openai/whisper-tiny",
        "openai/whisper-small",
    )

    # Accepted clip length, in seconds.
    MIN_DURATION_S = 0.5
    MAX_DURATION_S = 30

    # One word repeated in a comma-separated run ("oh, oh, oh").  The
    # backreference requires the SAME word each time, so ordinary
    # comma-separated speech ("hello, world") is not rejected.
    _REPEATED_WORD = re.compile(r"^(\w+)(?:,\s*\1)+$")

    # Above this many words, a unique/total ratio below the threshold is
    # treated as repetitive nonsense.
    _REPETITION_MIN_WORDS = 10
    _REPETITION_MIN_UNIQUE_RATIO = 0.3

    def __init__(self):
        """Select the compute device and load the first available model.

        Raises:
            RuntimeError: if none of ``MODELS_TO_TRY`` can be loaded.
        """
        self.device = 0 if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Try multiple models in order
        self.model = self._load_model()

    def _load_model(self):
        """Return a pipeline for the first checkpoint that loads.

        Raises:
            RuntimeError: if every candidate fails; the last load error is
                attached as the cause.
        """
        last_error = None
        for model_name in self.MODELS_TO_TRY:
            try:
                print(f"Trying to load: {model_name}")
                pipe = pipeline(
                    "automatic-speech-recognition",
                    model=model_name,
                    device=self.device,
                )
            except Exception as e:
                # Best effort: report and fall through to the next candidate.
                print(f"❌ Failed to load {model_name}: {e}")
                last_error = e
            else:
                print(f"✅ Successfully loaded: {model_name}")
                return pipe

        raise RuntimeError("All models failed to load") from last_error

    def transcribe_audio(self, audio):
        """Transcribe one recorded clip and return text or a user-facing message.

        Args:
            audio: ``None`` or a ``(sample_rate, samples)`` pair. ``samples``
                is converted with ``np.asarray``; a 2-D array is averaged over
                axis 1 to obtain a single channel.

        Returns:
            The stripped transcription, or a short message explaining why no
            transcription was produced. This method does not raise: any
            exception is reported as ``"Transcription failed: ..."``.
        """
        if audio is None:
            return "Please record audio first"

        try:
            sr, y = audio
            y = np.asarray(y)
            print(f"Audio received - Sample rate: {sr}, Length: {len(y)}")

            # Basic validation
            if len(y) == 0:
                return "Empty audio detected"
            if not sr or sr <= 0:
                # Avoids a raw "division by zero" message further down.
                return "Transcription failed: invalid sample rate"

            # Convert to mono if stereo
            if y.ndim > 1:
                y = np.mean(y, axis=1)

            # Convert to float32 and scale so the peak magnitude is 1
            y = y.astype(np.float32)
            peak = np.max(np.abs(y))
            if peak > 0:
                y = y / peak

            # Check audio length
            audio_duration = len(y) / sr
            print(f"Audio duration: {audio_duration:.2f} seconds")

            if audio_duration < self.MIN_DURATION_S:
                return "Audio too short. Speak for at least 1 second."
            if audio_duration > self.MAX_DURATION_S:
                return "Audio too long. Keep it under 30 seconds."

            # Input format handed to the pipeline
            audio_dict = {"array": y, "sampling_rate": sr}

            print("Starting transcription...")
            result = self.model(audio_dict)
            transcription = result["text"].strip()
            print(f"Raw transcription: '{transcription}'")

            # Filter out garbage outputs
            if self._is_garbage_transcription(transcription):
                return "No clear speech detected. Please try again with clearer audio."

            return transcription

        except Exception as e:
            print(f"Transcription error: {str(e)}")
            return f"Transcription failed: {str(e)}"

    def _is_garbage_transcription(self, text):
        """Return True when ``text`` is empty or looks like meaningless output.

        A transcription is considered garbage when it is empty, when it is one
        word repeated in a comma-separated run (case-insensitive), or when it
        has more than ``_REPETITION_MIN_WORDS`` words of which fewer than
        ``_REPETITION_MIN_UNIQUE_RATIO`` are distinct.
        """
        if not text:
            return True

        text_lower = text.lower().strip()
        if self._REPEATED_WORD.match(text_lower):
            return True

        # Check if it's just repetitive nonsense
        words = text_lower.split()
        if len(words) > self._REPETITION_MIN_WORDS:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < self._REPETITION_MIN_UNIQUE_RATIO:
                return True

        return False
# Build the shared transcriber once at import time.  It stays None when
# construction fails, which transcribe_audio() checks before using it.
transcriber = None
try:
    transcriber = FixedWhisperTranscriber()
except Exception as init_error:
    print(f"Failed to initialize transcriber: {init_error}")
def transcribe_audio(audio):
    """Transcribe ``audio`` with the module-level ``transcriber``.

    Returns the transcriber's result, or a fixed notice when the
    transcriber could not be created (``transcriber`` is ``None``).
    """
    return (
        transcriber.transcribe_audio(audio)
        if transcriber is not None
        else "Speech recognition not available"
    )
 def get_transcription_status(audio):
+    """Status updates"""
     if audio is None:
+        return "Click record to start"
     try:
         sr, y = audio
         duration = len(y) / sr if sr > 0 else 0
         if duration < 0.5:
+            return "Recording... (keep speaking)"
+        elif duration > 10:
+            return "Processing longer audio..."
         else:
+            return "Processing audio..."
+    except:
+        return "Ready to record"
 # Function to clean AI response by removing unwanted formatting
 def clean_response(response):
             transcription_status = gr.Textbox(
                 label="Transcription Status",
                 interactive=False,
+                value="Click record to start",
                 max_lines=2
             )
             # Voice recording tips - ONLY in AI Chatbot tab
             with gr.Accordion("Voice Recording Tips", open=False):
                 gr.Markdown("""
+                **For perfect transcription:**
+                - 🎤 Speak clearly and directly into microphone
+                - 🔇 Record in QUIET environment (no background noise)
+                - 📏 Keep recording between 2-10 seconds
+                - 🗣️ Speak at normal volume and pace
+                - 📱 Use a good quality microphone
+                **If you see 'oh oh oh' errors:**
+                - Your audio might be too noisy
+                - Try recording in a quieter place
+                - Speak more clearly and slowly
+                - Use headphones with microphone
                 """)
             # Clear chat history button
                 inputs=audio_input,
                 outputs=msg
             ).then(
+                fn=lambda x: "Transcription completed!" if x and "failed" not in x.lower() and "error" not in x.lower() and "sorry" not in x.lower() else "Ready for new recording",
                 inputs=msg,
                 outputs=transcription_status
             )