Chia Woon Yap
committed on
Update app.py
app.py
CHANGED
@@ -29,6 +29,10 @@ import gtts  # Google Text-to-Speech library
 from pptx import Presentation  # python-pptx for PowerPoint files
 import re
 
+import torch
+import torchaudio
+from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
+
 # Set API Key
 groq.api_key = os.getenv("GROQ_API_KEY")
 
@@ -246,37 +250,91 @@ def process_document(file):
     except Exception as e:
         return f"Error processing document: {str(e)}"
 
+
+
 # Function to handle speech-to-text conversion
+
+# Initialize Whisper model globally to avoid reloading
+def initialize_whisper_model():
+    """Initialize Whisper model once to improve performance"""
+    try:
+        # Use larger model for better accuracy
+        model_name = "openai/whisper-small.en"  # or "openai/whisper-medium.en" for even better accuracy
+        transcriber = pipeline(
+            "automatic-speech-recognition",
+            model=model_name,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device="cuda" if torch.cuda.is_available() else "cpu"
+        )
+        return transcriber
+    except Exception as e:
+        print(f"Error initializing Whisper model: {e}")
+        # Fallback to base model
+        return pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
+
+# Initialize model once
+whisper_model = initialize_whisper_model()
+
 def transcribe_audio(audio):
-    """
+    """Enhanced speech-to-text transcription with better preprocessing"""
     if audio is None:
         return "Please record audio first"
 
     try:
         sr, y = audio
 
-        #
+        # Enhanced audio preprocessing
         if y.ndim > 1:
             y = y.mean(axis=1)  # Convert to mono
 
+        # Convert to proper data type
         y = y.astype(np.float32)
+
+        # Normalize audio
         max_val = np.max(np.abs(y))
         if max_val > 0:
             y = y / max_val
 
-        #
-
-
-
+        # Remove silence (simple threshold-based)
+        silence_threshold = 0.01
+        non_silent_indices = np.where(np.abs(y) > silence_threshold)[0]
+
+        if len(non_silent_indices) == 0:
+            return "No speech detected. Please speak louder or check your microphone."
+
+        # Trim silence from beginning and end
+        start_idx = non_silent_indices[0]
+        end_idx = non_silent_indices[-1]
+        y_trimmed = y[start_idx:end_idx+1]
+
+        # Check if audio is too short
+        if len(y_trimmed) / sr < 0.5:  # Less than 0.5 seconds
+            return "Audio too short. Please speak for at least 1-2 seconds."
+
+        # Enhanced transcription with better parameters
+        result = whisper_model(
+            {
+                "sampling_rate": sr,
+                "raw": y_trimmed
+            },
+            return_timestamps=False,
+            generate_kwargs={
+                "task": "transcribe",
+                "language": "en"
+            }
         )
 
-        result = transcriber({"sampling_rate": sr, "raw": y})
         text = result["text"].strip()
 
-
+        if not text or text.lower() in ["", "you", "thank you"]:
+            return "No clear speech detected. Try speaking more clearly or in a quieter environment."
+
+        return text
 
     except Exception as e:
-
+        error_msg = f"Transcription error: {str(e)}"
+        print(error_msg)
+        return f"Sorry, I couldn't process the audio. Please try again or type your message instead. Error: {str(e)}"
 
 # Clear chat history function
 def clear_chat_history():