Hindi-Rag

Sleeping

App Files Files Community

wellwisherofindia committed on Jun 29, 2025

Commit

e886781

1 Parent(s): cab6fc3

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -44

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 """
-Hindi RAG Voice Demo - Gradio Implementation (No OCR Version)
 A streamlined voice-enabled RAG system for Hindi content using Gradio
-Assumes PDFs have selectable text - no OCR processing
 """
 import gradio as gr
@@ -17,8 +17,9 @@ import json
 import numpy as np
 from sentence_transformers import SentenceTransformer
 import faiss
-import whisper
 from gtts import gTTS
 import warnings
 warnings.filterwarnings("ignore")
@@ -29,6 +30,7 @@ CONFIG = {
     'MAX_QUERIES_PER_SESSION': 5,
     'MAX_AUDIO_DURATION': 120,  # 2 minutes
     'GROQ_API_KEY': os.getenv('GAPI'),
 }
 # Global session storage
@@ -41,21 +43,112 @@ SESSION_DATA = {
     'author_name': '',
     'book_title': '',
     'embedding_model': None,
-    'whisper_model': None
 }
-# Initialize models (cached)
 def load_models():
-    """Load and cache models"""
     if SESSION_DATA['embedding_model'] is None:
         print("Loading embedding model...")
         SESSION_DATA['embedding_model'] = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
-    if SESSION_DATA['whisper_model'] is None:
-        print("Loading Whisper model...")
-        SESSION_DATA['whisper_model'] = whisper.load_model("base")
-    return SESSION_DATA['embedding_model'], SESSION_DATA['whisper_model']
 # Text extraction functions
 def extract_text_from_pdf(pdf_path):
@@ -209,35 +302,6 @@ def generate_rag_response(query, context_chunks):
     response = call_groq_api(prompt)
     return response
-# Audio processing functions
-def transcribe_audio(audio_file):
-    """Transcribe audio using Whisper"""
-    if audio_file is None:
-        return ""
-    try:
-        _, whisper_model = load_models()
-        result = whisper_model.transcribe(audio_file, language="hi")
-        return result["text"]
-    except Exception as e:
-        return f"Transcription error: {str(e)}"
-def text_to_speech(text):
-    """Convert text to speech in Hindi"""
-    if not text or len(text.strip()) == 0:
-        return None
-    try:
-        tts = gTTS(text=text, lang='hi', slow=False)
-        # Save to temporary file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-            tts.save(tmp_file.name)
-            return tmp_file.name
-    except Exception as e:
-        print(f"TTS Error: {str(e)}")
-        return None
 # Authentication function
 def authenticate(passcode):
     """Check passcode authentication"""
@@ -370,7 +434,7 @@ def create_interface():
     """Create the Gradio interface"""
     with gr.Blocks(
-        title="Hindi RAG Voice Demo",
         theme=gr.themes.Soft(),
         css="""
         .main-header { text-align: center; color: #2E86AB; margin-bottom: 2rem; }
@@ -381,10 +445,10 @@ def create_interface():
         gr.HTML("""
         <div class="main-header">
-            <h1>📚 Hindi RAG Voice Demo</h1>
             <h3>हिंदी पुस्तक आवाज़ सहायक</h3>
-            <p>AI-powered interactive book assistant for Indian authors</p>
-            <p><em>Optimized for PDFs with selectable text</em></p>
         </div>
         """)
@@ -436,6 +500,7 @@ def create_interface():
             # Query section
             with gr.Group(visible=False) as query_section:
                 gr.Markdown("### 🎤 Step 2: Ask Questions / प्रश्न पूछें")
                 with gr.Row():
                     with gr.Column():
@@ -478,7 +543,9 @@ def create_interface():
                     - PDF with selectable text (no scanned images)
                     - Max file size: 10MB
                     - Max queries: 5 per session
                     - Supported: Hindi & English text
                     """)
         # Event handlers
@@ -513,7 +580,7 @@ def create_interface():
 # Main function
 def main():
     """Main function to launch the application"""
-    print("🚀 Starting Hindi RAG Voice Demo (No OCR Version)...")
     print("📋 Loading AI models (this may take a moment)...")
     # Pre-load models

 #!/usr/bin/env python3
 """
+Hindi RAG Voice Demo - Gradio Implementation (Groq Whisper API Version)
 A streamlined voice-enabled RAG system for Hindi content using Gradio
+Uses Groq Whisper API for transcription and assumes PDFs have selectable text
 """
 import gradio as gr
 import numpy as np
 from sentence_transformers import SentenceTransformer
 import faiss
+from groq import Groq
 from gtts import gTTS
+import subprocess
 import warnings
 warnings.filterwarnings("ignore")
     'MAX_QUERIES_PER_SESSION': 5,
     'MAX_AUDIO_DURATION': 120,  # 2 minutes
     'GROQ_API_KEY': os.getenv('GAPI'),
+    'AUDIO_CLIP_DURATION': 10,  # First 10 seconds only
 }
 # Global session storage
     'author_name': '',
     'book_title': '',
     'embedding_model': None,
+    'groq_client': None
 }
# Initialize models and clients (cached)
def load_models():
    """Load the embedding model and the Groq client once and cache them.

    Both objects are stored in the module-level ``SESSION_DATA`` dict, so
    repeated calls reuse the instances created by the first call.

    Returns:
        tuple: ``(embedding_model, groq_client)``. ``groq_client`` stays
        ``None`` when ``CONFIG['GROQ_API_KEY']`` is empty.
    """
    if SESSION_DATA['embedding_model'] is None:
        print("Loading embedding model...")
        SESSION_DATA['embedding_model'] = SentenceTransformer(
            'paraphrase-multilingual-MiniLM-L12-v2'
        )

    if SESSION_DATA['groq_client'] is None:
        if CONFIG['GROQ_API_KEY']:
            print("Initializing Groq client...")
            SESSION_DATA['groq_client'] = Groq(api_key=CONFIG['GROQ_API_KEY'])
        else:
            # CONFIG['GROQ_API_KEY'] is populated from the GAPI environment
            # variable, so name that variable for whoever reads the log.
            print("Warning: GROQ_API_KEY not found (set the GAPI environment variable)")

    return SESSION_DATA['embedding_model'], SESSION_DATA['groq_client']
# Audio processing functions
def trim_audio_to_duration(input_path, output_path, duration=10, timeout=30):
    """Write the first ``duration`` seconds of an audio file using ffmpeg.

    The audio stream is copied without re-encoding, and an existing
    ``output_path`` is overwritten.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the trimmed audio is written to.
        duration: Number of seconds to keep from the start of the input.
        timeout: Maximum number of seconds to wait for ffmpeg before
            giving up; ``None`` waits indefinitely.

    Returns:
        bool: ``True`` when ffmpeg exits with status 0, ``False`` when it
        fails, times out, or cannot be started (for example when ffmpeg
        is not installed).
    """
    cmd = [
        'ffmpeg',
        '-nostdin',  # never wait for interactive input on a server
        '-i', input_path,
        '-t', str(duration),
        # NOTE(review): stream copy keeps the input codec; confirm that the
        # codecs callers pass in are valid for the chosen output container.
        '-acodec', 'copy',
        '-y',  # Overwrite output file
        output_path,
    ]
    try:
        result = subprocess.run(
            cmd,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            errors="replace",  # ffmpeg diagnostics may not be valid text
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"Error trimming audio: ffmpeg timed out after {timeout} seconds")
        return False
    except (OSError, TypeError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing ffmpeg binary; TypeError/ValueError cover
        # unusable path arguments.
        print(f"Error trimming audio: {str(e)}")
        return False

    if result.returncode != 0:
        print(f"FFmpeg error: {result.stderr}")
        return False
    return True
def transcribe_audio(audio_file):
    """Transcribe the start of an audio file with the Groq Whisper API.

    The audio is first trimmed to ``CONFIG['AUDIO_CLIP_DURATION']`` seconds
    into a temporary file. If trimming fails, the original file is sent
    in full instead.

    Args:
        audio_file: Path of the recorded or uploaded audio, or ``None``.

    Returns:
        str: The transcribed text; ``""`` when ``audio_file`` is ``None``;
        otherwise a message starting with ``"Error:"`` or
        ``"Transcription error:"`` describing what went wrong.
    """
    if audio_file is None:
        return ""

    client = SESSION_DATA['groq_client']
    if not CONFIG['GROQ_API_KEY'] or client is None:
        return "Error: Groq API key not configured"

    tmp_path = None
    try:
        # Reserve a path for the trimmed clip; ffmpeg overwrites it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name

        source_path = tmp_path
        if not trim_audio_to_duration(audio_file, tmp_path, CONFIG['AUDIO_CLIP_DURATION']):
            # If trimming fails, fall back to the original file.
            print("Warning: Could not trim audio, using full duration")
            source_path = audio_file

        with open(source_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(source_path), file.read()),
                model="whisper-large-v3",
                response_format="verbose_json",
                language="hi"  # Specify Hindi language
            )
        return transcription.text
    except Exception as e:
        # Callers display the returned string, so report instead of raising.
        return f"Transcription error: {str(e)}"
    finally:
        # Always remove the temporary clip, including when trimming failed
        # and the original file was used; never delete the caller's file.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
def text_to_speech(text):
    """Convert text to Hindi speech and save it as a temporary MP3 file.

    Args:
        text: The text to speak.

    Returns:
        str | None: Path of the generated ``.mp3`` file, or ``None`` when
        ``text`` is empty or whitespace-only, or when synthesis fails. The
        file is not deleted by this function.
    """
    if not text or not text.strip():
        return None

    mp3_path = None
    try:
        tts = gTTS(text=text, lang='hi', slow=False)
        # Reserve a path and close the handle before gTTS writes to it, so
        # the file is never open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            mp3_path = tmp_file.name
        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"TTS Error: {str(e)}")
        # Do not leave an empty or partial file behind on failure.
        if mp3_path is not None:
            try:
                os.unlink(mp3_path)
            except OSError:
                pass
        return None
 # Text extraction functions
 def extract_text_from_pdf(pdf_path):
     response = call_groq_api(prompt)
     return response
 # Authentication function
 def authenticate(passcode):
     """Check passcode authentication"""
     """Create the Gradio interface"""
     with gr.Blocks(
+        title="Hindi RAG Voice Demo - Groq Whisper",
         theme=gr.themes.Soft(),
         css="""
         .main-header { text-align: center; color: #2E86AB; margin-bottom: 2rem; }
         gr.HTML("""
         <div class="main-header">
+            <h1>📚 Hindi RAG Voice Demo - Groq Whisper</h1>
             <h3>हिंदी पुस्तक आवाज़ सहायक</h3>
+            <p>AI-powered interactive book assistant with Groq Whisper API</p>
+            <p><em>Audio transcription limited to first 10 seconds</em></p>
         </div>
         """)
             # Query section
             with gr.Group(visible=False) as query_section:
                 gr.Markdown("### 🎤 Step 2: Ask Questions / प्रश्न पूछें")
+                gr.Markdown("**Note:** Audio recordings are limited to first 10 seconds for transcription")
                 with gr.Row():
                     with gr.Column():
                     - PDF with selectable text (no scanned images)
                     - Max file size: 10MB
                     - Max queries: 5 per session
+                    - Audio transcription: First 10 seconds only
                     - Supported: Hindi & English text
+                    - Requires: Groq API key and ffmpeg
                     """)
         # Event handlers
 # Main function
 def main():
     """Main function to launch the application"""
+    print("🚀 Starting Hindi RAG Voice Demo (Groq Whisper API Version)...")
     print("📋 Loading AI models (this may take a moment)...")
     # Pre-load models