wellwisherofindia committed on
Commit
e886781
·
1 Parent(s): cab6fc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -44
app.py CHANGED
@@ -1,8 +1,8 @@
1
  #!/usr/bin/env python3
2
  """
3
- Hindi RAG Voice Demo - Gradio Implementation (No OCR Version)
4
  A streamlined voice-enabled RAG system for Hindi content using Gradio
5
- Assumes PDFs have selectable text - no OCR processing
6
  """
7
 
8
  import gradio as gr
@@ -17,8 +17,9 @@ import json
17
  import numpy as np
18
  from sentence_transformers import SentenceTransformer
19
  import faiss
20
- import whisper
21
  from gtts import gTTS
 
22
  import warnings
23
  warnings.filterwarnings("ignore")
24
 
@@ -29,6 +30,7 @@ CONFIG = {
29
  'MAX_QUERIES_PER_SESSION': 5,
30
  'MAX_AUDIO_DURATION': 120, # 2 minutes
31
  'GROQ_API_KEY': os.getenv('GAPI'),
 
32
  }
33
 
34
  # Global session storage
@@ -41,21 +43,112 @@ SESSION_DATA = {
41
  'author_name': '',
42
  'book_title': '',
43
  'embedding_model': None,
44
- 'whisper_model': None
45
  }
46
 
47
- # Initialize models (cached)
48
  def load_models():
49
- """Load and cache models"""
50
  if SESSION_DATA['embedding_model'] is None:
51
  print("Loading embedding model...")
52
  SESSION_DATA['embedding_model'] = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
53
 
54
- if SESSION_DATA['whisper_model'] is None:
55
- print("Loading Whisper model...")
56
- SESSION_DATA['whisper_model'] = whisper.load_model("base")
 
 
 
57
 
58
- return SESSION_DATA['embedding_model'], SESSION_DATA['whisper_model']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # Text extraction functions
61
  def extract_text_from_pdf(pdf_path):
@@ -209,35 +302,6 @@ def generate_rag_response(query, context_chunks):
209
  response = call_groq_api(prompt)
210
  return response
211
 
212
- # Audio processing functions
213
- def transcribe_audio(audio_file):
214
- """Transcribe audio using Whisper"""
215
- if audio_file is None:
216
- return ""
217
-
218
- try:
219
- _, whisper_model = load_models()
220
- result = whisper_model.transcribe(audio_file, language="hi")
221
- return result["text"]
222
- except Exception as e:
223
- return f"Transcription error: {str(e)}"
224
-
225
- def text_to_speech(text):
226
- """Convert text to speech in Hindi"""
227
- if not text or len(text.strip()) == 0:
228
- return None
229
-
230
- try:
231
- tts = gTTS(text=text, lang='hi', slow=False)
232
-
233
- # Save to temporary file
234
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
235
- tts.save(tmp_file.name)
236
- return tmp_file.name
237
- except Exception as e:
238
- print(f"TTS Error: {str(e)}")
239
- return None
240
-
241
  # Authentication function
242
  def authenticate(passcode):
243
  """Check passcode authentication"""
@@ -370,7 +434,7 @@ def create_interface():
370
  """Create the Gradio interface"""
371
 
372
  with gr.Blocks(
373
- title="Hindi RAG Voice Demo",
374
  theme=gr.themes.Soft(),
375
  css="""
376
  .main-header { text-align: center; color: #2E86AB; margin-bottom: 2rem; }
@@ -381,10 +445,10 @@ def create_interface():
381
 
382
  gr.HTML("""
383
  <div class="main-header">
384
- <h1>📚 Hindi RAG Voice Demo</h1>
385
  <h3>हिंदी पुस्तक आवाज़ सहायक</h3>
386
- <p>AI-powered interactive book assistant for Indian authors</p>
387
- <p><em>Optimized for PDFs with selectable text</em></p>
388
  </div>
389
  """)
390
 
@@ -436,6 +500,7 @@ def create_interface():
436
  # Query section
437
  with gr.Group(visible=False) as query_section:
438
  gr.Markdown("### 🎤 Step 2: Ask Questions / प्रश्न पूछें")
 
439
 
440
  with gr.Row():
441
  with gr.Column():
@@ -478,7 +543,9 @@ def create_interface():
478
  - PDF with selectable text (no scanned images)
479
  - Max file size: 10MB
480
  - Max queries: 5 per session
 
481
  - Supported: Hindi & English text
 
482
  """)
483
 
484
  # Event handlers
@@ -513,7 +580,7 @@ def create_interface():
513
  # Main function
514
  def main():
515
  """Main function to launch the application"""
516
- print("🚀 Starting Hindi RAG Voice Demo (No OCR Version)...")
517
  print("📋 Loading AI models (this may take a moment)...")
518
 
519
  # Pre-load models
 
1
  #!/usr/bin/env python3
2
  """
3
+ Hindi RAG Voice Demo - Gradio Implementation (Groq Whisper API Version)
4
  A streamlined voice-enabled RAG system for Hindi content using Gradio
5
+ Uses Groq Whisper API for transcription and assumes PDFs have selectable text
6
  """
7
 
8
  import gradio as gr
 
17
  import numpy as np
18
  from sentence_transformers import SentenceTransformer
19
  import faiss
20
+ from groq import Groq
21
  from gtts import gTTS
22
+ import subprocess
23
  import warnings
24
  warnings.filterwarnings("ignore")
25
 
 
30
  'MAX_QUERIES_PER_SESSION': 5,
31
  'MAX_AUDIO_DURATION': 120, # 2 minutes
32
  'GROQ_API_KEY': os.getenv('GAPI'),
33
+ 'AUDIO_CLIP_DURATION': 10, # First 10 seconds only
34
  }
35
 
36
  # Global session storage
 
43
  'author_name': '',
44
  'book_title': '',
45
  'embedding_model': None,
46
+ 'groq_client': None
47
  }
48
 
49
# Lazily create the shared embedding model and Groq client (cached in SESSION_DATA)
def load_models():
    """Return the cached embedding model and Groq client, creating them on first use.

    The embedding model is built once and stored in ``SESSION_DATA``.  The
    Groq client is only created when ``CONFIG['GROQ_API_KEY']`` is set;
    otherwise a warning is printed and the cached client stays ``None``.

    Returns:
        A ``(embedding_model, groq_client)`` tuple; ``groq_client`` may be
        ``None`` when no API key is configured.
    """
    session = SESSION_DATA

    if session['embedding_model'] is None:
        print("Loading embedding model...")
        encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        session['embedding_model'] = encoder

    if session['groq_client'] is None:
        api_key = CONFIG['GROQ_API_KEY']
        if not api_key:
            print("Warning: GROQ_API_KEY not found")
        else:
            print("Initializing Groq client...")
            session['groq_client'] = Groq(api_key=api_key)

    return session['embedding_model'], session['groq_client']
64
+
65
+ # Audio processing functions
66
def trim_audio_to_duration(input_path, output_path, duration=10):
    """Write the first ``duration`` seconds of an audio file using ffmpeg.

    The audio is re-encoded with ffmpeg's default codec for the output
    container instead of being stream-copied, so the trim also works when
    the source codec cannot be stored in the output format (for example a
    compressed browser recording written to a ``.wav`` file).

    Args:
        input_path: Path of the source audio file.
        output_path: Path the trimmed audio is written to (overwritten).
        duration: Length of the clip to keep, in seconds.

    Returns:
        ``True`` when ffmpeg exited successfully, ``False`` otherwise
        (invalid arguments, ffmpeg missing, timeout, or a non-zero exit).
    """
    # Reject a non-positive numeric duration up front: it cannot yield audio.
    if isinstance(duration, (int, float)) and duration <= 0:
        print(f"Error trimming audio: invalid duration {duration}")
        return False

    if not input_path or not os.path.isfile(input_path):
        print(f"Error trimming audio: input file not found: {input_path}")
        return False

    cmd = [
        'ffmpeg',
        '-nostdin',  # never wait for interactive input
        '-y',  # Overwrite output file
        '-i', input_path,
        '-t', str(duration),
        '-vn',  # audio only
        output_path,
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors='replace',  # ffmpeg diagnostics are not guaranteed to be UTF-8
            timeout=60,  # do not let a stuck ffmpeg block the request
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: ffmpeg is not installed/executable; SubprocessError: timeout.
        print(f"Error trimming audio: {str(e)}")
        return False

    if result.returncode != 0:
        print(f"FFmpeg error: {result.stderr}")
        return False

    return True
89
+
90
def transcribe_audio(audio_file):
    """Transcribe a recording with the Groq Whisper API.

    Only the first ``CONFIG['AUDIO_CLIP_DURATION']`` seconds are sent: the
    recording is trimmed into a temporary WAV file first.  If trimming
    fails, the original file is uploaded instead.  The temporary file is
    always removed, whichever path is taken.

    Args:
        audio_file: Path of the recorded audio file, or ``None`` when
            nothing was recorded.

    Returns:
        The transcribed text; ``""`` when ``audio_file`` is ``None``; or a
        message starting with ``"Error:"`` / ``"Transcription error:"``
        when the client is not configured or the request fails.
    """
    if audio_file is None:
        return ""

    if not CONFIG['GROQ_API_KEY'] or SESSION_DATA['groq_client'] is None:
        return "Error: Groq API key not configured"

    client = SESSION_DATA['groq_client']
    temp_clip_path = None

    try:
        # Reserve a path for the trimmed clip; ffmpeg overwrites the empty file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            temp_clip_path = tmp_file.name

        # Keep the temp path in its own variable so it can still be deleted
        # when we fall back to uploading the original recording.
        if trim_audio_to_duration(audio_file, temp_clip_path, CONFIG['AUDIO_CLIP_DURATION']):
            upload_path = temp_clip_path
        else:
            print("Warning: Could not trim audio, using full duration")
            upload_path = audio_file

        with open(upload_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(upload_path), file.read()),
                model="whisper-large-v3",
                response_format="verbose_json",
                language="hi",  # Specify Hindi language
            )

        return transcription.text

    except Exception as e:
        # Boundary for the UI: report the failure as text instead of raising.
        return f"Transcription error: {str(e)}"

    finally:
        # Best-effort cleanup of the temporary clip on every exit path.
        if temp_clip_path is not None:
            try:
                os.unlink(temp_clip_path)
            except OSError:
                pass
136
+
137
def text_to_speech(text):
    """Convert text to Hindi speech and return the path of the MP3 file.

    Args:
        text: The text to speak.  ``None``, empty, or whitespace-only text
            produces no audio.

    Returns:
        Path of a temporary ``.mp3`` file containing the speech, or ``None``
        when there is nothing to speak or synthesis fails.  The caller owns
        the returned file.
    """
    if not text or not text.strip():
        return None

    mp3_path = None
    try:
        tts = gTTS(text=text, lang='hi', slow=False)

        # Reserve a temp path and close the handle before gTTS writes to it;
        # writing to a file that is still open fails on some platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            mp3_path = tmp_file.name

        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"TTS Error: {str(e)}")
        # Do not leave an empty/partial temp file behind when synthesis fails.
        if mp3_path is not None:
            try:
                os.unlink(mp3_path)
            except OSError:
                pass
        return None
152
 
153
  # Text extraction functions
154
  def extract_text_from_pdf(pdf_path):
 
302
  response = call_groq_api(prompt)
303
  return response
304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  # Authentication function
306
  def authenticate(passcode):
307
  """Check passcode authentication"""
 
434
  """Create the Gradio interface"""
435
 
436
  with gr.Blocks(
437
+ title="Hindi RAG Voice Demo - Groq Whisper",
438
  theme=gr.themes.Soft(),
439
  css="""
440
  .main-header { text-align: center; color: #2E86AB; margin-bottom: 2rem; }
 
445
 
446
  gr.HTML("""
447
  <div class="main-header">
448
+ <h1>📚 Hindi RAG Voice Demo - Groq Whisper</h1>
449
  <h3>हिंदी पुस्तक आवाज़ सहायक</h3>
450
+ <p>AI-powered interactive book assistant with Groq Whisper API</p>
451
+ <p><em>Audio transcription limited to first 10 seconds</em></p>
452
  </div>
453
  """)
454
 
 
500
  # Query section
501
  with gr.Group(visible=False) as query_section:
502
  gr.Markdown("### 🎤 Step 2: Ask Questions / प्रश्न पूछें")
503
+ gr.Markdown("**Note:** Audio recordings are limited to first 10 seconds for transcription")
504
 
505
  with gr.Row():
506
  with gr.Column():
 
543
  - PDF with selectable text (no scanned images)
544
  - Max file size: 10MB
545
  - Max queries: 5 per session
546
+ - Audio transcription: First 10 seconds only
547
  - Supported: Hindi & English text
548
+ - Requires: Groq API key and ffmpeg
549
  """)
550
 
551
  # Event handlers
 
580
  # Main function
581
  def main():
582
  """Main function to launch the application"""
583
+ print("🚀 Starting Hindi RAG Voice Demo (Groq Whisper API Version)...")
584
  print("📋 Loading AI models (this may take a moment)...")
585
 
586
  # Pre-load models