"""Hindi Book Assistant: a Gradio app for asking questions about a PDF book.

Pipeline: a PDF is uploaded, its selectable text is extracted with PyMuPDF,
split into overlapping word chunks, embedded with a multilingual
SentenceTransformer model and indexed in FAISS.  Questions (typed, spoken or
picked from a predefined list) are answered by retrieving the most similar
chunks and sending them to a Groq-hosted LLM; the answer is also synthesised
to speech with gTTS.
"""

import hmac
import json
import os
import re
import subprocess
import tempfile
import time
import uuid
import warnings
from datetime import datetime

import gradio as gr
import fitz
import requests
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from groq import Groq
from gtts import gTTS

warnings.filterwarnings("ignore")

CONFIG = {
    'PASSCODE': os.getenv('PASSCODE'),
    'MAX_FILE_SIZE': 10 * 1024 * 1024,
    'MAX_QUERIES_PER_SESSION': 10,
    'MAX_AUDIO_DURATION': 120,
    # The key was historically read from GAPI only, while the user-facing
    # error message asks for GROQ_API_KEY; accept either so both work.
    'GROQ_API_KEY': os.getenv('GAPI') or os.getenv('GROQ_API_KEY'),
    'AUDIO_CLIP_DURATION': 10,
    'BOOK_THUMBNAILS_DIR': './book_thumbnails',
    'OCR_BOOKS_DIR': './ocr_books',
}

# NOTE(review): this dictionary is module-level state, so every browser
# session served by this process shares the same document, index and query
# counter.  Per-user isolation would require moving it into gr.State.
SESSION_DATA = {
    'authenticated': False,
    'session_id': str(uuid.uuid4()),
    'query_count': 0,
    'document_chunks': [],
    'faiss_index': None,
    'author_name': '',
    'book_title': '',
    'embedding_model': None,
    'groq_client': None
}

# Predefined questions for books
PREDEFINED_QUESTIONS = {
    'general': [
        "इस पुस्तक का मुख्य विषय क्या है?",
        "लेखक ने इस पुस्तक में क्या संदेश दिया है?",
        "इस पुस्तक में कौन से मुख्य पात्र हैं?"
    ],
    'analysis': [
        "इस पुस्तक की मुख्य शिक्षा क्या है?",
        "लेखक की लेखन शैली कैसी है?",
        "इस पुस्तक में कौन सा मुख्य संघर्ष है?"
    ],
    'content': [
        "इस कहानी का क्या अंत है?",
        "पुस्तक में कौन सी मुख्य घटनाएं हैं?",
        "मुख्य पात्र का चरित्र कैसा है?"
    ]
}

# Prefixes of the error strings returned (instead of raised) by
# extract_text_from_pdf() and transcribe_audio().  Matching on the prefix
# rather than on the bare word "error" keeps legitimate text that merely
# contains that word from being treated as a failure.
_EXTRACTION_ERROR_PREFIXES = ("Error: ", "Error extracting text: ")
_TRANSCRIPTION_ERROR_PREFIXES = ("Error: ", "Transcription error: ")

# Author markers.  The Devanagari markers are matched as substrings; the
# ASCII ones are matched as whole words so that e.g. "hobby" or "nearby"
# does not count as the marker "by".
_AUTHOR_MARKERS_HINDI = ('लेखक', 'द्वारा', 'रचयिता')
_AUTHOR_MARKERS_ASCII = re.compile(r'\b(?:authors?|by)\b')


def _remove_quietly(path):
    """Delete *path*, ignoring the case where it is already gone or locked."""
    try:
        os.unlink(path)
    except OSError:
        pass


def load_models():
    """Lazily create and cache the embedding model and the Groq client.

    Returns:
        A ``(embedding_model, groq_client)`` tuple.  ``groq_client`` is
        ``None`` when no API key is configured.
    """
    if SESSION_DATA['embedding_model'] is None:
        print("Loading embedding model...")
        SESSION_DATA['embedding_model'] = SentenceTransformer(
            'paraphrase-multilingual-MiniLM-L12-v2'
        )
    if SESSION_DATA['groq_client'] is None:
        if CONFIG['GROQ_API_KEY']:
            print("Initializing Groq client...")
            SESSION_DATA['groq_client'] = Groq(api_key=CONFIG['GROQ_API_KEY'])
        else:
            print("Warning: GROQ_API_KEY not found")
    return SESSION_DATA['embedding_model'], SESSION_DATA['groq_client']


def trim_audio_to_duration(input_path, output_path, duration=10):
    """Copy the first *duration* seconds of *input_path* to *output_path*.

    Runs ``ffmpeg`` with stream copy (no re-encoding) and overwrites
    *output_path* if it exists.

    Returns:
        ``True`` when ffmpeg exits with status 0, ``False`` otherwise
        (including when ffmpeg cannot be started).
    """
    try:
        cmd = [
            'ffmpeg', '-i', input_path,
            '-t', str(duration),
            '-acodec', 'copy',
            '-y', output_path
        ]
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode == 0:
            return True
        print(f"FFmpeg error: {result.stderr}")
        return False
    except Exception as e:
        print(f"Error trimming audio: {str(e)}")
        return False


def transcribe_audio(audio_file):
    """Transcribe a recorded question to Hindi text with Groq Whisper.

    The recording is first trimmed to ``CONFIG['AUDIO_CLIP_DURATION']``
    seconds; if trimming fails the full recording is sent instead.

    Args:
        audio_file: Path of the recorded audio file, or ``None``.

    Returns:
        The transcription, ``""`` when *audio_file* is ``None``, or a
        message starting with ``"Error: "`` / ``"Transcription error: "``
        on failure.
    """
    if audio_file is None:
        return ""
    client = SESSION_DATA['groq_client']
    if not CONFIG['GROQ_API_KEY'] or client is None:
        return "Error: Groq API key not configured"

    # Tracked separately from the path that is uploaded so the temporary
    # file is removed even when trimming fails and the original is used.
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            temp_path = tmp_file.name

        if trim_audio_to_duration(audio_file, temp_path, CONFIG['AUDIO_CLIP_DURATION']):
            upload_path = temp_path
        else:
            print("Warning: Could not trim audio, using full duration")
            upload_path = audio_file

        with open(upload_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(upload_path), file.read()),
                model="whisper-large-v3",
                response_format="verbose_json",
                language="hi"
            )
        return transcription.text
    except Exception as e:
        return f"Transcription error: {str(e)}"
    finally:
        if temp_path is not None:
            _remove_quietly(temp_path)


def text_to_speech(text):
    """Synthesise *text* as Hindi speech.

    Returns:
        Path of a newly created ``.mp3`` file, or ``None`` when *text* is
        empty/blank or synthesis fails.  The caller owns the file.
    """
    if not text or not text.strip():
        return None

    mp3_path = None
    try:
        tts = gTTS(text=text, lang='hi', slow=False)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            mp3_path = tmp_file.name
        # Save only after our own handle is closed, so the file is not
        # opened twice at the same time.
        tts.save(mp3_path)
        return mp3_path
    except Exception as e:
        print(f"TTS Error: {str(e)}")
        if mp3_path is not None:
            # Do not leave an empty or partial file behind on failure.
            _remove_quietly(mp3_path)
        return None


def extract_text_from_pdf(pdf_path):
    """Return the selectable text of every page of the PDF at *pdf_path*.

    Pages with only whitespace are skipped; each kept page is followed by a
    newline.

    Returns:
        The concatenated text, or a message starting with ``"Error"`` when
        the PDF has no selectable text or cannot be read.
    """
    try:
        pdf_document = fitz.open(pdf_path)
        try:
            total_pages = len(pdf_document)
            print(f"Processing PDF with {total_pages} pages...")
            page_texts = []
            for page_num in range(total_pages):
                page_text = pdf_document.load_page(page_num).get_text()
                if page_text.strip():
                    page_texts.append(page_text + "\n")
        finally:
            # Close the document even if a page fails to load.
            pdf_document.close()

        text_content = "".join(page_texts)
        if not text_content.strip():
            return "Error: No selectable text found in PDF. Please ensure the PDF contains selectable text, not just images."
        return text_content
    except Exception as e:
        return f"Error extracting text: {str(e)}"


def extract_metadata(text):
    """Guess the author line and the title from the start of *text*.

    Only non-blank lines among the first 25 lines are considered.  A line
    containing an author marker becomes the author (the last such line
    wins).  The first other line that is 11-99 characters long and has no
    digit in its first 20 characters becomes the title.

    Returns:
        An ``(author_name, book_title)`` tuple; Hindi placeholders are
        used for whatever could not be found.
    """
    lines = [line.strip() for line in text.split('\n')[:25] if line.strip()]
    author_name = "अज्ञात लेखक"
    book_title = "अनाम पुस्तक"
    title_found = False

    for line in lines:
        lowered = line.lower()
        is_author_line = (
            any(marker in lowered for marker in _AUTHOR_MARKERS_HINDI)
            or _AUTHOR_MARKERS_ASCII.search(lowered) is not None
        )
        if is_author_line:
            author_name = line
        elif (
            not title_found
            and 10 < len(line) < 100
            and not any(char.isdigit() for char in line[:20])
        ):
            book_title = line
            title_found = True

    return author_name, book_title


def chunk_text(text, chunk_size=400, overlap=50):
    """Split *text* into chunks of up to *chunk_size* whitespace-separated words.

    Consecutive chunks share *overlap* words.  Chunking stops as soon as a
    chunk reaches the end of the text, so no trailing chunk is produced
    that lies entirely inside the previous one.

    Raises:
        ValueError: if *chunk_size* is not positive or *overlap* is not
            smaller than *chunk_size* (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            break
    return chunks


def create_embeddings(chunks):
    """Embed *chunks* and return a FAISS inner-product index over them.

    Vectors are L2-normalised before being added, so inner-product scores
    are cosine similarities.

    Raises:
        ValueError: if *chunks* is empty.
    """
    if not chunks:
        raise ValueError("Cannot build an index from an empty list of chunks")

    embedding_model, _ = load_models()
    embeddings = embedding_model.encode(chunks, show_progress_bar=False)
    # Convert to contiguous float32 *before* normalising: FAISS normalises
    # in place and works on float32 buffers.
    embeddings = np.ascontiguousarray(embeddings, dtype='float32')
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index


def search_similar_chunks(query, top_k=3):
    """Return up to *top_k* stored chunks most similar to *query*.

    Returns:
        A list of ``{'text': str, 'score': float}`` dictionaries in the
        order returned by FAISS; empty when no document is indexed.
    """
    if SESSION_DATA['faiss_index'] is None or not SESSION_DATA['document_chunks']:
        return []

    embedding_model, _ = load_models()
    query_embedding = embedding_model.encode([query], show_progress_bar=False)
    query_embedding = np.ascontiguousarray(query_embedding, dtype='float32')
    faiss.normalize_L2(query_embedding)
    scores, indices = SESSION_DATA['faiss_index'].search(query_embedding, top_k)

    chunks = SESSION_DATA['document_chunks']
    results = []
    for position, idx in enumerate(indices[0]):
        # Skip negative or out-of-range ids rather than indexing with them.
        if 0 <= idx < len(chunks):
            results.append({
                'text': chunks[idx],
                'score': float(scores[0][position])
            })
    return results


def call_groq_api(prompt, model="llama-3.1-8b-instant"):
    """Send *prompt* as a single user message to the Groq chat endpoint.

    Returns:
        The content of the first choice, or a human-readable error string
        when the key is missing or the request fails.
    """
    if not CONFIG['GROQ_API_KEY'] or CONFIG['GROQ_API_KEY'] == 'your_groq_api_key_here':
        return "⚠️ Groq API key not configured. Please set GROQ_API_KEY environment variable."

    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {CONFIG['GROQ_API_KEY']}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "max_tokens": 600
    }
    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']
    except Exception as e:
        return f"Error calling LLM: {str(e)}"


def generate_rag_response(query, context_chunks):
    """Answer *query* with the LLM, grounded in *context_chunks*.

    Args:
        query: The user's question.
        context_chunks: Results of :func:`search_similar_chunks`.

    Returns:
        The LLM answer (or its error string), or a fixed Hindi message when
        no context was retrieved.
    """
    if not context_chunks:
        return "मुझे इस प्रश्न का उत्तर देने के लिए पर्याप्त जानकारी नहीं मिली।"

    context = "\n\n".join([chunk['text'] for chunk in context_chunks])
    prompt = (
        "आप एक हिंदी पुस्तक सहायक हैं। निम्नलिखित जानकारी के आधार पर प्रश्न का उत्तर दें:"
        f" पुस्तक: {SESSION_DATA['book_title']}"
        f" लेखक: {SESSION_DATA['author_name']}"
        f" संदर्भ: {context}"
        f" प्रश्न: {query}"
        " निर्देश:"
        " - हिंदी में संक्षिप्त और सटीक उत्तर दें"
        " - उत्तर की शुरुआत में पुस्तक और लेखक का संदर्भ शामिल करें"
        " - केवल दिए गए संदर्भ के आधार पर ही उत्तर दें "
    )
    response = call_groq_api(prompt)
    return response


def authenticate(passcode):
    """Check *passcode* and toggle the login and main sections.

    Access is denied when no passcode is configured, so a missing
    ``PASSCODE`` environment variable can never be matched by an empty or
    null submission.  The comparison is constant-time.

    Returns:
        Updates for ``(auth_section, main_section)`` plus a status message.
    """
    expected = CONFIG['PASSCODE']
    supplied = passcode or ""
    is_valid = bool(expected) and hmac.compare_digest(
        supplied.encode('utf-8'), expected.encode('utf-8')
    )
    if is_valid:
        SESSION_DATA['authenticated'] = True
        return gr.update(visible=False), gr.update(visible=True), "✅ Welcome!"
    return gr.update(visible=True), gr.update(visible=False), "❌ Invalid passcode"


def _document_failure(message):
    """Build process_document()'s outputs for a failed upload."""
    return (
        message, "", "",
        gr.update(visible=True), gr.update(visible=False), gr.update(choices=[])
    )


def process_document(pdf_file):
    """Extract, chunk and index the uploaded PDF.

    Args:
        pdf_file: The value of the upload component: a filesystem path
            (``gr.File(type="filepath")``) or an object with a ``.name``
            path attribute; ``None`` when nothing was uploaded.

    Returns:
        ``(status, book_title, author_name, upload_section_update,
        book_info_section_update, predefined_dropdown_update)``.
    """
    if pdf_file is None:
        return _document_failure("Please upload a PDF file")

    try:
        # The upload component is declared with type="filepath", which
        # passes a plain path string; keep supporting file-like objects.
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = pdf_file.name

        file_size = os.path.getsize(pdf_path)
        if file_size > CONFIG['MAX_FILE_SIZE']:
            return _document_failure(
                f"File too large! Max size: {CONFIG['MAX_FILE_SIZE'] // (1024*1024)}MB"
            )

        text_content = extract_text_from_pdf(pdf_path)
        if not text_content.strip() or text_content.startswith(_EXTRACTION_ERROR_PREFIXES):
            return _document_failure(text_content)

        author_name, book_title = extract_metadata(text_content)
        SESSION_DATA['author_name'] = author_name
        SESSION_DATA['book_title'] = book_title

        chunks = chunk_text(text_content)
        SESSION_DATA['document_chunks'] = chunks
        SESSION_DATA['faiss_index'] = create_embeddings(chunks)
        SESSION_DATA['query_count'] = 0

        # Generate predefined questions
        questions = []
        for category in PREDEFINED_QUESTIONS.values():
            questions.extend(category)

        success_msg = "✅ Document processed successfully!"
        return (
            success_msg, book_title, author_name,
            gr.update(visible=False), gr.update(visible=True),
            gr.update(choices=questions[:6])
        )
    except Exception as e:
        return _document_failure(f"Error processing document: {str(e)}")


def show_questions():
    """Show the questions section"""
    return gr.update(visible=False), gr.update(visible=True)


def process_query(audio_input, text_input, predefined_question):
    """Answer one question and return ``(markdown_answer, audio_path)``.

    The question is taken from the predefined choice if set, otherwise
    from the transcribed recording, otherwise from the typed text.  A
    failed transcription falls back to the typed text.  The query counter
    is incremented only after an answer has been produced.
    """
    if SESSION_DATA['query_count'] >= CONFIG['MAX_QUERIES_PER_SESSION']:
        return "⚠️ Query limit reached", None
    if not SESSION_DATA['document_chunks']:
        return "Please upload a document first", None

    query_text = ""
    # Priority: Predefined > Audio > Text
    if predefined_question and predefined_question != "Select a question...":
        query_text = predefined_question
    elif audio_input:
        query_text = transcribe_audio(audio_input)
        if query_text.startswith(_TRANSCRIPTION_ERROR_PREFIXES):
            query_text = ""

    typed_text = (text_input or "").strip()
    if not query_text.strip() and typed_text:
        query_text = typed_text

    if not query_text.strip():
        return "Please ask a question", None

    try:
        similar_chunks = search_similar_chunks(query_text)
        response_text = generate_rag_response(query_text, similar_chunks)
        audio_response = text_to_speech(response_text)
        SESSION_DATA['query_count'] += 1
        formatted_response = f"**प्रश्न:** {query_text}\n\n**उत्तर:** {response_text}"
        return formatted_response, audio_response
    except Exception as e:
        return f"Error processing query: {str(e)}", None


def reset_session():
    """Forget the current document and start a new session.

    Cached models and the authentication flag are kept.

    Returns:
        ``(status, book_title, author_name, upload_section_update,
        book_info_section_update, query_section_update,
        predefined_dropdown_update)``.
    """
    SESSION_DATA.update({
        'query_count': 0,
        'document_chunks': [],
        'faiss_index': None,
        'author_name': '',
        'book_title': '',
        'session_id': str(uuid.uuid4())
    })
    return (
        "✅ New session started!", "", "",
        gr.update(visible=True), gr.update(visible=False),
        gr.update(visible=False), gr.update(choices=[])
    )


def create_interface():
    """Build the Gradio Blocks UI and wire its event handlers.

    Returns:
        The ``gr.Blocks`` application, not yet launched.
    """
    app_css = (
        " .main-container { max-width: 1200px; margin: 0 auto; }"
        " .section-header { font-size: 1.2em; font-weight: bold; margin: \n"
        "1em 0; }"
        " .upload-area { border: 2px dashed #ccc; padding: 2em;"
        " text-align: center; margin: 1em 0; } "
    )
    header_html = (
        "\n"
        "\n"
        "📚 Hindi Book Assistant\n"
        "\n"
        "AI-powered assistant for Hindi books with voice support\n"
        "\n"
    )

    with gr.Blocks(
        title="Hindi Book Assistant",
        theme=gr.themes.Soft(),
        css=app_css
    ) as demo:
        gr.HTML(header_html)

        # Authentication Section
        with gr.Group(visible=True) as auth_section:
            gr.Markdown("### 🔐 Enter Passcode")
            passcode_input = gr.Textbox(
                label="Passcode",
                type="password",
                placeholder="Enter access code..."
            )
            auth_button = gr.Button("🔓 Access", variant="primary")
            auth_status = gr.Textbox(label="Status", interactive=False)

        # Main Interface
        with gr.Group(visible=False) as main_section:
            # Step 1: Upload Document
            with gr.Group(visible=True) as upload_section:
                gr.Markdown("### 📄 Upload Your Book")
                pdf_upload = gr.File(
                    label="Choose PDF file",
                    file_types=[".pdf"],
                    type="filepath"
                )
                process_btn = gr.Button("📖 Process Book", variant="primary", size="lg")
                doc_status = gr.Textbox(label="Status", interactive=False)

            # Step 2: Book Info (shown after processing)
            with gr.Group(visible=False) as book_info_section:
                gr.Markdown("### 📚 Book Information")
                with gr.Row():
                    book_title_display = gr.Textbox(label="Book Title", interactive=False)
                    author_display = gr.Textbox(label="Author", interactive=False)
                continue_btn = gr.Button(
                    "➡️ Continue to Questions", variant="primary", size="lg"
                )

            # Step 3: Ask Questions (shown after continue)
            with gr.Group(visible=False) as query_section:
                gr.Markdown("### 💬 Ask Questions About Your Book")

                with gr.Tab("🎯 Quick Questions"):
                    predefined_dropdown = gr.Dropdown(
                        label="Choose a question",
                        choices=[],
                        value=None,
                        interactive=True
                    )
                    ask_predefined_btn = gr.Button("🔍 Ask This Question", variant="primary")

                with gr.Tab("🎤 Voice Question"):
                    audio_input = gr.Audio(
                        label="Record your question (Hindi/English)",
                        sources=["microphone"],
                        type="filepath"
                    )
                    ask_voice_btn = gr.Button("🔍 Ask Voice Question", variant="primary")

                with gr.Tab("⌨️ Type Question"):
                    text_input = gr.Textbox(
                        label="Type your question",
                        placeholder="Example: इस पुस्तक का मुख्य विषय क्या है?",
                        lines=2
                    )
                    ask_text_btn = gr.Button("🔍 Ask Text Question", variant="primary")

                # Response Section
                gr.Markdown("### 📝 Answer")
                response_text = gr.Textbox(
                    label="Response",
                    lines=6,
                    interactive=False
                )
                response_audio = gr.Audio(
                    label="🔊 Audio Response",
                    interactive=False
                )

            # Reset Button
            gr.Markdown("---")
            reset_btn = gr.Button("🔄 Start New Session", variant="secondary")

        # Event Handlers
        auth_button.click(
            authenticate,
            inputs=[passcode_input],
            outputs=[auth_section, main_section, auth_status]
        )
        process_btn.click(
            process_document,
            inputs=[pdf_upload],
            outputs=[
                doc_status, book_title_display, author_display,
                upload_section, book_info_section, predefined_dropdown
            ]
        )
        continue_btn.click(
            show_questions,
            outputs=[book_info_section, query_section]
        )
        # Each button feeds process_query only its own input; the other two
        # arguments are constant placeholders.
        ask_predefined_btn.click(
            process_query,
            inputs=[gr.State(None), gr.State(""), predefined_dropdown],
            outputs=[response_text, response_audio]
        )
        ask_voice_btn.click(
            process_query,
            inputs=[audio_input, gr.State(""), gr.State("")],
            outputs=[response_text, response_audio]
        )
        ask_text_btn.click(
            process_query,
            inputs=[gr.State(None), text_input, gr.State("")],
            outputs=[response_text, response_audio]
        )
        reset_btn.click(
            reset_session,
            outputs=[
                doc_status, book_title_display, author_display,
                upload_section, book_info_section, query_section,
                predefined_dropdown
            ]
        )
        demo.load(load_models)

    return demo


def main():
    """Load the models, build the interface and launch it with a share link."""
    print("🚀 Starting Hindi Book Assistant...")
    print("📋 Loading AI models...")
    load_models()
    demo = create_interface()
    print("✅ Ready!")
    # Never print the passcode itself: the app is launched with a public
    # share link and stdout commonly ends up in hosted logs.
    if CONFIG['PASSCODE']:
        print("🔑 Passcode: configured")
    else:
        print("⚠️ PASSCODE environment variable is not set; login is disabled.")
    demo.launch(
        share=True,
        show_error=True,
    )


if __name__ == "__main__":
    main()