Hindi-Rag

Sleeping

App Files Files Community

wellwisherofindia committed on Jun 13, 2025

Commit

3dc7d4f

1 Parent(s): 9f8df1a

Update app.py

Browse files

Files changed (1) hide show

app.py +204 -205

app.py CHANGED Viewed

@@ -2,283 +2,282 @@ import os
 import tempfile
 import gradio as gr
 import numpy as np
-from sklearn.metrics.pairwise import cosine_similarity
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
 import fitz  # PyMuPDF
-import traceback # Import traceback for detailed error logging
 # Initialize embedding model
 sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
 # Data storage
 chunks = []
-embeddings = np.array([])
-# Global state for API key
-# stored_api_key = None # Replaced by gr.State
 def extract_text_from_pdf(pdf_file_path, start_page=None, end_page=None):
-    """
-    Extract text from PDF file, optionally from a specific page range.
-    Page numbers are 1-indexed.
-    """
     doc = fitz.open(pdf_file_path)
     text = ""
-    pages_to_process = []
     num_pages_in_doc = doc.page_count
     if start_page is not None and end_page is not None:
         start_idx = start_page - 1
         end_idx = end_page - 1
         if 0 <= start_idx <= end_idx < num_pages_in_doc:
-            for i in range(start_idx, end_idx + 1):
-                pages_to_process.append(doc.load_page(i))
         else:
-            print(f"Warning: Invalid page range received in extract_text_from_pdf. Defaulting to all pages.")
-            pages_to_process = [doc.load_page(i) for i in range(num_pages_in_doc)]
     else:
-        pages_to_process = [doc.load_page(i) for i in range(num_pages_in_doc)]
-    for page_obj in pages_to_process:
-        text += page_obj.get_text()
     doc.close()
     return text, num_pages_in_doc
 def chunk_text(text, chunk_size=1000, overlap=200):
     """Split text into overlapping chunks"""
     doc_chunks = []
     for i in range(0, len(text), chunk_size - overlap):
         chunk = text[i:i + chunk_size]
-        if len(chunk) > 100: # Ensure chunks are substantial
             doc_chunks.append(chunk)
     return doc_chunks
-def process_pdf(pdf_file_obj, processing_mode, start_page_ui, end_page_ui):
-    """Process PDF (full or page range) and create embeddings."""
-    global chunks, embeddings
     if pdf_file_obj is None:
-        chunks = []
-        embeddings = np.array([])
-        return "No PDF file provided. Please upload a PDF."
-    tmp_path = None
     try:
         with open(pdf_file_obj.name, "rb") as f_in:
             pdf_bytes = f_in.read()
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
             tmp.write(pdf_bytes)
             tmp_path = tmp.name
-    except Exception as e:
-        if tmp_path and os.path.exists(tmp_path):
-            os.unlink(tmp_path)
-        return f"Error handling uploaded PDF file: {str(e)}"
-    actual_start_page, actual_end_page = None, None
-    page_info_str = "full document"
-    pdf_name = os.path.basename(pdf_file_obj.name)
-    try:
-        doc_for_page_count = fitz.open(tmp_path)
-        total_pages_in_doc = doc_for_page_count.page_count
-        doc_for_page_count.close()
-        if processing_mode == "Page Range":
-            if start_page_ui is None or end_page_ui is None:
-                raise ValueError("For 'Page Range' mode, both Start Page and End Page must be specified.")
-            s_page = int(start_page_ui)
-            e_page = int(end_page_ui)
-            if not (1 <= s_page <= total_pages_in_doc and \
-                    1 <= e_page <= total_pages_in_doc and \
-                    s_page <= e_page):
-                raise ValueError(f"Invalid page range ({s_page}-{e_page}). Document has {total_pages_in_doc} pages.")
-            actual_start_page, actual_end_page = s_page, e_page
-            page_info_str = f"pages {s_page}-{e_page}"
-        text, _ = extract_text_from_pdf(tmp_path, start_page=actual_start_page, end_page=actual_end_page)
         if not text.strip():
-            chunks = []
-            embeddings = np.array([])
-            return f"Processed {page_info_str} of '{pdf_name}', but no text content found. Old data cleared."
-        current_book_chunks = chunk_text(text)
-        if not current_book_chunks:
-            chunks = []
-            embeddings = np.array([])
-            return f"Processed {page_info_str} of '{pdf_name}', but no substantial chunks created. Old data cleared."
-        current_book_embeddings = sbert_model.encode(current_book_chunks)
-        chunks = current_book_chunks
-        embeddings = np.array(current_book_embeddings)
-        return f"Successfully processed {page_info_str} of '{pdf_name}'. Created {len(chunks)} chunks. Ready for questions."
-    except ValueError as ve:
-        return f"Error: {str(ve)}"
     except Exception as e:
         chunks = []
-        embeddings = np.array([])
-        error_msg = f"Error processing '{pdf_name}' ({page_info_str}): {str(e)}\n{traceback.format_exc()}"
-        print(error_msg)
-        return error_msg
-    finally:
-        if tmp_path and os.path.exists(tmp_path):
-            os.unlink(tmp_path)
-def retrieve_relevant_chunks(query, top_k=5):
-    """Retrieve most relevant chunks based on query."""
-    global chunks, embeddings
-    if not chunks or not isinstance(embeddings, np.ndarray) or embeddings.size == 0:
-        return ["No document processed or no content yielded. Please process a PDF."]
-    query_embedding = sbert_model.encode([query])[0]
-    similarities = cosine_similarity([query_embedding], embeddings)[0]
-    num_available_chunks = len(chunks)
-    actual_top_k = min(top_k, num_available_chunks)
-    if actual_top_k == 0:
-        return ["No relevant chunks found."]
-    top_indices = np.argsort(similarities)[-actual_top_k:][::-1]
-    # Return the actual chunk text
-    top_chunks_text = [chunks[i] for i in top_indices]
-    return top_chunks_text
-def generate_response(query, current_api_key_state):
-    """Generate response using Gemini API and RAG, including sources."""
-    if not current_api_key_state:
-        return "API key not set. Please enter your API key and click 'Confirm API Key'.", "" # Return empty string for sources
     try:
-        genai.configure(api_key=current_api_key_state)
-        context_chunks = retrieve_relevant_chunks(query)
-        if not context_chunks or "No document" in context_chunks[0] or "No relevant chunks" in context_chunks[0]:
-             return f"Could not retrieve relevant context. Ensure a PDF is processed.\n\nDetails: {context_chunks[0]}", "" # Return empty string for sources
-        context_for_prompt = "\n\n".join(context_chunks)
-        prompt = f"""
-        Based on the following context from a book, please answer the query.
-        CONTEXT:
-        {context_for_prompt}
-        QUERY:
-        {query}
-        Please provide a helpful and accurate response based only on the information in the context. If the context doesn't provide an answer, say so.
-        """
-        gemini_model_instance = genai.GenerativeModel('gemini-1.5-flash-latest')
-        response = gemini_model_instance.generate_content(prompt)
-        # Prepare sources text
-        sources_text = "--- Sources (Context Chunks) ---\n"
-        for i, chunk in enumerate(context_chunks):
-            sources_text += f"\n[Source {i+1}]:\n{chunk}\n"
-        return response.text, sources_text # Return answer and sources separately
     except Exception as e:
-        return f"Error generating response: {str(e)}\n{traceback.format_exc()}", "" # Return empty string for sources
-with gr.Blocks(title="RAG Book Assistant") as demo:
-    api_key_state = gr.State(None)  # Store the API key
-    gr.Markdown("# 📚 RAG Book Assistant")
-    gr.Markdown(
-        "Upload a PDF book. You can choose to process the full book or a specific page range. "
-        "Processing a new PDF (or range) will **replace the current one**.\n\n"
-    )
     with gr.Row():
         with gr.Column(scale=2):
-            with gr.Group() as api_key_group:
-                api_key_input = gr.Textbox(label="Gemini API Key", type="password", elem_id="api_key_input_id")
-                confirm_api_key_btn = gr.Button("Confirm API Key", elem_id="confirm_api_key_btn_id")
-            api_key_status_output = gr.Markdown(visible=False, value="API Key Set!", elem_id="api_key_status_id")
-            pdf_input = gr.File(label="Upload PDF Book", file_types=['.pdf'])
-            processing_mode_input = gr.Radio(
-                label="Processing Mode",
-                choices=["Full Book", "Page Range"],
-                value="Full Book",
-                interactive=True
             )
-            with gr.Row(visible=False) as page_range_ui_row:
-                start_page_input = gr.Number(label="Start Page", minimum=1, precision=0, interactive=True)
-                end_page_input = gr.Number(label="End Page", minimum=1, precision=0, interactive=True)
-            process_btn = gr.Button("Process PDF (Replaces Current Book)")
-            query_input = gr.Textbox(label="Ask a question about the current book", lines=3)
-            submit_btn = gr.Button("Submit Question")
         with gr.Column(scale=1):
-            status_output = gr.Textbox(label="Processing Status", interactive=False, lines=5)
-            response_output = gr.Textbox(label="Response (Answer)", interactive=False, lines=10)
-            sources_output = gr.Textbox(label="Sources (Context Chunks)", interactive=False, lines=10)
-    # Logic to show/hide page range inputs
-    def update_page_range_visibility(mode):
-        return gr.Row(visible=(mode == "Page Range"))
-    processing_mode_input.change(
-        fn=update_page_range_visibility,
-        inputs=processing_mode_input,
-        outputs=page_range_ui_row
     )
-    # Logic to handle API key confirmation
-    def confirm_api_key(api_key):
-        if api_key:
-            return {
-                api_key_state: api_key,
-                api_key_group: gr.Group(visible=False),
-                api_key_status_output: gr.Markdown(visible=True, value="API Key Set and Hidden.")
-            }
-        else:
-            return {
-                api_key_state: None,
-                api_key_group: gr.Group(visible=True),
-                api_key_status_output: gr.Markdown(visible=True, value="Please enter an API Key.")
-            }
-    confirm_api_key_btn.click(
-        fn=confirm_api_key,
-        inputs=[api_key_input],
-        outputs=[api_key_state, api_key_group, api_key_status_output]
     )
-    process_btn.click(
-        process_pdf,
-        inputs=[pdf_input, processing_mode_input, start_page_input, end_page_input],
-        outputs=[status_output]
     )
     submit_btn.click(
-        generate_response,
-        inputs=[query_input, api_key_state],
-        outputs=[response_output, sources_output]
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import tempfile
 import gradio as gr
 import numpy as np
+import faiss
 from sentence_transformers import SentenceTransformer
 import google.generativeai as genai
 import fitz  # PyMuPDF
+import traceback
 # Initialize embedding model
 sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
 # Data storage
 chunks = []
+faiss_index = None
+embedding_dimension = 384  # all-MiniLM-L6-v2 embedding dimension
 def extract_text_from_pdf(pdf_file_path, start_page=None, end_page=None):
+    """Extract text from PDF file, optionally from a specific page range."""
     doc = fitz.open(pdf_file_path)
     text = ""
     num_pages_in_doc = doc.page_count
     if start_page is not None and end_page is not None:
         start_idx = start_page - 1
         end_idx = end_page - 1
         if 0 <= start_idx <= end_idx < num_pages_in_doc:
+            pages_to_process = range(start_idx, end_idx + 1)
         else:
+            pages_to_process = range(num_pages_in_doc)
     else:
+        pages_to_process = range(num_pages_in_doc)
+    for i in pages_to_process:
+        text += doc.load_page(i).get_text()
     doc.close()
     return text, num_pages_in_doc
 def chunk_text(text, chunk_size=1000, overlap=200):
     """Split text into overlapping chunks"""
     doc_chunks = []
     for i in range(0, len(text), chunk_size - overlap):
         chunk = text[i:i + chunk_size]
+        if len(chunk) > 100:
             doc_chunks.append(chunk)
     return doc_chunks
+def create_faiss_index(embeddings):
+    """Create FAISS index for fast similarity search."""
+    global embedding_dimension
+    # Normalize embeddings for cosine similarity
+    faiss.normalize_L2(embeddings)
+    # Create index - using IndexFlatIP for cosine similarity
+    index = faiss.IndexFlatIP(embedding_dimension)
+    index.add(embeddings)
+    return index
+def process_pdf(pdf_file_obj, api_key):
+    """Process PDF and create FAISS index."""
+    global chunks, faiss_index
+    if not api_key:
+        return None, [["System", "⚠️ Please set your Gemini API key first."]]
     if pdf_file_obj is None:
+        return None, [["System", "📄 Please upload a PDF file."]]
     try:
+        # Save uploaded file temporarily
         with open(pdf_file_obj.name, "rb") as f_in:
             pdf_bytes = f_in.read()
         with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
             tmp.write(pdf_bytes)
             tmp_path = tmp.name
+        # Extract text
+        text, total_pages = extract_text_from_pdf(tmp_path)
         if not text.strip():
+            return None, [["System", "⚠️ No text found in the PDF. Please try a different file."]]
+        # Create chunks
+        current_chunks = chunk_text(text)
+        if not current_chunks:
+            return None, [["System", "⚠️ Could not create text chunks from the PDF."]]
+        # Generate embeddings
+        current_embeddings = sbert_model.encode(current_chunks)
+        current_embeddings = np.array(current_embeddings, dtype=np.float32)
+        # Create FAISS index
+        current_index = create_faiss_index(current_embeddings)
+        # Update global storage
+        chunks = current_chunks
+        faiss_index = current_index
+        pdf_name = os.path.basename(pdf_file_obj.name)
+        success_msg = f"✅ Successfully processed '{pdf_name}' ({total_pages} pages, {len(chunks)} chunks). FAISS index created! You can now ask questions!"
+        # Clean up
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+        return None, [["System", success_msg]]
     except Exception as e:
         chunks = []
+        faiss_index = None
+        error_msg = f"❌ Error processing PDF: {str(e)}"
+        return None, [["System", error_msg]]
+def retrieve_relevant_chunks(query, top_k=3):
+    """Retrieve most relevant chunks using FAISS search."""
+    global chunks, faiss_index
+    if not chunks or faiss_index is None:
+        return []
+    try:
+        # Encode query
+        query_embedding = sbert_model.encode([query])
+        query_embedding = np.array(query_embedding, dtype=np.float32)
+        # Normalize for cosine similarity
+        faiss.normalize_L2(query_embedding)
+        # Search using FAISS
+        scores, indices = faiss_index.search(query_embedding, top_k)
+        # Get top chunks
+        top_chunks = []
+        for idx in indices[0]:
+            if idx < len(chunks):  # Safety check
+                top_chunks.append(chunks[idx])
+        return top_chunks
+    except Exception as e:
+        print(f"Error in FAISS search: {str(e)}")
+        return []
+def chat_fn(message, history, api_key):
+    """Handle chat interaction."""
+    if not message.strip():
+        return history, ""
+    # Add user message to history
+    history = history + [[message, None]]
+    if not api_key:
+        history[-1][1] = "⚠️ Please set your Gemini API key first."
+        return history, ""
+    if not chunks or faiss_index is None:
+        history[-1][1] = "📄 Please upload and process a PDF document first."
+        return history, ""
     try:
+        # Configure Gemini
+        genai.configure(api_key=api_key)
+        # Get relevant context using FAISS
+        context_chunks = retrieve_relevant_chunks(message, top_k=5)
+        if not context_chunks:
+            history[-1][1] = "❌ Could not find relevant information in the document."
+            return history, ""
+        # Generate response
+        context = "\n\n".join(context_chunks)
+        prompt = f"""Based on the following context from the document, answer the user's question.
+Context:
+{context}
+Question: {message}
+Please provide a clear, accurate answer based only on the information in the context. If the context doesn't contain enough information to answer the question, say so."""
+        model = genai.GenerativeModel('gemini-1.5-flash-latest')
+        response = model.generate_content(prompt)
+        history[-1][1] = response.text
     except Exception as e:
+        history[-1][1] = f"❌ Error: {str(e)}"
+    return history, ""
+# Custom CSS for better chat appearance
+css = """
+.gradio-container {
+    max-width: 800px !important;
+    margin: auto !important;
+}
+.chat-message {
+    padding: 10px !important;
+    margin: 5px 0 !important;
+    border-radius: 10px !important;
+}
+"""
+with gr.Blocks(css=css, title="📚 Chat with Your PDF") as demo:
+    api_key_state = gr.State("")
+    gr.Markdown("""
+    # 📚 Chat with Your PDF (FAISS-Powered)
+    Upload a PDF document and chat with it naturally. Now with FAISS for faster vector search!
+    """)
     with gr.Row():
         with gr.Column(scale=2):
+            api_key_input = gr.Textbox(
+                label="🔑 Gemini API Key",
+                type="password",
+                placeholder="Enter your API key here..."
             )
         with gr.Column(scale=1):
+            pdf_input = gr.File(
+                label="📄 Upload PDF",
+                file_types=['.pdf']
+            )
+    # Chat interface
+    chatbot = gr.Chatbot(
+        label="💬 Chat",
+        height=500,
+        show_label=False,
+        bubble_full_width=False
     )
+    msg_input = gr.Textbox(
+        label="Message",
+        placeholder="Ask anything about your PDF...",
+        show_label=False,
+        container=False
     )
+    with gr.Row():
+        submit_btn = gr.Button("Send 💬", variant="primary")
+        clear_btn = gr.Button("Clear Chat 🗑️")
+    # Event handlers
+    def update_api_key(key):
+        return key
+    api_key_input.change(
+        fn=update_api_key,
+        inputs=api_key_input,
+        outputs=api_key_state
     )
+    pdf_input.upload(
+        fn=process_pdf,
+        inputs=[pdf_input, api_key_state],
+        outputs=[msg_input, chatbot]
+    )
     submit_btn.click(
+        fn=chat_fn,
+        inputs=[msg_input, chatbot, api_key_state],
+        outputs=[chatbot, msg_input]
+    )
+    msg_input.submit(
+        fn=chat_fn,
+        inputs=[msg_input, chatbot, api_key_state],
+        outputs=[chatbot, msg_input]
+    )
+    clear_btn.click(
+        fn=lambda: ([], ""),
+        outputs=[chatbot, msg_input]
     )
 if __name__ == "__main__":
+    demo.launch(share=True)