Dushyant4342 committed on
Commit
d0364db
·
verified ·
1 Parent(s): b27e9cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -169
app.py CHANGED
@@ -1,212 +1,103 @@
1
- # app.py — RAG PDF Chat (phi-2 + LlamaIndex) in Gradio
2
- # ------------------------------------------------------------------
3
- # • LLM: microsoft/phi-2
4
- # • Embedding: BAAI/bge-small-en-v1.5
5
- # • UI: Gradio Blocks
6
- # • Retrieval: LlamaIndex VectorStoreIndex (one per PDF)
7
- # ------------------------------------------------------------------
8
-
9
  import gradio as gr
10
- import tempfile
11
- import gc
12
- from pathlib import Path
13
- from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document
14
- from llama_index.core.settings import Settings
15
- from llama_index.llms.huggingface import HuggingFaceLLM
16
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
17
- import torch # Explicitly import torch to check availability early
18
-
19
- print("Script starting...")
20
-
21
- # ---------------- LLM & Embeddings ----------------
22
- print("Initializing LLM and Embeddings...")
23
- try:
24
- Settings.llm = HuggingFaceLLM(
25
- model_name="microsoft/phi-2",
26
- tokenizer_name="microsoft/phi-2",
27
- device_map="auto", # Requires accelerate
28
- model_kwargs={"trust_remote_code": True}, # Often needed for Phi-2
29
- generate_kwargs={"temperature": 0.2, "max_new_tokens": 256, "repetition_penalty": 1.2},
30
- )
31
- Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
32
- print("LLM and Embeddings initialized successfully.")
33
- except Exception as e:
34
- print(f"Error initializing LLM or Embeddings: {e}")
35
- # Optionally, re-raise or handle as appropriate for your app
36
- # For now, we'll let it proceed to see if Gradio UI can at least load
37
- # to show the error, but a real app might stop here.
38
- Settings.llm = None # Ensure it's None if failed
39
- Settings.embed_model = None
40
-
41
- # ---------------- Helpers ----------------
42
- def build_index(path: str) -> VectorStoreIndex:
43
- """Create a VectorStoreIndex from the PDF at path."""
44
- print(f"Building index for: {path}")
45
- # Ensure SimpleDirectoryReader is robust
46
- try:
47
- docs = SimpleDirectoryReader(input_files=[path]).load_data()
48
- if not docs:
49
- print(f"No documents loaded from {path}. Check PDF content and reader.")
50
- # Handle empty or unreadable PDF gracefully
51
- return VectorStoreIndex.from_documents([Document(text="Error: Could not read PDF or PDF is empty.")])
52
- index = VectorStoreIndex.from_documents(docs)
53
- print(f"Index built successfully for: {path}")
54
- return index
55
- except Exception as e:
56
- print(f"Error building index for {path}: {e}")
57
- # Return a dummy index or raise an error that can be caught by the UI
58
- return VectorStoreIndex.from_documents([Document(text=f"Error processing PDF: {e}")])
59
-
60
-
61
- # ---------------- Gradio logic ----------------
62
- def add_pdfs(files, current_state):
63
- """Handle file upload, build indexes, return updated dropdown choices."""
64
- print("Adding PDFs...")
65
- indexes, chat_hist = current_state if current_state else ({}, [])
66
-
67
- if files is None:
68
- print("No files uploaded.")
69
- choices = list(indexes.keys())
70
- return gr.Dropdown.update(choices=choices, value=choices[0] if choices else None), (indexes, chat_hist)
71
-
72
- for f_obj in files: # Gradio File component gives a list of tempfile._TemporaryFileWrapper
73
- original_filename = f_obj.name # This is the path to the temporary file
74
- # Use a more descriptive name if possible, or stick to the temp name if original not easily available
75
- # For this example, we'll use the temp file's name as key, but ideally, you'd want the original upload name.
76
- # Gradio's File component might not directly give original filename easily without custom JS.
77
- # Let's assume f_obj.name is unique enough for this context or use a counter.
78
- # For simplicity, we'll use the temp file path as the key, but this is not ideal for display.
79
- # A better approach would be to get the original filename if the Gradio version supports it easily,
80
- # or manage it via the UI.
81
-
82
- # Let's use Path(original_filename).name to get just the filename part of the temp path
83
- display_name = Path(original_filename).name
84
-
85
- if display_name in indexes:
86
- print(f"Index for {display_name} already exists. Skipping.")
87
- continue
88
-
89
- # The file `f_obj` is already a file-like object pointing to the uploaded content.
90
- # We need its path. `f_obj.name` gives the path to the temporary file Gradio creates.
91
- try:
92
- print(f"Processing file: {display_name} from path: {original_filename}")
93
- # No need to write to another tempfile, Gradio already provides one.
94
- idx = build_index(original_filename)
95
- indexes[display_name] = idx # Use display_name as key
96
- print(f"Index for {display_name} added.")
97
- except Exception as e:
98
- print(f"Failed to process file {display_name}: {e}")
99
- # Optionally, inform the user via the UI
100
- # For now, just log and skip.
101
-
102
- gc.collect() # Clean up memory
103
- choices = list(indexes.keys())
104
- updated_value = choices[0] if choices else None
105
- print(f"PDFs processed. Choices: {choices}, Selected: {updated_value}")
106
- return gr.Dropdown.update(choices=choices, value=updated_value), (indexes, chat_hist)
107
-
108
- def chat(query, pdf_choice, current_state):
109
- """Handle chat query with the selected PDF."""
110
- print(f"Chat query: '{query}' for PDF: '{pdf_choice}'")
111
- indexes, chat_hist = current_state
112
-
113
- if not Settings.llm or not Settings.embed_model:
114
- answer = "⚠️ LLM or Embedding model not initialized. Please check server logs."
115
- chat_hist = chat_hist + [[query, answer]]
116
- return chat_hist, (indexes, chat_hist)
117
-
118
- if not pdf_choice or pdf_choice not in indexes:
119
- answer = "⚠️ Please select a PDF to chat with, or the selected PDF index is not available."
120
- if not pdf_choice:
121
- print("No PDF selected for chat.")
122
- else:
123
- print(f"PDF choice '{pdf_choice}' not found in indexes: {list(indexes.keys())}")
124
- chat_hist = chat_hist + [[query, answer]]
125
- return chat_hist, (indexes, chat_hist)
126
-
127
- query_engine = indexes[pdf_choice].as_query_engine(similarity_top_k=4)
128
- try:
129
- print(f"Querying engine for PDF: {pdf_choice}...")
130
- response = query_engine.query(query)
131
- answer = response.response
132
- print("Query successful.")
133
- except Exception as e:
134
- answer = f"⚠️ Error during query: {e}"
135
- print(f"Exception during query: {e}")
136
 
137
- chat_hist = chat_hist + [[query, answer]]
138
- return chat_hist, (indexes, chat_hist)
 
 
139
 
140
  def clear_chat_and_query(current_state):
141
- """Clears the chatbot and the query box."""
142
- indexes, _ = current_state # Keep indexes
143
- return [], (indexes, []), "" # Clear chatbot, new empty chat_hist, clear query_box
 
144
 
145
- print("Building Gradio interface...")
146
  with gr.Blocks(theme=gr.themes.Soft(), css="footer {display:none}") as demo:
147
- gr.Markdown("## 📄 Chat with any PDF   |   **microsoft/phi-2 + LlamaIndex**")
 
148
 
149
- # (indexes dict: {filename: VectorStoreIndex}, chat_history list: [[user_msg, bot_msg], ...])
150
- # Initialize with empty dict for indexes and empty list for chat_hist
151
- app_state = gr.State(({}, []))
152
 
153
  with gr.Row():
154
  with gr.Column(scale=1, min_width=300):
155
  file_box = gr.File(
156
- label="Upload PDF(s)",
157
  file_types=[".pdf"],
158
  file_count="multiple"
159
  )
160
  pdf_select = gr.Dropdown(
161
- label="Choose a PDF to chat with",
162
  interactive=True
163
  )
164
  with gr.Column(scale=3, min_width=500):
165
  chatbot = gr.Chatbot(
166
- label="Conversation",
167
  bubble_full_width=False,
168
  height=500
169
  )
170
  query_box = gr.Textbox(
171
- label="Ask a question…",
172
  placeholder="Type your question here and press Enter.",
173
- scale=4
174
  )
175
  clear_button = gr.Button("Clear Chat")
176
 
177
-
178
- # Event handlers
179
  file_box.upload(
180
- fn=add_pdfs,
181
  inputs=[file_box, app_state],
182
  outputs=[pdf_select, app_state]
183
  )
184
-
185
- # When a PDF is selected from dropdown, or when files are uploaded and dropdown is updated
186
- # you might want to clear the chat history for the new PDF.
187
- # This can be chained or handled in add_pdfs if desired.
188
- # For now, chat is persistent until "Clear Chat" is pressed.
189
-
190
  query_box.submit(
191
- fn=chat,
192
  inputs=[query_box, pdf_select, app_state],
193
  outputs=[chatbot, app_state]
194
  )
195
-
196
- # Clear button functionality
197
  clear_button.click(
198
  fn=clear_chat_and_query,
199
  inputs=[app_state],
200
- outputs=[chatbot, app_state, query_box] # chatbot, app_state (to reset chat_hist), query_box
201
  )
202
 
203
- print("Gradio Blocks defined.")
204
 
205
  if __name__ == "__main__":
206
- print("Launching Gradio app...")
207
- # For Hugging Face Spaces, demo.launch() is usually sufficient.
208
- # queue() is good for handling multiple users.
209
- # Ensure share=False (default) or not set, as Spaces handles public access.
210
- demo.queue().launch()
211
- print("Gradio app launched.")
 
 
 
 
 
212
 
 
 
1
+ # app.py — MINIMAL TEST VERSION
 
 
 
 
 
 
 
2
  import gradio as gr
3
+ import time # For a small delay to help logs catch up if needed
4
+
5
+ print(f"[{time.time()}] SCRIPT START: Minimal test app.py is running.")
6
+
7
+ # ---------------- Gradio logic (Simplified) ----------------
8
def dummy_add_pdfs(files, current_state):
    """Mock upload handler: ignore file contents and populate the dropdown
    with placeholder choices so the UI wiring can be tested without models.

    Args:
        files: list of uploaded file objects from the gr.File component
            (None when nothing was uploaded).
        current_state: app state tuple (indexes_placeholder, chat_history).

    Returns:
        (dropdown update, state tuple) — matching the [pdf_select, app_state]
        outputs wired to file_box.upload in the Blocks definition.
    """
    print(f"[{time.time()}] dummy_add_pdfs called.")
    # Simulate some processing
    time.sleep(0.1)
    # Don't actually process files, just update UI
    choices = ["PDF A (mock)", "PDF B (mock)"] if files else []
    selected_choice = choices[0] if choices else None
    print(f"[{time.time()}] dummy_add_pdfs: Choices: {choices}, Selected: {selected_choice}")
    # Ensure state is a tuple
    state = current_state if isinstance(current_state, tuple) else (None, [])
    # Use gr.update(): it exists in both Gradio 3.x and 4.x, whereas the
    # class-level gr.Dropdown.update() was removed in Gradio 4.0 and raises
    # AttributeError there.
    return gr.update(choices=choices, value=selected_choice), state
19
+
20
def dummy_chat(query, pdf_choice, current_state):
    """Mock chat handler: return a canned markdown reply instead of running a
    model, so the Chatbot wiring can be exercised in isolation.

    Args:
        query: the user's question from the textbox.
        pdf_choice: currently selected dropdown value.
        current_state: app state tuple (indexes_placeholder, chat_history).

    Returns:
        (updated history, new state tuple) — matching the [chatbot, app_state]
        outputs wired to query_box.submit.
    """
    print(f"[{time.time()}] dummy_chat called with query: '{query}', PDF: '{pdf_choice}'")
    # Simulate some processing
    time.sleep(0.1)

    # Recover prior history only when the state has the expected
    # (indexes, history) shape; otherwise start a fresh conversation.
    if (isinstance(current_state, tuple)
            and len(current_state) > 1
            and isinstance(current_state[1], list)):
        indexes, history = current_state[0], current_state[1]
    else:
        history = []
        indexes = current_state[0] if isinstance(current_state, tuple) else None

    answer = f"This is a **mock** response to '{query}' for '{pdf_choice}'. Model loading is disabled for this test."
    # Build a new list rather than mutating the one stored in state.
    history = history + [[query, answer]]

    new_state = (indexes, history)
    print(f"[{time.time()}] dummy_chat: History updated. Returning new state.")
    return history, new_state
34
 
35
def clear_chat_and_query(current_state):
    """Reset the conversation: empty chatbot, empty history, cleared textbox.

    The indexes placeholder (first element of the state tuple) is carried
    over untouched so uploaded-PDF bookkeeping survives a chat reset.
    """
    print(f"[{time.time()}] clear_chat_and_query called.")
    # Ensure state is correctly structured
    if isinstance(current_state, tuple):
        indexes = current_state[0]
    else:
        indexes = None
    return [], (indexes, []), ""
40
 
41
# ---------------- UI definition + launch (minimal test version) ----------------
print(f"[{time.time()}] Building Gradio interface (minimal)...")

with gr.Blocks(theme=gr.themes.Soft(), css="footer {display:none}") as demo:
    gr.Markdown("## 📄 Minimal Test: PDF Chat App (Models Disabled)")
    gr.Markdown("### If you see this, Gradio started. Model loading is bypassed.")

    # App state is a tuple: (indexes_placeholder, chat_history_list).
    app_state = gr.State((None, []))

    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            file_box = gr.File(
                label="Upload PDF(s) (Mock)",
                file_types=[".pdf"],
                file_count="multiple",
            )
            pdf_select = gr.Dropdown(label="Choose a PDF (Mock)", interactive=True)
        with gr.Column(scale=3, min_width=500):
            # NOTE(review): bubble_full_width is deprecated in newer Gradio
            # releases — confirm against the targeted Gradio version.
            chatbot = gr.Chatbot(
                label="Conversation (Mock)",
                bubble_full_width=False,
                height=500,
            )
            query_box = gr.Textbox(
                label="Ask a question (Mock)…",
                placeholder="Type your question here and press Enter.",
            )
            clear_button = gr.Button("Clear Chat")

    # Wire events to the mock handlers (no model work happens).
    file_box.upload(
        fn=dummy_add_pdfs,
        inputs=[file_box, app_state],
        outputs=[pdf_select, app_state],
    )
    query_box.submit(
        fn=dummy_chat,
        inputs=[query_box, pdf_select, app_state],
        outputs=[chatbot, app_state],
    )
    clear_button.click(
        fn=clear_chat_and_query,
        inputs=[app_state],
        outputs=[chatbot, app_state, query_box],
    )

print(f"[{time.time()}] Gradio Blocks defined (minimal).")

if __name__ == "__main__":
    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app (minimal)...")
    try:
        # Adding a small delay before launch, sometimes helps with log flushing in constrained envs
        # time.sleep(2)
        demo.queue().launch(debug=True)  # Keep debug=True for Gradio logs
        print(f"[{time.time()}] MAIN: Gradio app demo.launch() called (minimal). Monitor for 'Application startup complete'.")
    except Exception as e:
        print(f"[{time.time()}] FATAL ERROR during demo.launch() (minimal): {e}")
        # Write error to a file as a last resort if logs aren't showing
        with open("launch_error.txt", "w") as f_err:
            f_err.write(str(e))

print(f"[{time.time()}] SCRIPT END: Minimal test app.py has finished executing initial setup code.")