Update app.py
app.py
CHANGED
@@ -1,40 +1,155 @@
 import gradio as gr
-import
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
+import time
+
+print(f"[{time.time()}] SCRIPT START: Small Local LLM Chat. PID: {os.getpid()}")
+
+# --- Configuration ---
+MODEL_NAME = "distilgpt2"  # A small and efficient model
+# For slightly larger, try "gpt2" (the smallest version of GPT-2)
+# MODEL_NAME = "gpt2"
+
+# Determine device: use CUDA if available, otherwise CPU.
+# For small models on typical HF Spaces, CPU is often the only option or more stable.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"[{time.time()}] Using device: {DEVICE}")
+
+# --- Load Model and Tokenizer ---
+# This section can take some time and memory, especially on first run (downloading model)
+model = None
+tokenizer = None
+model_load_error = None
+
+try:
+    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
+
+    # Add a padding token if it doesn't exist (common for GPT-2 models)
+    if tokenizer and tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")
+
+    print(f"[{time.time()}] Loading model {MODEL_NAME} to {DEVICE}...")
+    # For CPU, ensure model is explicitly moved. For 'auto', it might try GPU.
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
+    model.eval()  # Set model to evaluation mode
+    print(f"[{time.time()}] Model {MODEL_NAME} loaded successfully on {DEVICE}.")
+
+except Exception as e:
+    model_load_error = str(e)
+    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
+    # Fallback or error display will be handled in the Gradio UI
+
+# --- Chat Function ---
+def generate_chat_response(message, history):
+    """
+    Generates a response from the local LLM.
+    'message' is the user's new input.
+    'history' is a list of previous [user, bot] pairs.
+    """
+    print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")

-
+    if model_load_error or not model or not tokenizer:
+        error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
+        print(f"[{time.time()}] {error_msg}")
+        return error_msg

-
-    # message
-    # history
-
-
-
-
-
-    # The function for gr.ChatInterface should return the bot's response as a string
-    return f"Echo: {message}"
+    # Basic conversation history formatting (can be improved)
+    # We'll prepend the history to the current message to give some context.
+    # Keep history short to avoid exceeding max input length for small models.
+    prompt = ""
+    # Limit history to last 2 turns to keep prompt short
+    for user_msg, bot_msg in history[-2:]:
+        prompt += f"User: {user_msg}\nBot: {bot_msg}\n"
+    prompt += f"User: {message}\nBot:"

+    try:
+        print(f"[{time.time()}] Encoding prompt for model...")
+        # Ensure padding_side is set correctly if using padding during generation (though not typical for single prompt generation)
+        # tokenizer.padding_side = "left"  # Important for decoder-only models if batching
+
+        inputs = tokenizer.encode_plus(
+            prompt,
+            return_tensors="pt",
+            padding=True,  # Pad to max length of batch (or model if single)
+            truncation=True,
+            max_length=512  # Max input length for the model (distilgpt2 is 1024, but keep it reasonable)
+        ).to(DEVICE)
+
+        input_ids = inputs["input_ids"]
+        attention_mask = inputs["attention_mask"]
+
+        print(f"[{time.time()}] Generating response... Input ID length: {input_ids.shape[1]}")
+
+        # Generate response
+        # `max_length` here is the total length of input + output
+        # `max_new_tokens` is usually preferred for controlling output length specifically
+        with torch.no_grad():  # Disable gradient calculations for inference
+            output_sequences = model.generate(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                max_new_tokens=60,  # Max number of new tokens to generate
+                num_return_sequences=1,
+                pad_token_id=tokenizer.pad_token_id,  # Use the pad token ID from tokenizer
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=True,  # Enable sampling for more diverse outputs
+                top_k=50,  # Consider top_k tokens for sampling
+                top_p=0.95,  # Use nucleus sampling
+                temperature=0.8  # Controls randomness
+            )
+
+        # Decode the generated sequence
+        response_text = tokenizer.decode(output_sequences[0][input_ids.shape[-1]:], skip_special_tokens=True)
+
+        # Basic post-processing: remove potential artifacts or incomplete sentences if needed
+        response_text = response_text.strip()
+
+        print(f"[{time.time()}] Raw generated text: '{response_text}'")
+        if not response_text:
+            response_text = "I'm not sure how to respond to that right now."
+
+        return response_text
+
+    except Exception as e:
+        print(f"[{time.time()}] Error during text generation: {e}")
+        return f"Error generating response: {e}"
+
+# --- Gradio Interface ---
 if __name__ == "__main__":
-    print(f"[{time.time()}] MAIN: Building Gradio interface (
-
-
-
-
-
-
-
-
+    print(f"[{time.time()}] MAIN: Building Gradio interface (Small Local LLM Chat)...")
+
+    interface_title = f"Chat with Small Local LLM ({MODEL_NAME})"
+    interface_description = f"""
+    This app runs a small language model ({MODEL_NAME}) directly in this Space.
+    Responses might be slow and simple due to the model's size and CPU processing.
+    """
+    if model_load_error:
+        interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
+    elif not model or not tokenizer:
+        interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
+
+
+    chat_interface = gr.ChatInterface(
+        fn=generate_chat_response,
+        title=interface_title,
+        description=interface_description,
+        examples=[["Hello, who are you?"], ["What is 1+1?"]],
+        cache_examples=False,  # Disable caching for dynamic model responses
+        retry_btn="Retry",
+        undo_btn="Delete last",
+        clear_btn="Clear chat",
     )
-
-    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app
+
+    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
     try:
-
-        print(f"[{time.time()}] MAIN: Gradio app launch() called
+        chat_interface.queue().launch(debug=True)  # queue() for better handling, debug=True for logs
+        print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
     except Exception as e:
-        print(f"[{time.time()}] FATAL ERROR during launch
-
-
-            f_err.write(f"Error during Echo Bot launch: {str(e)}\n")
+        print(f"[{time.time()}] FATAL ERROR during launch: {e}")
+        with open("launch_error.txt", "w") as f_err:  # Fallback error logging
+            f_err.write(f"Error during Small LLM Chat launch: {str(e)}\n")

-    print(f"[{time.time()}] SCRIPT END:
+    print(f"[{time.time()}] SCRIPT END: Small Local LLM Chat app.py has finished initial setup.")
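For this revision to boot on a Space, the runtime needs `torch` and `transformers` in addition to `gradio`. A minimal `requirements.txt` sketch follows; the file and its version pin are assumptions, not part of this commit. The `[user, bot]`-pair `history` format and the `retry_btn`/`undo_btn`/`clear_btn` keyword arguments match the Gradio 3.x-era `ChatInterface` API (recent Gradio releases removed those arguments), so an older pin is assumed:

```text
# Hypothetical requirements.txt for this Space -- not part of the commit.
# The gradio pin is an assumption based on the 3.x-era ChatInterface kwargs above.
gradio==3.50.2
torch
transformers
```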
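Because the launch code is guarded by `if __name__ == "__main__":`, importing the module loads the model and tokenizer without starting the server, which allows a quick local check of the prompt formatting and generation loop. A minimal smoke-test sketch, assuming the file above is saved as `app.py` next to this script:

```python
# Hypothetical smoke test for generate_chat_response -- not part of the commit.
from app import generate_chat_response  # triggers the module-level model load

history = []  # Gradio passes history as [user, bot] pairs
for msg in ["Hello, who are you?", "What is 1+1?"]:
    reply = generate_chat_response(msg, history)
    print(f"User: {msg}\nBot: {reply}\n")
    history.append([msg, reply])  # same pair format the UI would accumulate
```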