david167 committed on
Commit
fcef7cd
·
1 Parent(s): 8106bb9

COMPLETE REWRITE: Clean ChatGPT-style interface with proper response handling

Browse files
Files changed (1) hide show
  1. gradio_app.py +202 -185
gradio_app.py CHANGED
@@ -1,9 +1,7 @@
1
  import os
2
  import logging
3
- import time
4
- import asyncio
5
- from typing import List, Optional, Dict, Any
6
  import threading
 
7
 
8
  import torch
9
  from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -13,233 +11,252 @@ import gradio as gr
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
 
16
- class ModelManager:
17
- def __init__(self):
18
- self.model = None
19
- self.tokenizer = None
20
- self.device = None
21
- self.model_loaded = False
22
-
23
- # Load model in a separate thread to avoid blocking
24
- self.load_thread = threading.Thread(target=self.load_model_sync)
25
- self.load_thread.daemon = True
26
- self.load_thread.start()
27
-
28
- def load_model_sync(self):
29
- """Load model synchronously in background thread"""
30
- try:
31
- logger.info("Starting model loading...")
32
-
33
- # Check if CUDA is available and force to cuda:0
34
- if torch.cuda.is_available():
35
- torch.cuda.set_device(0)
36
- self.device = "cuda:0"
37
- else:
38
- self.device = "cpu"
39
-
40
- logger.info(f"Using device: {self.device}")
41
-
42
- if self.device == "cuda:0":
43
- logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
44
- logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
45
-
46
- # Get HF token from environment
47
- hf_token = os.getenv("HF_TOKEN")
48
-
49
- logger.info("Loading Llama-3.1-8B-Instruct model...")
50
- base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
51
-
52
- self.tokenizer = AutoTokenizer.from_pretrained(
53
- base_model_name,
54
- use_fast=True,
55
- trust_remote_code=True,
56
- token=hf_token
57
- )
58
-
59
- self.model = AutoModelForCausalLM.from_pretrained(
60
- base_model_name,
61
- torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
62
- device_map={"": 0}, # Force all parameters to GPU 0
63
- trust_remote_code=True,
64
- low_cpu_mem_usage=True,
65
- use_safetensors=True,
66
- token=hf_token
67
- )
68
-
69
- # Ensure model is on the correct device
70
- if self.device == "cuda:0":
71
- self.model = self.model.to(self.device)
72
-
73
- self.model_loaded = True
74
- logger.info("Model loaded successfully!")
75
-
76
- except Exception as e:
77
- logger.error(f"Error loading model: {str(e)}")
78
- self.model_loaded = False
79
 
80
- # Initialize model manager
81
- model_manager = ModelManager()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- def chat_with_model(message, history, temperature):
84
- """Raw chat function for direct model interaction"""
 
 
 
 
 
 
85
  if not message.strip():
86
  return history, ""
87
 
88
  try:
89
- # Use the model directly for raw chat
90
- if not model_manager.model_loaded:
91
- response = "Model not loaded yet. Please wait..."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  else:
93
- # Create a simple chat prompt
94
- prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
95
-
96
- {message}
97
-
98
- <|eot_id|><|start_header_id|>assistant<|end_header_id|>
99
-
100
- """
101
-
102
- # Generate response using the model directly
103
- inputs = model_manager.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
104
-
105
- # Force all inputs to the same device as the model
106
- if model_manager.device == "cuda:0":
107
- model_device = next(model_manager.model.parameters()).device
108
- inputs = {k: v.to(model_device) for k, v in inputs.items()}
109
-
110
- with torch.no_grad():
111
- outputs = model_manager.model.generate(
112
- **inputs,
113
- max_new_tokens=4096,
114
- temperature=temperature,
115
- top_p=0.95,
116
- do_sample=True,
117
- num_beams=1,
118
- pad_token_id=model_manager.tokenizer.eos_token_id,
119
- eos_token_id=model_manager.tokenizer.eos_token_id,
120
- early_stopping=False, # Disable early stopping to prevent premature truncation
121
- repetition_penalty=1.1 # Add slight repetition penalty to improve quality
122
- )
123
-
124
- # Decode the generated text and remove the input prompt
125
- full_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
126
- # Use a more robust method to extract the response
127
- # Look for the assistant header end and extract everything after it
128
- assistant_start = "<|start_header_id|>assistant<|end_header_id|>"
129
- if assistant_start in full_text:
130
- # Find the position after the assistant header
131
- response_start = full_text.find(assistant_start) + len(assistant_start)
132
- # TEMPORARY: Show full response for debugging
133
- response = f"=== FULL RESPONSE ===\n{full_text}\n=== END ==="
134
- # Original line: response = full_text[response_start:].strip()
135
- logger.info(f"Extracted response length: {len(response)}")
136
- else:
137
- # Fallback: try to remove the original prompt
138
- try:
139
- response = full_text[len(prompt):].strip()
140
- except:
141
- response = full_text.strip()
142
-
143
- # Check if response ends abruptly (might indicate truncation)
144
- if response and not response.endswith(('.', '!', '?', ':', ';')):
145
- logger.warning(f"Response may be truncated - ends with: '{response[-20:]}'")
146
-
147
- if not response:
148
- response = "I couldn't generate a response. Please try a different prompt."
149
-
150
  except Exception as e:
151
- logger.error(f"Error in chat: {str(e)}")
152
- response = f"Error: {str(e)}"
153
-
154
- # Add both user message and bot response to history using new message format
155
- history.append({"role": "user", "content": message})
156
- history.append({"role": "assistant", "content": response})
157
 
158
  return history, ""
159
 
160
- def clear_chat():
161
  """Clear the chat history"""
162
- return [], ""
 
 
 
 
 
 
 
163
 
164
- # CSS for styling
165
  css = """
 
 
 
 
166
  #chatbot {
167
- min-height: 500px;
168
- border: 1px solid #e0e0e0;
169
- border-radius: 8px;
170
- overflow: auto;
171
- background-color: #f9f9f9;
172
  }
173
- .gradio-container {
174
- font-family: 'IBM Plex Sans', sans-serif;
 
 
 
 
 
 
 
 
 
 
175
  }
176
  """
177
 
178
- # Create simplified chat interface
179
- with gr.Blocks(css=css, title="Llama Chat", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
180
  gr.Markdown(
181
  """
182
  # 🦙 Llama Chat
183
- ### Raw interface for Llama-3.1-8B-Instruct
184
 
185
- Direct chat interface for testing prompts and having conversations with the model.
186
  """
187
  )
188
 
189
- # Simple chat interface
190
- chatbot = gr.Chatbot(
191
- elem_id="chatbot",
192
- label="Chat",
193
- show_label=False,
194
- avatar_images=(None, None),
195
- show_share_button=False,
196
- type="messages", # Use new message format
197
- height=500
198
- )
199
-
200
  with gr.Row():
201
  with gr.Column(scale=4):
202
- msg = gr.Textbox(
203
- placeholder="Type your message here...",
204
  show_label=False,
205
- container=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  )
207
- with gr.Column(scale=1):
208
- submit_btn = gr.Button("Send", variant="primary")
209
- with gr.Column(scale=1):
210
- clear_btn = gr.Button("Clear", variant="secondary")
211
-
212
- with gr.Row():
213
- temperature = gr.Slider(
214
- minimum=0.1,
215
- maximum=2.0,
216
- value=0.8,
217
- step=0.1,
218
- label="Temperature",
219
- info="Controls randomness (0.1=focused, 2.0=creative)"
220
- )
221
 
222
  # Event handlers
223
  def respond(message, history, temp):
224
- return chat_with_model(message, history, temp)
225
 
 
226
  msg.submit(respond, [msg, chatbot, temperature], [chatbot, msg])
227
- submit_btn.click(respond, [msg, chatbot, temperature], [chatbot, msg])
228
- clear_btn.click(clear_chat, outputs=[chatbot, msg])
229
 
230
- # Add footer
231
  gr.Markdown(
232
  """
233
  ---
234
  <div style="text-align: center; color: #666; font-size: 0.9em;">
235
- Built with ❤️ using Gradio and Llama-3.1-8B-Instruct
236
- <a href="/docs" target="_blank">API Documentation</a>
237
  </div>
238
  """
239
  )
240
 
241
  if __name__ == "__main__":
242
- # Launch Gradio interface
243
  demo.launch(
244
  server_name="0.0.0.0",
245
  server_port=7860,
 
1
  import os
2
  import logging
 
 
 
3
  import threading
4
+ from typing import List, Tuple
5
 
6
  import torch
7
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
14
+ # Global variables for model
15
+ model = None
16
+ tokenizer = None
17
+ device = None
18
+ model_loaded = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ def load_model():
21
+ """Load the Llama model and tokenizer"""
22
+ global model, tokenizer, device, model_loaded
23
+
24
+ try:
25
+ logger.info("Starting model loading...")
26
+
27
+ # Check if CUDA is available and force to cuda:0
28
+ if torch.cuda.is_available():
29
+ torch.cuda.set_device(0)
30
+ device = "cuda:0"
31
+ else:
32
+ device = "cpu"
33
+
34
+ logger.info(f"Using device: {device}")
35
+
36
+ if device == "cuda:0":
37
+ logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
38
+ logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
39
+
40
+ # Get HF token from environment
41
+ hf_token = os.getenv("HF_TOKEN")
42
+
43
+ logger.info("Loading Llama-3.1-8B-Instruct model...")
44
+ model_name = "meta-llama/Llama-3.1-8B-Instruct"
45
+
46
+ # Load tokenizer
47
+ tokenizer = AutoTokenizer.from_pretrained(
48
+ model_name,
49
+ use_fast=True,
50
+ trust_remote_code=True,
51
+ token=hf_token
52
+ )
53
+
54
+ # Load model
55
+ model = AutoModelForCausalLM.from_pretrained(
56
+ model_name,
57
+ torch_dtype=torch.float16 if device == "cuda:0" else torch.float32,
58
+ device_map={"": 0}, # Force all parameters to GPU 0
59
+ trust_remote_code=True,
60
+ low_cpu_mem_usage=True,
61
+ use_safetensors=True,
62
+ token=hf_token
63
+ )
64
+
65
+ # Ensure model is on the correct device
66
+ if device == "cuda:0":
67
+ model = model.to(device)
68
+
69
+ model_loaded = True
70
+ logger.info("Model loaded successfully!")
71
+
72
+ except Exception as e:
73
+ logger.error(f"Error loading model: {str(e)}")
74
+ model_loaded = False
75
 
76
def chat_response(message: str, history: List[List[str]], temperature: float) -> Tuple[List[List[str]], str]:
    """Generate a reply to *message* and append the turn to *history*.

    Args:
        message: The user's new message.
        history: Chat history as a list of [user_msg, assistant_msg] pairs.
        temperature: Sampling temperature for generation.

    Returns:
        (updated history, "") — the empty string clears the input textbox.
    """
    global model, tokenizer, device, model_loaded

    if not model_loaded:
        history.append([message, "🔄 Model is still loading, please wait..."])
        return history, ""

    if not message.strip():
        return history, ""

    try:
        # Rebuild the Llama-3.1 chat template from prior turns so the
        # model keeps conversation context.
        conversation = ""
        for user_msg, assistant_msg in history:
            if user_msg and assistant_msg:
                conversation += f"<|start_header_id|>user<|end_header_id|>\n{user_msg}<|eot_id|>"
                conversation += f"<|start_header_id|>assistant<|end_header_id|>\n{assistant_msg}<|eot_id|>"

        # Add current message
        prompt = f"<|begin_of_text|>{conversation}<|start_header_id|>user<|end_header_id|>\n{message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)

        # Move to correct device
        if device == "cuda:0":
            inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=temperature,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # BUG FIX: the old extraction searched the decoded text for
        # "<|start_header_id|>assistant<|end_header_id|>", but decoding with
        # skip_special_tokens=True strips those markers, so the search never
        # matched and the fallback sliced by len(prompt) — a character count
        # that still includes the stripped special tokens, mis-slicing the
        # reply. Decoding only the newly generated token span is exact and
        # needs no marker search at all.
        prompt_token_count = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_token_count:], skip_special_tokens=True
        ).strip()

        # Defensive cleanup in case a stop marker survives decoding.
        response = response.replace("<|eot_id|>", "").strip()

        if not response:
            response = "I apologize, but I couldn't generate a response. Please try rephrasing your message."

        # Add to history
        history.append([message, response])

    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        history.append([message, f"Error: {str(e)}"])

    return history, ""
142
 
143
def clear_history():
    """Return a fresh, empty chat history list."""
    return list()
146
+
147
# Kick off model loading without blocking app startup.
def load_model_background():
    """Thread target that simply delegates to load_model()."""
    load_model()

# Daemon thread: Gradio can come up immediately, and chat_response
# reports "still loading" until model_loaded flips to True.
model_thread = threading.Thread(
    target=load_model_background,
    daemon=True,
)
model_thread.start()
153
 
154
# Custom CSS for ChatGPT-like appearance: centered container, fixed-height
# scrollable chat pane, and WhatsApp-style user/bot message bubbles.
css = """
.gradio-container {
    max-width: 1200px !important;
    margin: auto !important;
}
#chatbot {
    height: 600px !important;
    overflow-y: auto !important;
}
.message {
    padding: 10px !important;
    margin: 5px 0 !important;
    border-radius: 10px !important;
}
.user {
    background-color: #dcf8c6 !important;
    margin-left: 20% !important;
}
.bot {
    background-color: #f1f1f1 !important;
    margin-right: 20% !important;
}
"""
178
 
179
# Build the Gradio UI: chat pane on the left, settings sidebar on the right.
with gr.Blocks(css=css, title="Llama Chat", theme=gr.themes.Soft()) as demo:

    # Header
    gr.Markdown(
        """
        # 🦙 Llama Chat
        ### Powered by Llama-3.1-8B-Instruct

        A clean, ChatGPT-style interface for conversing with the Llama model.
        """
    )

    # Main layout: chat column (wide) + settings column (narrow).
    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                label="Chat",
                show_label=False,
                height=600,
                show_copy_button=True,
            )

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your message here...",
                    show_label=False,
                    scale=4,
                    lines=1,
                    max_lines=5,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

        with gr.Column(scale=1, min_width=250):
            gr.Markdown("### ⚙️ Settings")

            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.8,
                step=0.1,
                label="Temperature",
                info="Controls creativity (0.1=focused, 2.0=creative)",
            )

            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

            gr.Markdown(
                """
                ### 💡 Tips
                - Use lower temperature (0.1-0.5) for factual responses
                - Use higher temperature (1.0-2.0) for creative tasks
                - Press Enter to send messages
                - The model maintains conversation context
                """
            )

    # Event handlers — thin wrapper so gradio's callback signature stays
    # decoupled from chat_response's.
    def respond(message, history, temp):
        return chat_response(message, history, temp)

    # Enter key and Send button behave identically; Clear resets both the
    # chat pane and the input textbox.
    msg.submit(respond, [msg, chatbot, temperature], [chatbot, msg])
    send_btn.click(respond, [msg, chatbot, temperature], [chatbot, msg])
    clear_btn.click(lambda: (clear_history(), ""), outputs=[chatbot, msg])

    # Footer
    gr.Markdown(
        """
        ---
        <div style="text-align: center; color: #666; font-size: 0.9em;">
        🚀 Built with Gradio 🦙 Powered by Llama-3.1-8B-Instruct
        </div>
        """
    )
258
 
259
  if __name__ == "__main__":
 
260
  demo.launch(
261
  server_name="0.0.0.0",
262
  server_port=7860,