Spaces:

AiCoderv2
/

dark-app-74

Sleeping

App Files Files Community

AiCoderv2 commited on Oct 4, 2025

Commit

af5ca25

verified ·

1 Parent(s): 6558d3e

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +73 -50

app.py CHANGED Viewed

@@ -5,11 +5,11 @@ from typing import List, Dict
 class ChatbotHandler:
     def __init__(self):
-        self.model_name = "facebook/opt-13b"  # 13B parameter model (close to 15B)
         self.tokenizer = None
         self.model = None
         self.chat_pipeline = None
-        self.max_length = 1000
         self.temperature = 0.7
         self.model_loaded = False
         self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant.
@@ -21,20 +21,29 @@ class ChatbotHandler:
         self.initialize_model()
     def initialize_model(self):
-        """Initialize the Hugging Face model."""
         try:
-            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
         except ImportError:
             print("Transformers library not available. Please install the required dependencies.")
             return False
         try:
-            print("Loading OPT-13B model... This may take a very long time and require significant memory.")
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
                 device_map="auto",  # Automatically distribute across available GPUs
-                torch_dtype="auto",
                 low_cpu_mem_usage=True
             )
@@ -42,7 +51,7 @@ class ChatbotHandler:
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
-            # Create pipeline for text generation
             self.chat_pipeline = pipeline(
                 "text-generation",
                 model=self.model,
@@ -52,7 +61,8 @@ class ChatbotHandler:
                 temperature=self.temperature,
                 do_sample=True,
                 pad_token_id=self.tokenizer.eos_token_id,
-                truncation=True
             )
             print("Model loaded successfully!")
             self.model_loaded = True
@@ -62,16 +72,16 @@ class ChatbotHandler:
             return False
     def get_response(self, message: str, history: List[Dict]) -> str:
-        """Get response from the model."""
         if not self.chat_pipeline:
             return "Model not loaded. Please try again later."
         try:
-            # Prepare conversation history as a single string
             conversation = self.system_prompt + "\n"
-            # Add recent history (limit to last 3 exchanges to save memory)
-            for msg in history[-3:]:
                 if msg["role"] == "user":
                     conversation += f"User: {msg['content']}\n"
                 elif msg["role"] == "assistant":
@@ -80,15 +90,20 @@ class ChatbotHandler:
             # Add current message
             conversation += f"User: {message}\nAssistant:"
-            # Generate response with memory constraints
             outputs = self.chat_pipeline(
                 conversation,
-                max_new_tokens=100,  # Shorter responses to save memory
                 num_return_sequences=1,
                 return_full_text=False,
                 do_sample=True,
-                temperature=self.temperature
             )
             response = outputs[0]['generated_text'].strip()
@@ -98,56 +113,64 @@ class ChatbotHandler:
             elif response.startswith("User:"):
                 response = "I apologize, but I seem to have gotten confused. How can I help you?"
-            # Limit response length
-            if len(response) > 500:
-                response = response[:500] + "..."
-            # Simulate streaming by yielding chunks
             words = response.split()
             current_response = ""
-            for word in words:
-                current_response += word + " "
                 yield current_response.strip()
-                time.sleep(0.02)  # Faster streaming
         except Exception as e:
-            yield f"I apologize, but I encountered an error generating a response. Please try asking your question again. Error: {str(e)}"
 # Initialize chatbot handler
 chat_handler = ChatbotHandler()
 def respond_stream(message: str, history: List[Dict]):
-    """Generate streaming response from the model."""
     if not message.strip():
         return "", history
     # Always add user message first to prevent disappearing chats
-    history = history + [{"role": "user", "content": message}]
     # Check if model is initialized
     if not chat_handler.chat_pipeline:
-        history = history + [{"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."}]
-        return "", history
-    # Get streaming response
     full_response = ""
     try:
-        for chunk in chat_handler.get_response(message, history[:-1]):  # Don't include current user message in context
             full_response = chunk
-            # Update the last assistant message
-            if len(history) > 0 and history[-1].get("role") == "assistant":
-                history[-1]["content"] = full_response
             else:
-                history = history + [{"role": "assistant", "content": full_response}]
-            yield "", history
     except Exception as e:
         # If streaming fails, add a fallback response
         error_msg = "I apologize, but I encountered an error. Please try again."
-        if len(history) > 0 and history[-1].get("role") == "assistant":
-            history[-1]["content"] = error_msg
         else:
-            history = history + [{"role": "assistant", "content": error_msg}]
-        yield "", history
 def clear_history():
     """Clear the chat history."""
@@ -160,22 +183,22 @@ def update_model_settings(temp, max_len):
     return f"Settings updated: temp={temp}, max_length={max_len}"
 # Create the interface
-with gr.Blocks(theme=gr.themes.Soft(), title="AI Chatbot with OPT-13B") as demo:
     # Header
     gr.HTML("""
     <div style='text-align: center; padding: 20px;'>
-        <h1>🤖 AI Chatbot</h1>
-        <p style='color: #666;'>Powered by OPT-13B (13B parameters) • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
     </div>
     """)
     # Status indicator
     if chat_handler.model_loaded:
-        status_msg = "✅ Chatbot is ready! Start chatting below."
         status_color = "#28a745"
     else:
-        status_msg = "⏳ Loading OPT-13B model... This may take 10-20 minutes and requires significant memory."
         status_color = "#ffc107"
     gr.HTML(f"""
@@ -196,12 +219,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Chatbot with OPT-13B") as demo:
                 info="Higher values make responses more creative"
             )
             max_length = gr.Slider(
-                minimum=500,
-                maximum=2000,
-                value=1000,
-                step=100,
                 label="Max Length",
-                info="Maximum context length"
             )
     # Chatbot component
@@ -247,8 +270,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Chatbot with OPT-13B") as demo:
     # Footer
     gr.HTML("""
     <div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
-        <p>This chatbot uses Meta's OPT-13B model (13 billion parameters) from Hugging Face. It's completely free to use!</p>
-        <p><strong>Note:</strong> This large model requires significant computational resources and may take time to load and respond.</p>
     </div>
     """)

 class ChatbotHandler:
     def __init__(self):
+        self.model_name = "facebook/opt-6.7b"  # Smaller, faster 6.7B model instead of 13B
         self.tokenizer = None
         self.model = None
         self.chat_pipeline = None
+        self.max_length = 512  # Reduced for speed
         self.temperature = 0.7
         self.model_loaded = False
         self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant.
         self.initialize_model()
     def initialize_model(self):
+        """Initialize the Hugging Face model with quantization for speed."""
         try:
+            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
+            import torch
         except ImportError:
             print("Transformers library not available. Please install the required dependencies.")
             return False
         try:
+            print("Loading OPT-6.7B model with 8-bit quantization... This should be faster.")
+            # Configure 8-bit quantization for speed
+            quantization_config = BitsAndBytesConfig(
+                load_in_8bit=True,
+                llm_int8_enable_fp32_cpu_offload=True
+            )
             self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_name,
+                quantization_config=quantization_config,
                 device_map="auto",  # Automatically distribute across available GPUs
+                torch_dtype=torch.float16,
                 low_cpu_mem_usage=True
             )
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
+            # Create pipeline for text generation with optimized settings
             self.chat_pipeline = pipeline(
                 "text-generation",
                 model=self.model,
                 temperature=self.temperature,
                 do_sample=True,
                 pad_token_id=self.tokenizer.eos_token_id,
+                truncation=True,
+                use_fast=True
             )
             print("Model loaded successfully!")
             self.model_loaded = True
             return False
     def get_response(self, message: str, history: List[Dict]) -> str:
+        """Get response from the model with optimized settings."""
         if not self.chat_pipeline:
             return "Model not loaded. Please try again later."
         try:
+            # Prepare conversation history as a single string (limit to last 2 exchanges for speed)
             conversation = self.system_prompt + "\n"
+            # Add recent history (limit to last 2 exchanges for speed)
+            for msg in history[-2:]:
                 if msg["role"] == "user":
                     conversation += f"User: {msg['content']}\n"
                 elif msg["role"] == "assistant":
             # Add current message
             conversation += f"User: {message}\nAssistant:"
+            # Generate response with optimized settings for speed
+            start_time = time.time()
             outputs = self.chat_pipeline(
                 conversation,
+                max_new_tokens=50,  # Shorter responses for speed
                 num_return_sequences=1,
                 return_full_text=False,
                 do_sample=True,
+                temperature=self.temperature,
+                top_p=0.9,  # Add top_p for better quality
+                repetition_penalty=1.1  # Reduce repetition
             )
+            end_time = time.time()
+            print(f"Response generated in {end_time - start_time:.2f} seconds")
             response = outputs[0]['generated_text'].strip()
             elif response.startswith("User:"):
                 response = "I apologize, but I seem to have gotten confused. How can I help you?"
+            # Limit response length for speed
+            if len(response) > 200:
+                response = response[:200] + "..."
+            # Faster streaming (yield larger chunks)
             words = response.split()
             current_response = ""
+            chunk_size = 3  # Yield every 3 words for faster streaming
+            for i in range(0, len(words), chunk_size):
+                chunk = words[i:i + chunk_size]
+                current_response += " ".join(chunk) + " "
                 yield current_response.strip()
+                time.sleep(0.01)  # Very short delay for smooth streaming
         except Exception as e:
+            yield f"I apologize, but I encountered an error. Please try again. Error: {str(e)}"
 # Initialize chatbot handler
 chat_handler = ChatbotHandler()
 def respond_stream(message: str, history: List[Dict]):
+    """Generate streaming response from the model with fixed history management."""
     if not message.strip():
         return "", history
+    # Create a copy of history to avoid mutation issues
+    current_history = history.copy()
     # Always add user message first to prevent disappearing chats
+    current_history.append({"role": "user", "content": message})
     # Check if model is initialized
     if not chat_handler.chat_pipeline:
+        current_history.append({"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."})
+        return "", current_history
+    # Get streaming response with error handling
     full_response = ""
+    assistant_added = False
     try:
+        for chunk in chat_handler.get_response(message, current_history[:-1]):  # Don't include current user message in context
             full_response = chunk
+            # Update or add the assistant message
+            if not assistant_added:
+                current_history.append({"role": "assistant", "content": full_response})
+                assistant_added = True
             else:
+                current_history[-1]["content"] = full_response
+            yield "", current_history
     except Exception as e:
         # If streaming fails, add a fallback response
         error_msg = "I apologize, but I encountered an error. Please try again."
+        if not assistant_added:
+            current_history.append({"role": "assistant", "content": error_msg})
         else:
+            current_history[-1]["content"] = error_msg
+        yield "", current_history
 def clear_history():
     """Clear the chat history."""
     return f"Settings updated: temp={temp}, max_length={max_len}"
 # Create the interface
+with gr.Blocks(theme=gr.themes.Soft(), title="Fast AI Chatbot with OPT-6.7B") as demo:
     # Header
     gr.HTML("""
     <div style='text-align: center; padding: 20px;'>
+        <h1>⚡ Fast AI Chatbot</h1>
+        <p style='color: #666;'>Powered by OPT-6.7B with 8-bit quantization • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
     </div>
     """)
     # Status indicator
     if chat_handler.model_loaded:
+        status_msg = "✅ Chatbot is ready! Responses should take 1-3 seconds."
         status_color = "#28a745"
     else:
+        status_msg = "⏳ Loading OPT-6.7B model with quantization... Should be faster than before."
         status_color = "#ffc107"
     gr.HTML(f"""
                 info="Higher values make responses more creative"
             )
             max_length = gr.Slider(
+                minimum=256,
+                maximum=1024,
+                value=512,
+                step=64,
                 label="Max Length",
+                info="Maximum context length (lower = faster)"
             )
     # Chatbot component
     # Footer
     gr.HTML("""
     <div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
+        <p>This chatbot uses Meta's OPT-6.7B model with 8-bit quantization for fast responses (1-3 seconds). It's completely free to use!</p>
+        <p><strong>Speed optimizations:</strong> Smaller model, quantization, shorter responses, optimized parameters.</p>
     </div>
     """)