anaspro committed
Commit 151da18 · 1 Parent(s): 2f06f2b
update

app.py CHANGED
@@ -3,7 +3,7 @@
 import os
 import torch
 import transformers
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
 import gradio as gr
 import spaces

@@ -22,21 +22,65 @@ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
 # If there is an HF_TOKEN in the environment
 hf_token = os.getenv("HF_TOKEN")

-# Initialize
-
-
-
-    device_map="auto",
+# Initialize model and tokenizer separately for better control
+print("Loading model and tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(
+    model_path,
     token=hf_token,
     trust_remote_code=True
 )

+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    device_map="auto",
+    token=hf_token,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    quantization_config={
+        "load_in_4bit": True,
+        "bnb_4bit_use_double_quant": True,
+        "bnb_4bit_quant_type": "nf4",
+        "bnb_4bit_compute_dtype": torch.bfloat16
+    }
+)
+
+# Create pipeline with the loaded model
+pipeline_model = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device_map="auto"
+)
+
+print("Model loaded successfully!")
+
 def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
     """Generate response using the pipeline with messages format"""
+    # Gemma expects messages in format: [{"role": "user", "content": "..."}, {"role": "model", "content": "..."}]
+    # Convert 'assistant' to 'model' for Gemma
+    gemma_messages = []
+
+    for msg in messages:
+        role = msg['role']
+        # Gemma uses 'model' instead of 'assistant'
+        if role == 'assistant':
+            role = 'model'
+        # Gemma doesn't use system role in the same way - prepend to first user message
+        if role == 'system':
+            continue  # We'll handle system prompt differently
+        gemma_messages.append({"role": role, "content": msg['content']})
+
+    # If there's a system prompt, prepend it to the first user message
+    if messages and messages[0]['role'] == 'system' and gemma_messages:
+        system_content = messages[0]['content']
+        if gemma_messages[0]['role'] == 'user':
+            gemma_messages[0]['content'] = f"{system_content}\n\n{gemma_messages[0]['content']}"
+
     # Apply chat template
     try:
-        prompt =
-
+        prompt = tokenizer.apply_chat_template(
+            gemma_messages,
             tokenize=False,
             add_generation_prompt=True
         )
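Note on the 4-bit settings: the commit passes the bitsandbytes options to from_pretrained as a plain dict. The same settings are usually written with transformers' BitsAndBytesConfig helper; below is a minimal, self-contained sketch of that equivalent load (it reuses the commit's checkpoint name and token lookup; the checkpoint name suggests the weights already ship pre-quantized in bnb-4bit, so the explicit config may be redundant and is shown only for parity with the diff).

    import os
    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"  # same checkpoint as the commit
    hf_token = os.getenv("HF_TOKEN")

    # Same 4-bit options as the dict in the diff, via the documented helper.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        token=hf_token,
        trust_remote_code=True,
        quantization_config=bnb_config,
    )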
@@ -44,14 +88,12 @@ def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
         print(f"Template application error: {template_error}")
         # Fallback: manually format messages
         prompt = ""
-        for msg in
-            if msg['role'] == '
-                prompt += f"
-            elif msg['role'] == '
-                prompt += f"
-
-                prompt += f"Assistant: {msg['content']}\n"
-        prompt += "Assistant: "
+        for msg in gemma_messages:
+            if msg['role'] == 'user':
+                prompt += f"<start_of_turn>user\n{msg['content']}<end_of_turn>\n"
+            elif msg['role'] == 'model':
+                prompt += f"<start_of_turn>model\n{msg['content']}<end_of_turn>\n"
+        prompt += "<start_of_turn>model\n"

         # Debug: print final prompt
         print(f"Final prompt preview: {prompt[:200]}...")
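For illustration, here is a standalone sketch of what the role conversion plus the manual fallback produce for a made-up history (the messages below are hypothetical; the logic mirrors the added code):

    # Hypothetical history in the format the Space receives.
    messages = [
        {"role": "system", "content": "You are Alex, a friendly support agent."},
        {"role": "user", "content": "My router keeps rebooting."},
        {"role": "assistant", "content": "Let's check the power adapter first."},
    ]

    # Same conversion as generate_with_pipeline: 'assistant' -> 'model', system handled separately.
    gemma_messages = []
    for msg in messages:
        role = "model" if msg["role"] == "assistant" else msg["role"]
        if role == "system":
            continue
        gemma_messages.append({"role": role, "content": msg["content"]})

    # Fold the system prompt into the first user turn, as in the commit.
    if messages and messages[0]["role"] == "system" and gemma_messages and gemma_messages[0]["role"] == "user":
        gemma_messages[0]["content"] = f"{messages[0]['content']}\n\n{gemma_messages[0]['content']}"

    # Manual Gemma turn formatting (the fallback path), equivalent to the user/model branches above.
    prompt = ""
    for msg in gemma_messages:
        prompt += f"<start_of_turn>{msg['role']}\n{msg['content']}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"
    print(prompt)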
@@ -80,21 +122,17 @@ def generate_response(message, history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
         max_new_tokens, temperature, top_p, top_k, repetition_penalty: Generation parameters
     """
     try:
-        # Build messages list - Gemma template expects alternating user/
+        # Build messages list - Gemma template expects alternating user/model
         messages = []

-        # Add system message first (will be
+        # Add system message first (will be prepended to first user message)
         messages.append({"role": "system", "content": DEFAULT_SYSTEM_PROMPT})

-        # Add conversation history
+        # Add conversation history
         if history:
             for msg in history:
                 if isinstance(msg, dict) and 'role' in msg and 'content' in msg:
-
-                    role = msg['role']
-                    if role == 'assistant':
-                        role = 'assistant'  # Keep as assistant, template converts to 'model'
-                    messages.append({"role": role, "content": msg['content']})
+                    messages.append({"role": msg['role'], "content": msg['content']})

         # Add current user message
         if isinstance(message, dict):
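The simplified history loop assumes Gradio's messages-format history, i.e. a list of role/content dicts like the illustrative one below; the assistant-to-model renaming that the old code attempted here is now done later, inside generate_with_pipeline.

    # Illustrative shape of `history` as received by generate_response
    # when gr.ChatInterface is used with messages-format history.
    history = [
        {"role": "user", "content": "Hi, my invoice looks wrong."},
        {"role": "assistant", "content": "Sure - can you share the invoice number?"},
    ]
    # Each entry passes the isinstance/role/content check and is appended unchanged.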
@@ -159,7 +197,7 @@ demo = gr.ChatInterface(
     - 🔧 Technical support and troubleshooting
     - 📋 Service information and guidance
     - 🧠 **Remembers the previous conversation** - you can refer back to earlier topics
-    - 🎯 Powered by model
+    - 🎯 Powered by model Gemma-3-4B-IT

    Chat with Alex to solve your technical problems, ask about services, or get product information.""",
    fill_height=True,