anaspro committed
Commit 238300f · 1 Parent(s): 6d60e00
Files changed (1)
  1. app.py +48 -73
app.py CHANGED
@@ -1,9 +1,7 @@
- # -*- coding: utf-8 -*-
-
  import os
  import torch
  import transformers
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  import gradio as gr
  import spaces

@@ -22,64 +20,31 @@ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
  # If an HF_TOKEN is present in the environment
  hf_token = os.getenv("HF_TOKEN")

- # Initialize model and tokenizer separately for better control
- print("Loading model and tokenizer...")
- try:
-     tokenizer = AutoTokenizer.from_pretrained(
-         model_path,
-         token=hf_token,
-         trust_remote_code=True
-     )
-
-     # Load model with proper quantization config
-     from transformers import BitsAndBytesConfig
-
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_use_double_quant=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.bfloat16
-     )
-
-     model = AutoModelForCausalLM.from_pretrained(
-         model_path,
-         quantization_config=bnb_config,
-         device_map="auto",
-         token=hf_token,
-         trust_remote_code=True,
-         torch_dtype=torch.bfloat16,
-         low_cpu_mem_usage=True
-     )
-
-     # Create pipeline with the loaded model
-     pipeline_model = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer
-     )
-
-     print("Model loaded successfully!")
-
- except Exception as e:
-     print(f"Error loading model: {e}")
-     # Fallback to direct pipeline loading
-     print("Trying alternative loading method...")
-     pipeline_model = pipeline(
-         "text-generation",
-         model=model_path,
-         token=hf_token,
-         trust_remote_code=True,
-         model_kwargs={
-             "torch_dtype": torch.bfloat16,
-             "low_cpu_mem_usage": True,
-         }
-     )
-     tokenizer = pipeline_model.tokenizer
-     print("Model loaded with fallback method!")
+ # Initialize model and tokenizer for ZeroGPU
+ print("Loading model and tokenizer for ZeroGPU...")
+
+ # Load tokenizer first
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_path,
+     token=hf_token,
+     trust_remote_code=True
+ )
+
+ # For ZeroGPU, load the model without specifying device_map;
+ # the @spaces.GPU() decorator will handle GPU allocation.
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     token=hf_token,
+     trust_remote_code=True,
+     torch_dtype=torch.float16,  # Use float16 for ZeroGPU
+     low_cpu_mem_usage=True
+ )
+
+ print("Model loaded successfully!")


  def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
-     """Generate response using the pipeline with messages format"""
+     """Generate response using the model with messages format"""
      # Gemma expects messages in format: [{"role": "user", "content": "..."}, {"role": "model", "content": "..."}]
      # Convert 'assistant' to 'model' for Gemma
      gemma_messages = []
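Note on the new loading path: under ZeroGPU the model is loaded at startup without a device_map, and a GPU is attached only while a @spaces.GPU()-decorated function runs. A minimal sketch of that flow, reusing the model and tokenizer objects loaded above (the helper name run_inference and its prompt are illustrative, not part of this commit):

import spaces
import torch

@spaces.GPU()  # requests a GPU for the duration of this call on ZeroGPU Spaces
def run_inference(prompt: str) -> str:  # illustrative helper, not in the commit
    # model and tokenizer are the module-level objects created above
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=32, do_sample=False)
    # drop the prompt tokens, keep only the newly generated ones
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)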
@@ -109,7 +74,7 @@ def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=
          )
      except Exception as template_error:
          print(f"Template application error: {template_error}")
-         # Fallback: manually format messages
+         # Fallback: manually format messages for Gemma
          prompt = ""
          for msg in gemma_messages:
              if msg['role'] == 'user':
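For context on the fallback branch above: Gemma's chat template wraps each turn in <start_of_turn>/<end_of_turn> markers and ends with an open model turn. The manual formatting the comment refers to typically looks like the sketch below (a reconstruction under that assumption; the actual fallback body sits outside this hunk):

def format_gemma_prompt(messages):
    # Approximate Gemma's chat template by hand, for when
    # tokenizer.apply_chat_template() raises. Roles are assumed to be
    # already mapped to "user"/"model", as in gemma_messages above.
    prompt = ""
    for msg in messages:
        prompt += f"<start_of_turn>{msg['role']}\n{msg['content']}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"  # leave the model turn open for generation
    return prompt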
@@ -121,20 +86,29 @@ def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=
      # Debug: print final prompt
      print(f"Final prompt preview: {prompt[:200]}...")

-     outputs = pipeline_model(
-         prompt,
-         max_new_tokens=max_new_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         top_k=top_k,
-         repetition_penalty=repetition_penalty,
-         do_sample=True,
-         return_full_text=False
-     )
-     return outputs[0]["generated_text"]
+     # Tokenize the prompt and move it to the model's device
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+     # Generate
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             repetition_penalty=repetition_penalty,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     # Decode only the newly generated tokens
+     response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+     return response


- @spaces.GPU()
+ @spaces.GPU()  # This decorator handles GPU allocation for ZeroGPU
  def generate_response(message, history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
      """
      Generate response with full conversation history
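One caveat in the new model.generate() call: it passes pad_token_id=tokenizer.pad_token_id, which is None for checkpoints that ship without a pad token, in which case generate() falls back with a warning. A common defensive guard (not part of this commit) is to reuse the EOS token:

# Defensive guard, not in this commit: reuse EOS as the pad token when
# the tokenizer defines none, so model.generate() pads silently and correctly.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id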
@@ -168,7 +142,8 @@ def generate_response(message, history, max_new_tokens, temperature, top_p, top_
      # Debug: print messages structure
      print(f"Messages sent to model: {len(messages)} messages")
      for i, msg in enumerate(messages):
-         print(f" Message {i}: role={msg['role']}, content_preview={msg['content'][:50]}...")
+         content_preview = msg['content'][:50] if len(msg['content']) > 50 else msg['content']
+         print(f" Message {i}: role={msg['role']}, content_preview={content_preview}...")

      # Generate response
      response = generate_with_pipeline(
@@ -234,4 +209,4 @@ demo = gr.ChatInterface(
  )

  if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
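For reference, the generate_response(message, history, ...) signature above matches the gr.ChatInterface pattern in which the extra sampling controls are passed as additional_inputs. A minimal sketch of that wiring (slider ranges and labels are illustrative, not taken from this commit):

import gradio as gr

demo = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[
        gr.Slider(16, 1024, value=256, step=8, label="max_new_tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.05, label="temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
        gr.Slider(1, 100, value=50, step=1, label="top_k"),
        gr.Slider(1.0, 2.0, value=1.0, step=0.05, label="repetition_penalty"),
    ],
)

if __name__ == "__main__":
    demo.launch()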