Try to improve inference
- app.py +21 -5
- requirements.txt +1 -3
app.py
CHANGED

@@ -8,13 +8,25 @@ model_name = "samzito12/lora_model2"
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "left"
 
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     device_map="cpu",
+    torch_dtype=torch.float32,
+    low_cpu_mem_usage=True
 )
 
-
+print("⚙️ Quantizing the model for CPU optimization...")
+model = torch.quantization.quantize_dynamic(
+    model,
+    {torch.nn.Linear},
+    dtype=torch.qint8
+)
+
+model.eval()
+
+SYSTEM_PROMPT = "You are a helpful AI assistant based on Meta's Llama-3.2-3B model, fine-tuned on a code dataset."
 
 def chat(message, history):
     # Build conversation
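For context on the quantization step above: `torch.quantization.quantize_dynamic` stores the weights of every `nn.Linear` as int8 and dequantizes them on the fly, which usually cuts memory use and speeds up CPU inference at a small quality cost. Below is a minimal standalone sketch of how one might compare latency before and after, assuming the model id from this diff; the prompt, token count, and timing approach are illustrative only.

```python
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "samzito12/lora_model2"  # from the diff; any causal LM works

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="cpu", torch_dtype=torch.float32, low_cpu_mem_usage=True
)
model.eval()

inputs = tokenizer("Explain machine learning in simple terms.", return_tensors="pt")

def time_generate(m, label):
    # Rough wall-clock timing of a short greedy generation on CPU.
    start = time.perf_counter()
    with torch.no_grad():
        m.generate(**inputs, max_new_tokens=32, do_sample=False)
    print(f"{label}: {time.perf_counter() - start:.1f}s")

time_generate(model, "float32 baseline")

# Same call as in app.py: int8 weights, float activations.
quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
time_generate(quantized, "int8 dynamic")
```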
@@ -26,15 +38,16 @@ def chat(message, history):
     conversation += f"User: {message}\nAssistant:"
 
     # Tokenize
-    inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=
+    inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=1024, padding=True)
 
     # Generate
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=
+            max_new_tokens=128,
             temperature=0.7,
             do_sample=True,
+            use_cache=True,
             pad_token_id=tokenizer.eos_token_id,
             eos_token_id=tokenizer.eos_token_id
         )
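A note on why the earlier `tokenizer.padding_side = "left"` matters once `padding=True` is passed here: decoder-only models continue from the last position, so right padding would leave pad tokens between the prompt and the generated text. A small sketch of the difference, using GPT-2's tokenizer purely as a stand-in (this Space's model has its own tokenizer):

```python
from transformers import AutoTokenizer

# GPT-2 is an arbitrary stand-in tokenizer for the demonstration.
tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token

prompts = ["Hi", "A noticeably longer prompt than the first one"]

tok.padding_side = "right"
print(tok(prompts, padding=True)["input_ids"][0])  # [Hi, eos, eos, ...]

tok.padding_side = "left"
print(tok(prompts, padding=True)["input_ids"][0])  # [eos, eos, ..., Hi]

# With left padding, every row's real last token sits in the final slot,
# which is the position a decoder-only model continues from.
```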
@@ -54,7 +67,7 @@ demo = gr.ChatInterface(
     chat,
     title="🦙 My Fine-Tuned Llama-3.2-3B Chatbot",
     description="""
-    **Model:** Llama-3.2-3B fine-tuned on
+    **Model:** Llama-3.2-3B fine-tuned on a code dataset
 
     it's a custom fine-tuned model for ID2223 Lab 2.
     """,
@@ -63,7 +76,10 @@ demo = gr.ChatInterface(
     "Explain machine learning in simple terms",
     "Write a Python function to reverse a string"
     ],
-    theme="soft"
+    theme="soft",
+    retry_btn=None,  # disable retry to avoid overload
+    undo_btn=None,
+    clear_btn="Clear"
 )
 
 if __name__ == "__main__":
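One caveat on the new ChatInterface kwargs: `retry_btn`, `undo_btn`, and `clear_btn` are Gradio 4.x parameters, and if memory serves they were removed in Gradio 5, so an unpinned `gradio` requirement could break this. A minimal stand-in sketch for testing the UI wiring without loading the 3B model (the `echo` function is a placeholder, not the Space's chat function):

```python
import gradio as gr

def echo(message, history):
    # Placeholder responder so the interface runs without the model.
    return f"You said: {message}"

demo = gr.ChatInterface(
    echo,
    title="🦙 My Fine-Tuned Llama-3.2-3B Chatbot",
    theme="soft",
    retry_btn=None,
    undo_btn=None,
    clear_btn="Clear",
)

if __name__ == "__main__":
    # queue() serializes requests, which helps on a shared CPU Space.
    demo.queue().launch()
```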
requirements.txt
CHANGED

@@ -1,6 +1,4 @@
 gradio
 transformers
 torch
-accelerate
-huggingface-hub
-bitsandbytes
+accelerate
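On the dependency change: `bitsandbytes` primarily provides GPU quantization kernels, so it is dead weight now that quantization happens via `torch.quantization.quantize_dynamic` on CPU, and `huggingface-hub` is pulled in transitively by `transformers` anyway. `accelerate` stays because passing `device_map=` to `from_pretrained` needs it. A quick sanity check one could run in the Space (package names only; versions are not pinned here):

```python
# Verify the trimmed requirements still cover everything app.py relies on.
from importlib.util import find_spec

for pkg in ("gradio", "transformers", "torch", "accelerate"):
    status = "ok" if find_spec(pkg) is not None else "MISSING"
    print(f"{pkg}: {status}")
```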