Smllm2

Sleeping

App Files Files Community

ghosthets commited on Dec 12, 2025

Commit

2aa19f3

verified ·

1 Parent(s): 05fe403

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -49

app.py CHANGED Viewed

@@ -1,30 +1,36 @@
 import flask
 from flask import request, jsonify
-# Use AutoModelForCausalLM for Decoder-only models like Qwen
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-# Initialize the Flask application
 app = flask.Flask(__name__)
-# Qwen1.5-0.5B-Chat Model ID
-model_id = "Qwen/Qwen1.5-0.5B-Chat"
 print(f"🔄 Loading {model_id} model...")
-# Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-# Load the model using the correct CausalLM class
-# Using bfloat16 for better memory/speed if a compatible GPU is available
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-# Set the device (GPU/CPU)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
-print(f"✅ {model_id} Model loaded successfully!")
 @app.route('/chat', methods=['POST'])
 def chat():
     try:
@@ -34,57 +40,31 @@ def chat():
         if not msg:
             return jsonify({"error": "No message sent"}), 400
-        # --- Qwen1.5 Chat Template Formatting ---
-        # Qwen models require input in the ChatML format.
-        chat_history = [{"role": "user", "content": msg}]
-        # apply_chat_template handles the specific formatting (e.g., <|im_start|>user\n...)
-        formatted_prompt = tokenizer.apply_chat_template(
-            chat_history,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        # Tokenize the formatted prompt
-        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
-        # Generation configuration
         output = model.generate(
             **inputs,
-            max_length=256,
             do_sample=True,
-            top_p=0.8,
             temperature=0.6,
-            # Set pad_token_id to eos_token_id, which is often necessary for Causal LMs
-            pad_token_id=tokenizer.eos_token_id
         )
-        # Decode the full output
-        full_reply = tokenizer.decode(output[0], skip_special_tokens=False)
-        # --- Extract only the Generated Response ---
-        # Qwen ChatML format uses '<|im_start|>assistant\n' before the response
-        assistant_tag = "<|im_start|>assistant\n"
-        if assistant_tag in full_reply:
-             # Split the full reply and take the content after the assistant tag
-             reply = full_reply.split(assistant_tag)[-1].strip()
-             # Remove the end-of-message tag if it was generated
-             if "<|im_end|>" in reply:
-                 reply = reply.split("<|im_end|>")[0].strip()
-        else:
-             # Fallback: Decode only the newly generated tokens
-             reply = tokenizer.decode(output[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
         return jsonify({"reply": reply})
     except Exception as e:
-        # Catch any runtime errors
         return jsonify({"error": str(e)}), 500
 if __name__ == "__main__":
-    # Run the Flask app
     app.run(host='0.0.0.0', port=7860)

 import flask
 from flask import request, jsonify
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 app = flask.Flask(__name__)
+# ---------------------------
+# SMALL LLM MODEL (1–2 GB)
+# ---------------------------
+# Best small model: SmolLM-1.7B-Chat
+model_id = "HuggingFaceTB/SmolLM-1.7B-Chat"
 print(f"🔄 Loading {model_id} model...")
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Load model (auto dtype to avoid errors)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+)
+# Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
+print(f"✅ {model_id} loaded successfully!")
+# ---------------------------
+# Chat Endpoint
+# ---------------------------
 @app.route('/chat', methods=['POST'])
 def chat():
     try:
         if not msg:
             return jsonify({"error": "No message sent"}), 400
+        # SmolLM uses normal text prompt (no ChatML)
+        prompt = f"<|user|>\n{msg}\n<|assistant|>\n"
+        inputs = tokenizer(prompt, return_tensors="pt").to(device)
         output = model.generate(
             **inputs,
+            max_new_tokens=256,
             do_sample=True,
             temperature=0.6,
+            top_p=0.8,
+            pad_token_id=tokenizer.eos_token_id,
         )
+        reply = tokenizer.decode(output[0], skip_special_tokens=True)
+        # Extract only assistant part
+        if "<|assistant|>" in reply:
+            reply = reply.split("<|assistant|>")[-1].strip()
         return jsonify({"reply": reply})
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=7860)