from flask import Flask, request, jsonify
from transformers import pipeline, AutoTokenizer
import torch
import warnings

# Suppress minor warnings that occur on CPU runs
warnings.filterwarnings("ignore")

app = Flask(__name__)
# ===========================
# LOAD MODEL (SmolLM-1.7B)
# This model is small (1.7B parameters) and fully open-access.
# ===========================
model_id = "HuggingFaceTB/SmolLM-1.7B"

print("🚀 Loading model...")

# Select device: GPU (0) if available, otherwise CPU (-1)
device = 0 if torch.cuda.is_available() else -1
# Use float32 on CPU, bfloat16 on GPU
dtype = torch.float32 if device == -1 else torch.bfloat16
try:
    # 1. Load the tokenizer and set pad_token for stability
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Set pad_token to eos_token to avoid a generation warning/error
        tokenizer.pad_token = tokenizer.eos_token

    # 2. Load the pipeline with the configured tokenizer
    ai = pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        max_new_tokens=200,
        device=device,
        torch_dtype=dtype,
        trust_remote_code=True,
    )
    print("✅ Model loaded!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    ai = None
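# Optional smoke test (an illustrative sketch, not part of the serving flow):
# uncomment to verify generation works once before starting the server.
# if ai is not None:
#     print(ai("User: Say hello.\nAssistant:")[0]["generated_text"])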
# ===========================
# CHAT API
# ===========================
@app.route("/chat", methods=["POST"])
def chat():
    if ai is None:
        return jsonify({"error": "Model initialization failed."}), 500
    try:
        data = request.get_json(silent=True) or {}
        msg = data.get("message", "")
        if not msg:
            return jsonify({"error": "No message sent"}), 400

        # Instruction format: a simple User/Assistant template for this model
        prompt = f"User: {msg}\nAssistant:"
        output = ai(prompt)[0]["generated_text"]
        # Clean the output: the pipeline returns the prompt plus the
        # completion, so strip the prompt, then cut the reply off at the
        # next "User:" turn in case the model keeps writing the dialogue.
        if output.startswith(prompt):
            reply = output[len(prompt):]
        else:
            reply = output.split("Assistant:")[-1]
        reply = reply.split("User:")[0].strip()
| return jsonify({"reply": reply}) | |
| except Exception as e: | |
| return jsonify({"error": str(e)}), 500 | |
# ===========================
# RUN SERVER
# ===========================
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=7860)
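# ---------------------------------------------------------------------------
# Example client call (a sketch; assumes the server is running locally on
# port 7860 as configured above):
#
#   curl -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "What is the capital of France?"}'
# ---------------------------------------------------------------------------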