Text Generation
Transformers
Safetensors
English
echo
text-generation-inference
conversational
custom_code
🇪🇺 Region: EU
Instructions to use ethicalabs/Echo-DSRN-114M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ethicalabs/Echo-DSRN-114M with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ethicalabs/Echo-DSRN-114M", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("ethicalabs/Echo-DSRN-114M", trust_remote_code=True, dtype="auto")
```

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use ethicalabs/Echo-DSRN-114M with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "ethicalabs/Echo-DSRN-114M"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ethicalabs/Echo-DSRN-114M",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```

Use Docker
docker model run hf.co/ethicalabs/Echo-DSRN-114M
- SGLang
How to use ethicalabs/Echo-DSRN-114M with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "ethicalabs/Echo-DSRN-114M" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ethicalabs/Echo-DSRN-114M",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```

Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "ethicalabs/Echo-DSRN-114M" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ethicalabs/Echo-DSRN-114M",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```

- Docker Model Runner
How to use ethicalabs/Echo-DSRN-114M with Docker Model Runner:
docker model run hf.co/ethicalabs/Echo-DSRN-114M
| import os | |
| from typing import Any, Dict, List | |
| import torch | |
| from transformers import ( | |
| AutoConfig, | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| StoppingCriteria, | |
| StoppingCriteriaList, | |
| ) | |
| from .modeling_echo import EchoConfig, EchoForCausalLM | |
# Bind the "echo" model type to the locally imported Echo classes so the
# Auto* factories resolve to this code (intended to override the remote code).
AutoConfig.register(model_type="echo", config=EchoConfig)
AutoModelForCausalLM.register(config_class=EchoConfig, model_class=EchoForCausalLM)
class StringStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires when the decoded text ends with a stop string.

    Only the first sequence of the batch (``input_ids[0]``) is inspected.
    """

    def __init__(self, tokenizer, stop_strings):
        # tokenizer: object exposing ``decode``; stop_strings: iterable of str.
        self.tokenizer = tokenizer
        self.stop_strings = stop_strings

    def __call__(self, input_ids, scores, **kwargs):
        """Return True if the stripped decoded text ends with any stop string."""
        # Special tokens are kept so markers such as "<|im_end|>" stay visible.
        decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=False)
        trimmed = decoded.strip()
        # A marker must both appear in the last ``len(marker) + 20`` characters
        # of the raw text and terminate the whitespace-stripped text.
        return any(
            marker in decoded[-(len(marker) + 20):] and trimmed.endswith(marker)
            for marker in self.stop_strings
        )
class EndpointHandler:
    """
    Custom Handler for Hugging Face Inference Endpoints.

    Ensures correct initialization of the Echo-DSRN model (always through the
    locally imported ``EchoForCausalLM``) and fixes the pad_token error by
    setting an explicit pad token id on the tokenizer.
    """

    # Generation settings used when the request carries no "parameters" entry.
    DEFAULT_PARAMETERS = {
        "max_new_tokens": 128,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
        "repetition_penalty": 1.2,
        "use_cache": False,
    }

    def __init__(self, path=""):
        """Load the model (full checkpoint or LoRA adapter) and its tokenizer.

        Args:
            path: Directory of the deployed repository. If it contains an
                ``adapter_config.json`` it is treated as a LoRA adapter: the
                base model named in the adapter config is loaded, the adapter
                is applied and merged. Otherwise ``path`` is loaded as a full
                model.
        """
        print(f"Loading Echo-DSRN from {path}...")

        # Shared loading options: bfloat16 when CUDA is available, else float32.
        load_kwargs = {
            "device_map": "auto",
            "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else torch.float32,
            "trust_remote_code": False,
        }

        # Determine if path is an adapter or a full model
        adapter_config_path = os.path.join(path, "adapter_config.json")
        tokenizer_path = path
        if os.path.exists(adapter_config_path):
            # peft is only needed for adapters, so import it lazily; full-model
            # deployments then work without peft installed.
            from peft import PeftConfig, PeftModel

            print(f"Detected LoRA adapter in {path}")
            peft_config = PeftConfig.from_pretrained(path)
            base_model_path = peft_config.base_model_name_or_path
            tokenizer_path = base_model_path  # Use base model for tokenizer
            print(f"Loading base model: {base_model_path}")
            # USE LOCAL EchoForCausalLM to ensure our fixes are active!
            base_model = EchoForCausalLM.from_pretrained(base_model_path, **load_kwargs)
            print("Applying adapter and merging...")
            adapted = PeftModel.from_pretrained(base_model, path)
            self.model = adapted.merge_and_unload()
        else:
            print(f"Loading full model: {path}")
            self.model = EchoForCausalLM.from_pretrained(path, **load_kwargs)

        print(f"Loading tokenizer from {tokenizer_path}...")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
        self.tokenizer.pad_token_id = 32000  # <|endoftext|>
        self.eos_token_ids = [32000, 32007, 32011]
        # Pre-compile stopping criteria strings matching talk.py
        self.stop_strings = ["<|im_end|>", "<|end|>", "<|user|>"]
        self.model.eval()
        print("Model and Tokenizer loaded successfully (Local Code Forced).")

    @staticmethod
    def _flatten_messages(messages):
        """Return chat messages with list-style content collapsed to plain text.

        For every message whose ``content`` is a list of parts, the ``text`` of
        the parts with ``type == "text"`` is concatenated. New dicts are built
        for changed messages, so the caller's messages are never modified.
        """
        flattened = []
        for msg in messages:
            content = msg.get("content")
            if isinstance(content, list):
                text = "".join(
                    part.get("text", "") for part in content if part.get("type") == "text"
                )
                msg = {**msg, "content": text}
            flattened.append(msg)
        return flattened

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Run text generation for one request.

        Args:
            data (:obj: `Dict`):
                - "inputs": The prompt for generation: a string, or a list of
                  chat messages that is rendered with the tokenizer's chat
                  template.
                - "parameters" (optional): Dictionary of generation parameters.
                  Besides ``generate`` keyword arguments it may carry
                  "logprobs" (any non-None value enables log-probabilities)
                  and "echo" (also score the prompt tokens). "use_cache" is
                  always forced to False.

        Returns:
            A :obj:`list`: A list containing one dict with "generated_text"
            and, when log-probabilities were requested, a "logprobs" dict with
            "tokens", "token_logprobs", "top_logprobs" and "text_offset".
        """
        inputs = data.pop("inputs", data)
        request_params = data.pop("parameters", None)
        # Work on a copy: the pops below must not alter the caller's dict or
        # the shared class-level defaults. A missing or null entry selects
        # the defaults.
        if request_params is None:
            parameters = dict(self.DEFAULT_PARAMETERS)
        else:
            parameters = dict(request_params)

        # Ensure use_cache is False even if passed
        parameters["use_cache"] = False

        # Extract special flags
        logprobs_count = parameters.pop("logprobs", None)
        echo = parameters.pop("echo", False)
        want_logprobs = logprobs_count is not None

        # Handle Chat vs Completion inputs
        if isinstance(inputs, list):
            inputs = self.tokenizer.apply_chat_template(
                self._flatten_messages(inputs), tokenize=False, add_generation_prompt=True
            )

        # Tokenize inputs
        input_tokens = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
        input_ids = input_tokens.input_ids
        input_len = input_ids.shape[1]

        # Extract special params
        eos_token_id = parameters.pop("eos_token_id", self.eos_token_ids)
        pad_token_id = parameters.pop("pad_token_id", self.tokenizer.pad_token_id)
        repetition_penalty = parameters.pop("repetition_penalty", 1.2)
        tokenizer_stop = StringStoppingCriteria(self.tokenizer, self.stop_strings)

        all_tokens = []
        all_logprobs = []
        text_offsets = []
        current_offset = 0

        # Handle Prompt Logprobs (Echo)
        if want_logprobs and echo:
            with torch.no_grad():
                logits = self.model(input_ids).logits  # (B, T, V)
                # Normalise every position once instead of once per token.
                prompt_logprobs = torch.nn.functional.log_softmax(logits[0], dim=-1)
            for i, token_id in enumerate(input_ids[0].tolist()):
                token_text = self.tokenizer.decode([token_id])
                all_tokens.append(token_text)
                text_offsets.append(current_offset)
                current_offset += len(token_text)
                if i == 0:
                    # The first token has no preceding context to score it.
                    all_logprobs.append(None)
                else:
                    # Row i - 1 holds the prediction for the token at position i.
                    all_logprobs.append(prompt_logprobs[i - 1, token_id].item())

        # Generate output
        with torch.no_grad():
            gen_out = self.model.generate(
                **input_tokens,
                eos_token_id=eos_token_id,
                pad_token_id=pad_token_id,
                repetition_penalty=repetition_penalty,
                stopping_criteria=StoppingCriteriaList([tokenizer_stop]),
                output_scores=want_logprobs,
                return_dict_in_generate=want_logprobs,
                **parameters,
            )

        if not want_logprobs:
            decoded_output = self.tokenizer.decode(
                gen_out[0][input_len:], skip_special_tokens=True
            )
            return [{"generated_text": decoded_output}]

        # NOTE(review): `scores` may already reflect repetition penalty and
        # sampling adjustments rather than raw model logits - confirm intent.
        scores = gen_out.scores  # list of (B, V) tensors
        generated_ids = gen_out.sequences[0, input_len:]

        # Process generated tokens
        for i, token_id in enumerate(generated_ids.tolist()):
            token_text = self.tokenizer.decode([token_id])
            all_tokens.append(token_text)
            lp = torch.nn.functional.log_softmax(scores[i][0, :], dim=-1)
            all_logprobs.append(lp[token_id].item())
            text_offsets.append(current_offset)
            current_offset += len(token_text)

        decoded_output = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        logprobs_dict = {
            "tokens": all_tokens,
            "token_logprobs": all_logprobs,
            # The requested count only switches log-probabilities on; no
            # per-token alternatives are produced.
            "top_logprobs": [],
            "text_offset": text_offsets,
        }
        return [{"generated_text": decoded_output, "logprobs": logprobs_dict}]