Instructions to use ethicalabs/Echo-DSRN-114M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use ethicalabs/Echo-DSRN-114M with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ethicalabs/Echo-DSRN-114M", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("ethicalabs/Echo-DSRN-114M", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use ethicalabs/Echo-DSRN-114M with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "ethicalabs/Echo-DSRN-114M"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ethicalabs/Echo-DSRN-114M",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/ethicalabs/Echo-DSRN-114M

SGLang

How to use ethicalabs/Echo-DSRN-114M with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "ethicalabs/Echo-DSRN-114M" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ethicalabs/Echo-DSRN-114M",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "ethicalabs/Echo-DSRN-114M" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "ethicalabs/Echo-DSRN-114M",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use ethicalabs/Echo-DSRN-114M with Docker Model Runner:
```
docker model run hf.co/ethicalabs/Echo-DSRN-114M
```

Echo-DSRN-114M / handler.py

mrs83

Upload 5 files

f80e336 verified about 2 months ago

raw

history blame contribute delete

8.25 kB

	import os
	from typing import Any, Dict, List

	import torch
	from transformers import (
	AutoConfig,
	AutoModelForCausalLM,
	AutoTokenizer,
	StoppingCriteria,
	StoppingCriteriaList,
	)

	from .modeling_echo import EchoConfig, EchoForCausalLM

	# Register the local Echo classes (imported from .modeling_echo) with the
	# transformers Auto* factories: the "echo" model type is mapped to EchoConfig,
	# and EchoConfig is mapped to EchoForCausalLM, so the local architecture is
	# used in place of remote code.
	AutoConfig.register("echo", EchoConfig)
	AutoModelForCausalLM.register(EchoConfig, EchoForCausalLM)


	class StringStoppingCriteria(StoppingCriteria):
	    """Stopping criterion that fires when the decoded text ends with a stop string.

	    Only the first sequence of the batch (``input_ids[0]``) is inspected, and it
	    is decoded with special tokens kept so that marker tokens remain visible.
	    """

	    def __init__(self, tokenizer, stop_strings):
	        self.tokenizer = tokenizer
	        self.stop_strings = stop_strings

	    def __call__(self, input_ids, scores, **kwargs):
	        decoded = self.tokenizer.decode(input_ids[0], skip_special_tokens=False)
	        trimmed = decoded.strip()
	        # A marker counts only if it appears in the tail window of the raw text
	        # AND the whitespace-stripped text ends with it.
	        return any(
	            marker in decoded[-(len(marker) + 20):] and trimmed.endswith(marker)
	            for marker in self.stop_strings
	        )


	class EndpointHandler:
	    """
	    Custom Handler for Hugging Face Inference Endpoints.
	    Ensures correct initialization of the Echo-DSRN model and fixes the pad_token error.

	    The handler loads either a full Echo checkpoint or a LoRA adapter (merged into
	    its base model at load time) and answers text-generation requests, optionally
	    returning per-token log-probabilities.
	    """

	    # Generation settings used when the request carries no "parameters" entry.
	    DEFAULT_GENERATION_PARAMETERS = {
	        "max_new_tokens": 128,
	        "temperature": 0.7,
	        "top_p": 0.9,
	        "do_sample": True,
	        "repetition_penalty": 1.2,
	        "use_cache": False,
	    }

	    def __init__(self, path=""):
	        """Load the model and tokenizer found at ``path``.

	        Args:
	            path: Directory holding either a full model or a LoRA adapter
	                (detected by the presence of ``adapter_config.json``).
	        """
	        print(f"Loading Echo-DSRN from {path}...")

	        adapter_config_path = os.path.join(path, "adapter_config.json")
	        tokenizer_path = path

	        if os.path.exists(adapter_config_path):
	            # peft is only needed for adapter checkpoints, so import it lazily;
	            # full-model deployments then work without peft installed.
	            from peft import PeftConfig, PeftModel

	            print(f"Detected LoRA adapter in {path}")
	            peft_config = PeftConfig.from_pretrained(path)
	            base_model_path = peft_config.base_model_name_or_path
	            tokenizer_path = base_model_path  # Use base model for tokenizer
	            print(f"Loading base model: {base_model_path}")

	            model = self._load_echo_model(base_model_path)
	            print("Applying adapter and merging...")
	            model = PeftModel.from_pretrained(model, path)
	            self.model = model.merge_and_unload()
	        else:
	            print(f"Loading full model: {path}")
	            self.model = self._load_echo_model(path)

	        print(f"Loading tokenizer from {tokenizer_path}...")
	        # NOTE(review): the tokenizer is loaded with trust_remote_code=True while
	        # the model is loaded with trust_remote_code=False -- confirm this is intended.
	        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
	        self.tokenizer.pad_token_id = 32000  # <|endoftext|>
	        self.eos_token_ids = [32000, 32007, 32011]

	        # Pre-compile stopping criteria strings matching talk.py
	        self.stop_strings = ["<|im_end|>", "<|end|>", "<|user|>"]

	        self.model.eval()
	        print("Model and Tokenizer loaded successfully (Local Code Forced).")

	    @staticmethod
	    def _load_echo_model(model_path):
	        """Load ``model_path`` with the local EchoForCausalLM class (never remote code)."""
	        return EchoForCausalLM.from_pretrained(
	            model_path,
	            device_map="auto",
	            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
	            trust_remote_code=False,
	        )

	    @classmethod
	    def _resolve_parameters(cls, data):
	        """Pop "parameters" from ``data`` and return a private copy with use_cache off."""
	        parameters = data.pop("parameters", None)
	        # Copy so that later pops/assignments never touch the caller's dict or
	        # the class-level defaults; an explicit None falls back to the defaults.
	        parameters = dict(cls.DEFAULT_GENERATION_PARAMETERS if parameters is None else parameters)
	        # Ensure use_cache is False even if passed
	        parameters["use_cache"] = False
	        return parameters

	    @staticmethod
	    def _flatten_chat_content(messages):
	        """Return messages whose list-style content is collapsed to its text parts."""
	        flattened = []
	        for msg in messages:
	            content = msg.get("content")
	            if isinstance(content, list):
	                text = "".join(
	                    part.get("text", "") for part in content if part.get("type") == "text"
	                )
	                # Build a new dict instead of rewriting the caller's message.
	                msg = {**msg, "content": text}
	            flattened.append(msg)
	        return flattened

	    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
	        """
	        Run text generation for one request.

	        Args:
	            data (:obj: `Dict`):
	                - "inputs": The prompt for generation, either a string or a list
	                  of chat messages (rendered through the tokenizer chat template).
	                - "parameters" (optional): Dictionary of generation parameters.
	                  The special keys "logprobs" and "echo" are consumed here and
	                  not forwarded to ``generate``; "use_cache" is always forced
	                  to False.
	        Returns:
	            A :obj:`list`: A single-item list holding "generated_text" and, when
	            "logprobs" was requested, a "logprobs" dict with "tokens",
	            "token_logprobs", "top_logprobs" (always empty) and "text_offset".
	        """
	        inputs = data.pop("inputs", data)
	        parameters = self._resolve_parameters(data)

	        # Extract special flags
	        logprobs_count = parameters.pop("logprobs", None)
	        echo = parameters.pop("echo", False)
	        want_logprobs = logprobs_count is not None

	        # Handle Chat vs Completion inputs
	        if isinstance(inputs, list):
	            inputs = self.tokenizer.apply_chat_template(
	                self._flatten_chat_content(inputs),
	                tokenize=False,
	                add_generation_prompt=True,
	            )

	        # Tokenize inputs
	        input_tokens = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)
	        input_ids = input_tokens.input_ids
	        input_len = input_ids.shape[1]

	        # Extract special params
	        eos_token_id = parameters.pop("eos_token_id", self.eos_token_ids)
	        pad_token_id = parameters.pop("pad_token_id", self.tokenizer.pad_token_id)
	        repetition_penalty = parameters.pop("repetition_penalty", 1.2)

	        tokenizer_stop = StringStoppingCriteria(self.tokenizer, self.stop_strings)

	        all_tokens = []
	        all_logprobs = []
	        text_offsets = []
	        current_offset = 0

	        # Handle Prompt Logprobs (Echo)
	        if want_logprobs and echo:
	            with torch.no_grad():
	                logits = self.model(input_ids).logits  # (B, T, V)

	            # Row i - 1 of the logits scores prompt token i, so normalise every
	            # row once here instead of once per token inside the loop.
	            prompt_log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
	            for i, token_id in enumerate(input_ids[0].tolist()):
	                token_text = self.tokenizer.decode([token_id])
	                all_tokens.append(token_text)
	                text_offsets.append(current_offset)
	                current_offset += len(token_text)
	                # The first prompt token has no preceding context to score it.
	                all_logprobs.append(
	                    None if i == 0 else prompt_log_probs[i - 1, token_id].item()
	                )

	        # Generate output
	        with torch.no_grad():
	            gen_out = self.model.generate(
	                **input_tokens,
	                eos_token_id=eos_token_id,
	                pad_token_id=pad_token_id,
	                repetition_penalty=repetition_penalty,
	                stopping_criteria=StoppingCriteriaList([tokenizer_stop]),
	                output_scores=want_logprobs,
	                return_dict_in_generate=want_logprobs,
	                **parameters,
	            )

	        if not want_logprobs:
	            # Without return_dict_in_generate, generate returns the id tensor itself.
	            decoded_output = self.tokenizer.decode(
	                gen_out[0][input_len:], skip_special_tokens=True
	            )
	            return [{"generated_text": decoded_output}]

	        # Process generated tokens; scores holds one (B, V) tensor per step.
	        generated_ids = gen_out.sequences[0, input_len:]
	        for token_id, step_scores in zip(generated_ids.tolist(), gen_out.scores):
	            token_text = self.tokenizer.decode([token_id])
	            all_tokens.append(token_text)

	            step_log_probs = torch.nn.functional.log_softmax(step_scores[0, :], dim=-1)
	            all_logprobs.append(step_log_probs[token_id].item())

	            text_offsets.append(current_offset)
	            current_offset += len(token_text)

	        decoded_output = self.tokenizer.decode(generated_ids, skip_special_tokens=True)

	        logprobs_dict = {
	            "tokens": all_tokens,
	            "token_logprobs": all_logprobs,
	            "top_logprobs": [],  # not populated by this handler
	            "text_offset": text_offsets,
	        }

	        return [{"generated_text": decoded_output, "logprobs": logprobs_dict}]