Fredtt3 committed f761fb7 (verified) · 1 Parent(s): 4edfecb

Update README.md

Files changed (1): README.md (+135, -1)
README.md CHANGED

<h1 align="center">Athenea-4B-Math</h1>

![image](atheneamodel.png)

## 💻 Usage

### Installation

```bash
uv pip install transformers torch accelerate
```
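
The Basic Inference snippet below opts into FlashAttention 2, which the command above does not install. If you want that path, install it separately (the standard flash-attn install, assuming a CUDA environment with a compatible GPU; torch must already be installed):

```bash
uv pip install flash-attn --no-build-isolation
```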

### Basic Inference

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model = AutoModelForCausalLM.from_pretrained(
    "Aquiles-ai/Athenea-4B-Math",
    dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    attn_implementation="flash_attention_2",  # Requires flash-attn
)

# Without flash-attn:
# model = AutoModelForCausalLM.from_pretrained(
#     "Aquiles-ai/Athenea-4B-Math",
#     dtype="auto",
#     device_map="auto",
# )

tokenizer = AutoTokenizer.from_pretrained("Aquiles-ai/Athenea-4B-Math", trust_remote_code=True)

messages = [
    {"role": "user", "content": "Hey, find the derivative of 3x^4 - 2x^2 + 5x - 7"}
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)  # Follow whatever device device_map chose

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=8092,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens, skipping the echoed prompt
print(tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```
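
By default, `generate` follows the repository's `generation_config`. If you want explicit control over decoding, the standard sampling parameters apply; a minimal sketch reusing `model`, `tokenizer`, and `inputs` from the snippet above (the values are illustrative, not recommendations from the model card):

```python
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=8092,
        do_sample=True,    # sample instead of greedy decoding
        temperature=0.7,   # illustrative value, not tuned for this model
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
```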

### Streaming Inference

```python
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model = AutoModelForCausalLM.from_pretrained(
    "Aquiles-ai/Athenea-4B-Math",
    dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
tokenizer = AutoTokenizer.from_pretrained("Aquiles-ai/Athenea-4B-Math", trust_remote_code=True)

messages = [
    {"role": "user", "content": "Hey, find the derivative of x^2(3x + 1) using the product rule."}
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# The streamer yields decoded text chunks as they are generated
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Build kwargs for generate
generate_kwargs = dict(
    **inputs,
    max_new_tokens=8092,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    streamer=streamer,
)

def _generate_thread(model, kwargs):
    with torch.no_grad():
        model.generate(**kwargs)

# Run generation in a background thread so the main thread can consume the stream
thread = Thread(target=_generate_thread, args=(model, generate_kwargs))
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)

thread.join()  # Wait for generation to finish
```

### Production Deployment with vLLM

**Start server:**

```bash
vllm serve Aquiles-ai/Athenea-4B-Math \
    --host 0.0.0.0 \
    --port 8000 \
    --api-key dummyapikey \
    --max-model-len=16384 \
    --async-scheduling \
    --gpu-memory-utilization=0.90
```
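
Before wiring in a client, you can confirm the server is up. vLLM exposes the standard OpenAI-compatible endpoints, so listing models works as a quick health check (the bearer token matches the `--api-key` flag above):

```bash
curl http://127.0.0.1:8000/v1/models \
    -H "Authorization: Bearer dummyapikey"
```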

**Request to the server from the OpenAI client:**

```python
from openai import OpenAI

client = OpenAI(api_key="dummyapikey", base_url="http://127.0.0.1:8000/v1")

stream = client.chat.completions.create(
    model="Aquiles-ai/Athenea-4B-Math",
    messages=[{
        "role": "user",
        "content": "Hey, find the indefinite integral of 4x^3 - 2x + 7"
    }],
    max_tokens=8092,
    stream=True,
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```

**vLLM Benefits:** up to 20-30x faster inference than the plain `transformers` loop under concurrent load, an OpenAI-compatible API, continuous batching, and async scheduling.