arthu1 committed
Commit 2b2f18c · 1 Parent(s): 77b73a9

North Air 1 API — Instance 2 (load-balanced replica)

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,15 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+ENV PYTHONUNBUFFERED=1 \
+    PORT=7860
+
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+
+COPY . /app
+
+EXPOSE 7860
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,11 @@
 ---
-title: North Air Api 2
-emoji: 📚
-colorFrom: green
+title: North Air API 2
+emoji: 🌬️
+colorFrom: blue
 colorTo: green
 sdk: docker
-pinned: false
+app_port: 7860
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# North Air 1 API Instance 2
+Load-balanced replica.
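
For reference, a minimal non-streaming call against this replica could look like the sketch below. The base URL is a placeholder (each Space gets its own hostname), `requests` is just a convenient HTTP client rather than a dependency of this repo, and the request/response fields mirror the `/chat` endpoint defined in `app.py` further down.

```python
import requests  # any HTTP client works; not part of this repo's requirements

BASE_URL = "http://localhost:7860"  # placeholder -- substitute the Space's URL

resp = requests.post(
    f"{BASE_URL}/chat",
    json={
        "messages": [{"role": "user", "content": "Who are you?"}],
        "max_new_tokens": 128,
    },
    timeout=120,
)
resp.raise_for_status()
body = resp.json()
print(body["output"])      # final answer, with any <think> block stripped
print(body["latency_ms"])  # server-side generation time in milliseconds
```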
app.py ADDED
@@ -0,0 +1,326 @@
+import os
+import re
+import time
+import json
+from typing import List, Optional
+from threading import Thread
+
+import torch
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+from transformers import AutoTokenizer, TextIteratorStreamer
+
+MODEL_DIR = os.getenv("MODEL_DIR", "./final_model")
+MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
+TEMPERATURE = float(os.getenv("TEMPERATURE", "0.6"))
+TOP_P = float(os.getenv("TOP_P", "0.85"))
+
+SYSTEM_PROMPT = """You are North Air 1, built by North Air: a custom 0.6B-parameter model designed for helpful, concise responses.
+Be direct, helpful, concise. Use markdown. Write clean code. Never fabricate facts.
+If asked who you are: "I'm North Air 1, built by North Air." You are NOT ChatGPT/GPT-4/Claude/etc."""
+
+
+class Message(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    messages: List[Message]
+    model: Optional[str] = "north-air-1"
+    max_new_tokens: Optional[int] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    system_prompt: Optional[str] = None
+    stream: Optional[bool] = False
+    enable_thinking: Optional[bool] = False
+
+
+app = FastAPI(title="North Air 1 API", version="4.0.0")
+
+# ─── Model Loading: try ONNX first (fast), fall back to PyTorch ───
+ONNX_SESSION = None
+MODEL = None
+TOKENIZER = None
+LOAD_ERROR = None
+INFERENCE_MODE = "pytorch"  # or "onnx"
+
+
+def _try_load_onnx():
+    """Try to load an ONNX Runtime quantized model for 2-4x faster CPU inference."""
+    global ONNX_SESSION, INFERENCE_MODE
+    onnx_path = os.path.join(MODEL_DIR, "model_quantized.onnx")
+    if not os.path.exists(onnx_path):
+        onnx_path = os.path.join(MODEL_DIR, "model.onnx")
+    if not os.path.exists(onnx_path):
+        return False
+
+    try:
+        import onnxruntime as ort
+        sess_options = ort.SessionOptions()
+        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        sess_options.intra_op_num_threads = 4
+        sess_options.inter_op_num_threads = 2
+        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+
+        ONNX_SESSION = ort.InferenceSession(
+            onnx_path, sess_options,
+            providers=["CPUExecutionProvider"],
+        )
+        INFERENCE_MODE = "onnx"
+        print(f"ONNX Runtime loaded: {onnx_path}")
+        return True
+    except Exception as e:
+        print(f"ONNX load failed: {e}")
+        return False
+
+
+def _load_model():
+    """Load model — ONNX quantized if available, else PyTorch."""
+    global MODEL, TOKENIZER, LOAD_ERROR, INFERENCE_MODE
+
+    try:
+        TOKENIZER = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True, trust_remote_code=True)
+        if TOKENIZER.pad_token is None:
+            TOKENIZER.pad_token = TOKENIZER.eos_token
+    except Exception as e:
+        LOAD_ERROR = f"Tokenizer load failed: {e}"
+        return
+
+    # Try ONNX first
+    if _try_load_onnx():
+        print(f"Using ONNX Runtime ({INFERENCE_MODE})")
+        return
+
+    # Fallback: PyTorch with optimizations
+    try:
+        from transformers import AutoModelForCausalLM
+        adapter_cfg = os.path.join(MODEL_DIR, "adapter_config.json")
+
+        if os.path.exists(adapter_cfg):
+            from peft import AutoPeftModelForCausalLM
+            MODEL = AutoPeftModelForCausalLM.from_pretrained(
+                MODEL_DIR, torch_dtype=torch.float32, device_map={"": "cpu"},
+            )
+        else:
+            MODEL = AutoModelForCausalLM.from_pretrained(
+                MODEL_DIR, torch_dtype=torch.float32, device_map={"": "cpu"},
+                trust_remote_code=True,
+            )
+
+        MODEL.eval()
+
+        # Dynamic INT8 quantization — only for non-PEFT models:
+        # PEFT/LoRA models break with quantize_dynamic due to adapter wrapping.
+        if not os.path.exists(adapter_cfg):
+            try:
+                MODEL = torch.quantization.quantize_dynamic(
+                    MODEL, {torch.nn.Linear}, dtype=torch.qint8,
+                )
+                INFERENCE_MODE = "pytorch-int8"
+                print("PyTorch dynamic INT8 quantization applied")
+            except Exception as e:
+                INFERENCE_MODE = "pytorch"
+                print(f"Quantization skipped: {e}")
+        else:
+            INFERENCE_MODE = "pytorch"
+            print("PEFT model detected — skipping quantization (incompatible)")
+
+        print(f"Model loaded: {INFERENCE_MODE}")
+
+    except Exception as e:
+        LOAD_ERROR = str(e)
+
+
+_load_model()
+
+
+@app.get("/health")
+def health():
+    ok = (MODEL is not None) or (ONNX_SESSION is not None)
+    return {
+        "ok": ok,
+        "model": "north-air-1",
+        "version": "4.0.0",
+        "architecture": "Qwen3-0.6B + LoRA r=64",
+        "inference": INFERENCE_MODE,
+        "features": ["streaming", "thinking", "quantized"],
+        "model_dir": MODEL_DIR,
+        "error": LOAD_ERROR,
+    }
+
+
+def _build_prompt(messages: list, system: str, enable_thinking: bool) -> str:
+    has_system = any(m["role"] == "system" for m in messages)
+    if not has_system:
+        messages = [{"role": "system", "content": system}] + messages
+
+    if hasattr(TOKENIZER, "apply_chat_template"):
+        return TOKENIZER.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True,
+            enable_thinking=enable_thinking,
+        )
+    return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
+
+
+def _parse_thinking(text: str) -> tuple:
+    think_match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
+    if think_match:
+        thinking = think_match.group(1).strip()
+        answer = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
+        return thinking, answer
+    return "", text
+
+
+def _generation_kwargs(input_ids, attention_mask, max_new_tokens, temperature, top_p, **extra):
+    return {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "max_new_tokens": max_new_tokens,
+        "temperature": max(temperature, 0.01),
+        "top_p": top_p,
+        "top_k": 40,
+        "do_sample": True,
+        "repetition_penalty": 1.2,
+        "pad_token_id": TOKENIZER.pad_token_id,
+        "eos_token_id": TOKENIZER.eos_token_id,
+        **extra,
+    }
+
+
+def _check_model():
+    # Generation always runs through MODEL.generate(); the ONNX session is a
+    # load-time fast path with no decode loop yet, so the PyTorch model must
+    # be present for the chat endpoints to work.
+    if MODEL is None:
+        raise HTTPException(status_code=503, detail=f"Model unavailable for generation (mode={INFERENCE_MODE}): {LOAD_ERROR}")
+    if TOKENIZER is None:
+        raise HTTPException(status_code=500, detail=f"Tokenizer failed to load: {LOAD_ERROR}")
+
+
+def _prepare_request(req: ChatRequest):
+    system = req.system_prompt or SYSTEM_PROMPT
+    messages = [{"role": m.role, "content": m.content} for m in req.messages]
+    enable_thinking = req.enable_thinking if req.enable_thinking is not None else False
+
+    prompt = _build_prompt(messages, system, enable_thinking)
+    batch = TOKENIZER(prompt, return_tensors="pt", add_special_tokens=False)
+
+    max_new_tokens = req.max_new_tokens or MAX_NEW_TOKENS
+    temperature = req.temperature if req.temperature is not None else TEMPERATURE
+    top_p = req.top_p if req.top_p is not None else TOP_P
+
+    return batch, max_new_tokens, temperature, top_p
+
+
+@app.post("/chat")
+def chat(req: ChatRequest):
+    _check_model()
+
+    if not req.messages:
+        raise HTTPException(status_code=400, detail="messages are required")
+
+    if req.stream:
+        return chat_stream(req)
+
+    batch, max_new_tokens, temperature, top_p = _prepare_request(req)
+    input_ids = batch["input_ids"]
+    attention_mask = batch["attention_mask"]
+
+    t0 = time.time()
+
+    with torch.no_grad():
+        out = MODEL.generate(
+            **_generation_kwargs(input_ids, attention_mask, max_new_tokens, temperature, top_p)
+        )
+
+    elapsed = time.time() - t0
+    generated_ids = out[0][input_ids.shape[1]:]
+    completion = TOKENIZER.decode(generated_ids, skip_special_tokens=True).strip()
+    thinking, answer = _parse_thinking(completion)
+
+    return {
+        "output": answer,
+        "thinking": thinking if thinking else None,
+        "model": "north-air-1",
+        "inference": INFERENCE_MODE,
+        "tokens_generated": len(generated_ids),
+        "latency_ms": round(elapsed * 1000),
+    }
+
+
+@app.post("/chat/stream")
+def chat_stream(req: ChatRequest):
+    _check_model()
+
+    if not req.messages:
+        raise HTTPException(status_code=400, detail="messages are required")
+
+    batch, max_new_tokens, temperature, top_p = _prepare_request(req)
+    input_ids = batch["input_ids"]
+    attention_mask = batch["attention_mask"]
+
+    streamer = TextIteratorStreamer(TOKENIZER, skip_prompt=True, skip_special_tokens=True)
+
+    gen_kwargs = _generation_kwargs(
+        input_ids, attention_mask, max_new_tokens, temperature, top_p,
+        streamer=streamer,
+    )
+
+    t0 = time.time()
+    thread = Thread(target=_generate_in_thread, args=(gen_kwargs,))
+    thread.start()
+
+    def event_stream():
+        token_count = 0
+        in_thinking = False
+        buf = ""
+
+        for token_text in streamer:
+            buf += token_text
+            token_count += 1
+
+            if "<think>" in buf and not in_thinking:
+                in_thinking = True
+                yield f"data: {json.dumps({'type': 'thinking_start'})}\n\n"
+                after = buf.split("<think>", 1)[1]
+                buf = after if after else ""
+
+            if "</think>" in buf and in_thinking:
+                before = buf.split("</think>", 1)[0]
+                if before:
+                    yield f"data: {json.dumps({'type': 'thinking', 'text': before})}\n\n"
+                in_thinking = False
+                yield f"data: {json.dumps({'type': 'thinking_end'})}\n\n"
+                after = buf.split("</think>", 1)[1].lstrip()
+                buf = ""
+                if after:
+                    yield f"data: {json.dumps({'type': 'text', 'text': after})}\n\n"
+                continue
+
+            # Hold the buffer back when it ends with a prefix of a think tag,
+            # so a tag split across tokens is never emitted as visible text.
+            partial_open = "<think"
+            partial_close = "</think"
+            if not in_thinking and buf.endswith(tuple(partial_open[:i] for i in range(1, len(partial_open) + 1))):
+                continue
+            if in_thinking and buf.endswith(tuple(partial_close[:i] for i in range(1, len(partial_close) + 1))):
+                continue
+
+            if buf:
+                evt_type = "thinking" if in_thinking else "text"
+                yield f"data: {json.dumps({'type': evt_type, 'text': buf})}\n\n"
+                buf = ""
+
+        if buf:
+            evt_type = "thinking" if in_thinking else "text"
+            yield f"data: {json.dumps({'type': evt_type, 'text': buf})}\n\n"
+        if in_thinking:
+            yield f"data: {json.dumps({'type': 'thinking_end'})}\n\n"
+
+        thread.join()
+        elapsed = time.time() - t0
+        yield f"data: {json.dumps({'type': 'done', 'tokens_generated': token_count, 'latency_ms': round(elapsed * 1000), 'inference': INFERENCE_MODE})}\n\n"
+
+    return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+
+def _generate_in_thread(kwargs):
+    with torch.no_grad():
+        MODEL.generate(**kwargs)
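
The `/chat/stream` endpoint above frames its output as server-sent events whose `type` values are `thinking_start`, `thinking`, `thinking_end`, `text`, and `done`. A minimal stdlib-only client sketch (base URL again a placeholder) that prints answer tokens as they arrive:

```python
import json
import urllib.request

BASE_URL = "http://localhost:7860"  # placeholder -- substitute the Space's URL

req = urllib.request.Request(
    f"{BASE_URL}/chat/stream",
    data=json.dumps({
        "messages": [{"role": "user", "content": "Explain LoRA in one paragraph."}],
        "enable_thinking": True,
    }).encode(),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    for raw in resp:  # the server yields one "data: {...}" line per event
        line = raw.decode("utf-8").strip()
        if not line.startswith("data: "):
            continue  # skip the blank separator lines between SSE frames
        evt = json.loads(line[len("data: "):])
        if evt["type"] == "text":
            print(evt["text"], end="", flush=True)
        elif evt["type"] == "done":
            print(f"\n[{evt['tokens_generated']} tokens in {evt['latency_ms']} ms]")
```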
final_model/README.md ADDED
@@ -0,0 +1,207 @@
+---
+base_model: Qwen/Qwen3-0.6B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen3-0.6B
+- lora
+- transformers
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+### Framework versions
+
+- PEFT 0.18.1
final_model/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a71d98c372775193e6199d70ffd998eabb5c8afc4d27f416dfd15faa81be0227
+size 1047
final_model/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:764e42d66fb3bbca0fc6856001d844a41003a9fef1dd4d174e65d25507bc7462
+size 161533160
final_model/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
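
This is the stock Qwen3 chat template. The `enable_thinking` flag that `app.py` forwards through `apply_chat_template` lands in the final `add_generation_prompt` branch: when thinking is disabled, the template pre-fills an empty `<think>\n\n</think>` block so the model skips straight to the answer. A quick way to inspect the rendered prompt, assuming the `final_model/` checkpoint is available locally:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./final_model", trust_remote_code=True)
msgs = [{"role": "user", "content": "hi"}]

# enable_thinking=False appends the empty <think></think> block defined at the
# end of the template; with True (or unset) the model is free to emit its own.
print(tok.apply_chat_template(
    msgs, tokenize=False, add_generation_prompt=True, enable_thinking=False,
))
```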
final_model/config-2.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e2352a177becce1e5e426d29523f133a7a6168cd19d2c77cfd8a8dda875a738
+size 1145
final_model/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f34c44eb123a0f78dbc782f08e5543c2073ed4208e5f8ef2f3bf13c19b1d079d
+size 11422748
final_model/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47bfa3e7727312946b29ac10d6dd0672d63cf7815b2a160b9523872040d2e536
+size 665
requirements.txt ADDED
@@ -0,0 +1,9 @@
+fastapi==0.115.0
+uvicorn[standard]==0.30.6
+pydantic==2.9.2
+torch>=2.2.0
+transformers>=4.45.0
+peft>=0.12.0
+accelerate>=0.34.2
+sentencepiece>=0.2.0
+safetensors>=0.4.5