somebody-to-love and Claude Opus 4.6 (1M context) committed
Commit e95edec · Parent: 8297984

Update usage docs: replace AutoModel with working safetensors inference


- Replace non-functional AutoModelForCausalLM example with direct safetensors loading
- Add GGUF/Ollama incompatibility notice (Mamba-2 hybrid architecture)
- Add evafrill_runner.py alternative method with frankenstallm_test link
- Add prerequisites (source clone, pip install) and system requirements
- Update both Korean and English sections
- Fix slerp/README.md usage code to match actual loading pattern

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (2)
  1. README.md +130 -38
  2. slerp/README.md +53 -5
README.md CHANGED

@@ -179,37 +179,83 @@ ORPO's weakness: only 10K steps of training vs SFT's 65K — insufficient base i

 ### Usage

 ```python
 import torch
 from model.transformer import LLM
 from tokenizers import Tokenizer

-# Custom architecture: clone the repository before use
-# git clone https://github.com/pathcosmos/EVAFRILL-Mo
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-model = LLM.from_pretrained("hf_export/slerp")
-model = model.to(device=device, dtype=torch.bfloat16)
-model.eval()
-
-tok = Tokenizer.from_file("tokenizer/korean_sp/tokenizer.json")
-
-prompt = "인공지능이란 무엇인가요?"
-ids = tok.encode(prompt).ids
-input_ids = torch.tensor([ids], device=device)
-
-with torch.no_grad():
-    output = model.generate(
-        input_ids,
-        max_new_tokens=256,
-        temperature=0.7,
-        repetition_penalty=1.2,
-    )
-
-print(tok.decode(output[0].tolist()))
 ```

 ### Reproducibility

 | Path | Contents |
@@ -356,37 +402,83 @@ ORPO's weakness: only 10K steps of training vs SFT's 65K — insufficient base i

 ### Usage

 ```python
 import torch
 from model.transformer import LLM
 from tokenizers import Tokenizer

-# Requires cloning the repository (custom architecture — not loadable via AutoModel)
-# git clone https://github.com/pathcosmos/EVAFRILL-Mo
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-model = LLM.from_pretrained("hf_export/slerp")
-model = model.to(device=device, dtype=torch.bfloat16)
-model.eval()
-
-tok = Tokenizer.from_file("tokenizer/korean_sp/tokenizer.json")
-
-prompt = "What is artificial intelligence?"
-ids = tok.encode(prompt).ids
-input_ids = torch.tensor([ids], device=device)
-
-with torch.no_grad():
-    output = model.generate(
-        input_ids,
-        max_new_tokens=256,
-        temperature=0.7,
-        repetition_penalty=1.2,
-    )
-
-print(tok.decode(output[0].tolist()))
 ```

 ### Reproducibility

 | Path | Contents |
 ### Usage

+> **GGUF/Ollama not supported**: the custom Mamba-2 hybrid architecture is incompatible with llama.cpp/GGUF/Ollama. Only direct PyTorch inference is possible.
+
+**Prerequisites:**
+
+```bash
+# 1. Clone the source code (custom architecture modules required)
+git clone https://github.com/pathcosmos/EVAFRILL-Mo
+cd EVAFRILL-Mo
+
+# 2. Install dependencies
+pip install torch safetensors tokenizers PyYAML
+```
+
+**Method 1: Direct safetensors loading (recommended)**
+
 ```python
+import json
 import torch
+from model.config import LMConfig
 from model.transformer import LLM
 from tokenizers import Tokenizer
+from safetensors.torch import load_file as load_safetensors
+
+CKPT = "path/to/EVAFRILL-Mo-3B/slerp"  # the slerp/ directory of this repo
+
+# Load config & model
+with open(f"{CKPT}/config.json") as f:
+    data = json.load(f)
+for k in ("model_type", "architectures", "_variant", "_description"):
+    data.pop(k, None)
+cfg = LMConfig(**data)
+cfg.use_flash_attn = False
+
+model = LLM(cfg)
+state = load_safetensors(f"{CKPT}/model.safetensors", device="cpu")
+model.load_state_dict(state, strict=False)
+model = model.to(device="cuda:0", dtype=torch.bfloat16)
+model.eval()

+tok = Tokenizer.from_file(f"{CKPT}/tokenizer.json")

+# Generation (recommended: temp=0.7, rep_penalty=1.2)
+prompt = "<|user|>\n인공지능이란 무엇인가요?\n<|assistant|>\n"
+ids = torch.tensor([tok.encode(prompt).ids], device="cuda:0")

+with torch.no_grad():
+    for _ in range(256):
+        logits, _ = model(ids)
+        logits = logits[:, -1, :].float()
+        for prev_id in set(ids[0].tolist()):
+            if logits[0, prev_id] > 0: logits[0, prev_id] /= 1.2
+            else: logits[0, prev_id] *= 1.2
+        probs = torch.softmax(logits / 0.7, dim=-1)
+        next_id = torch.multinomial(probs, 1)
+        ids = torch.cat([ids, next_id], dim=1)
+        if next_id.item() == tok.token_to_id("</s>"): break
+
+print(tok.decode(ids[0].tolist()))
+```

+**Method 2: Using the evaluation framework runner**

+`evafrill_runner.py` from [frankenstallm_test](https://github.com/pathcosmos/frankenstallm_test) wraps this process:

+```python
+from eval_framework.evafrill_runner import generate, unload_model
+
+result = generate("한국어로 인사해주세요.")
+print(result["response"])
+print(f"속도: {result['tokens_per_sec']:.1f} TPS")
+unload_model()
 ```

+> Setup: see the [frankenstallm_test README](https://github.com/pathcosmos/frankenstallm_test#evafrill-mo-모델-설정-pytorch-직접-추론)
+
+**System requirements**: 8GB+ GPU VRAM (BF16); CPU inference is possible but extremely slow (~0.5 TPS)
+
 ### Reproducibility

 | Path | Contents |
 ### Usage

+> **GGUF/Ollama not supported**: Custom Mamba-2 hybrid architecture is incompatible with llama.cpp/GGUF/Ollama. PyTorch direct inference only.
+
+**Prerequisites:**
+
+```bash
+# 1. Clone source code (custom architecture modules required)
+git clone https://github.com/pathcosmos/EVAFRILL-Mo
+cd EVAFRILL-Mo
+
+# 2. Install dependencies
+pip install torch safetensors tokenizers PyYAML
+```
+
+**Method 1: Direct safetensors loading (recommended)**
+
 ```python
+import json
 import torch
+from model.config import LMConfig
 from model.transformer import LLM
 from tokenizers import Tokenizer
+from safetensors.torch import load_file as load_safetensors
+
+CKPT = "path/to/EVAFRILL-Mo-3B/slerp"  # slerp/ directory of this repo
+
+# Load config & model
+with open(f"{CKPT}/config.json") as f:
+    data = json.load(f)
+for k in ("model_type", "architectures", "_variant", "_description"):
+    data.pop(k, None)
+cfg = LMConfig(**data)
+cfg.use_flash_attn = False
+
+model = LLM(cfg)
+state = load_safetensors(f"{CKPT}/model.safetensors", device="cpu")
+model.load_state_dict(state, strict=False)
+model = model.to(device="cuda:0", dtype=torch.bfloat16)
+model.eval()

+tok = Tokenizer.from_file(f"{CKPT}/tokenizer.json")

+# Generate (recommended: temp=0.7, rep_penalty=1.2)
+prompt = "<|user|>\nWhat is artificial intelligence?\n<|assistant|>\n"
+ids = torch.tensor([tok.encode(prompt).ids], device="cuda:0")

+with torch.no_grad():
+    for _ in range(256):
+        logits, _ = model(ids)
+        logits = logits[:, -1, :].float()
+        for prev_id in set(ids[0].tolist()):
+            if logits[0, prev_id] > 0: logits[0, prev_id] /= 1.2
+            else: logits[0, prev_id] *= 1.2
+        probs = torch.softmax(logits / 0.7, dim=-1)
+        next_id = torch.multinomial(probs, 1)
+        ids = torch.cat([ids, next_id], dim=1)
+        if next_id.item() == tok.token_to_id("</s>"): break
+
+print(tok.decode(ids[0].tolist()))
+```

+**Method 2: Evaluation framework runner**

+The `evafrill_runner.py` in [frankenstallm_test](https://github.com/pathcosmos/frankenstallm_test) wraps the above into a simple API:

+```python
+from eval_framework.evafrill_runner import generate, unload_model
+
+result = generate("Hello, please introduce yourself.")
+print(result["response"])
+print(f"Speed: {result['tokens_per_sec']:.1f} TPS")
+unload_model()
 ```

+> Setup instructions: [frankenstallm_test README](https://github.com/pathcosmos/frankenstallm_test#evafrill-mo-모델-설정-pytorch-직접-추론)
+
+**System requirements**: GPU VRAM 8GB+ (BF16), CPU inference possible but extremely slow (~0.5 TPS)
+
 ### Reproducibility

 | Path | Contents |
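
The new sampling loop applies a multiplicative repetition penalty (positive logits divided by the penalty, negative ones multiplied, so the penalized token always becomes less likely) before a temperature-scaled softmax. That per-step logit adjustment can be isolated as a small dependency-free sketch; the function names are illustrative, not from the repo:

```python
import math

def apply_repetition_penalty(logits, prev_ids, penalty=1.2):
    """Discourage already-generated token ids, as in the loop above:
    divide a positive logit by the penalty, multiply a negative one."""
    out = list(logits)
    for i in set(prev_ids):
        if out[i] > 0:
            out[i] /= penalty
        else:
            out[i] *= penalty
    return out

def temperature_softmax(logits, temperature=0.7):
    """Temperature-scaled softmax over a plain list of floats."""
    scaled = [x / temperature for x in logits]
    m = max(scaled)  # subtract the max for numerical stability
    exps = [math.exp(x - m) for x in scaled]
    total = sum(exps)
    return [e / total for e in exps]

# Token 0 was already generated, so its logit shrinks from 2.0 to 2.0/1.2;
# token 1's negative logit is pushed further down.
penalized = apply_repetition_penalty([2.0, -1.0, 0.5], prev_ids=[0, 1])
probs = temperature_softmax(penalized)
```

Lower temperatures sharpen the distribution before `multinomial` sampling; 0.7 with a 1.2 penalty matches the values recommended in the diff.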
slerp/README.md CHANGED

@@ -51,10 +51,58 @@ See the [main README](../../README.md) for full project details, architecture, a

 ## Usage

 ```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-model = AutoModelForCausalLM.from_pretrained("path/to/slerp", torch_dtype="bfloat16")
-tokenizer = AutoTokenizer.from_pretrained("path/to/slerp")
-inputs = tokenizer("<|user|>\n질문을 여기에 입력하세요\n<|assistant|>\n", return_tensors="pt")
-output = model.generate(**inputs, temperature=0.7, repetition_penalty=1.2, max_new_tokens=512)
 ```
 ## Usage

+> **Note**: This is a custom Mamba-2 hybrid architecture — `AutoModelForCausalLM` is **not supported**. Use direct safetensors loading with the [EVAFRILL-Mo source code](https://github.com/pathcosmos/EVAFRILL-Mo).
+
+```bash
+# Prerequisites
+git clone https://github.com/pathcosmos/EVAFRILL-Mo
+pip install torch safetensors tokenizers PyYAML
+```
+
+```python
+import json, torch
+from model.config import LMConfig
+from model.transformer import LLM
+from tokenizers import Tokenizer
+from safetensors.torch import load_file as load_safetensors
+
+CKPT = "path/to/slerp"  # this directory
+
+with open(f"{CKPT}/config.json") as f:
+    data = json.load(f)
+for k in ("model_type", "architectures", "_variant", "_description"):
+    data.pop(k, None)
+cfg = LMConfig(**data)
+cfg.use_flash_attn = False
+
+model = LLM(cfg)
+state = load_safetensors(f"{CKPT}/model.safetensors", device="cpu")
+model.load_state_dict(state, strict=False)
+model = model.to(device="cuda:0", dtype=torch.bfloat16).eval()
+
+tok = Tokenizer.from_file(f"{CKPT}/tokenizer.json")
+prompt = "<|user|>\n질문을 여기에 입력하세요\n<|assistant|>\n"
+ids = torch.tensor([tok.encode(prompt).ids], device="cuda:0")
+
+with torch.no_grad():
+    for _ in range(512):
+        logits, _ = model(ids)
+        logits = logits[:, -1, :].float()
+        for prev_id in set(ids[0].tolist()):
+            if logits[0, prev_id] > 0: logits[0, prev_id] /= 1.2
+            else: logits[0, prev_id] *= 1.2
+        probs = torch.softmax(logits / 0.7, dim=-1)
+        next_id = torch.multinomial(probs, 1)
+        ids = torch.cat([ids, next_id], dim=1)
+        if next_id.item() == tok.token_to_id("</s>"): break
+
+print(tok.decode(ids[0].tolist()))
+```
+
+Alternatively, use the wrapped runner from [frankenstallm_test](https://github.com/pathcosmos/frankenstallm_test):
+
 ```python
+from eval_framework.evafrill_runner import generate
+result = generate("한국어로 인사해주세요.")
+print(result["response"])
 ```
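
The loading examples above strip Hub-metadata keys (`model_type`, `architectures`, `_variant`, `_description`) from `config.json` before constructing the architecture config, because those keys exist for the Hub UI rather than as constructor arguments. The same filtering can be sketched with a stand-in dataclass; `StubConfig` and its fields are illustrative and not the repo's actual `LMConfig` schema:

```python
import json
from dataclasses import dataclass

# Illustrative stand-in for the repo's LMConfig; the real class
# defines the actual architecture fields.
@dataclass
class StubConfig:
    vocab_size: int
    hidden_size: int
    use_flash_attn: bool = True

HUB_ONLY_KEYS = ("model_type", "architectures", "_variant", "_description")

def config_from_json(text):
    """Drop Hub-metadata keys that are not constructor arguments, then build."""
    data = json.loads(text)
    for k in HUB_ONLY_KEYS:
        data.pop(k, None)
    return StubConfig(**data)

raw = json.dumps({
    "model_type": "evafrill",
    "architectures": ["LLM"],
    "vocab_size": 32000,
    "hidden_size": 2048,
})
cfg = config_from_json(raw)
cfg.use_flash_attn = False  # toggled off after construction, as in the README
```

Without the `pop` loop, the extra keys would raise `TypeError: unexpected keyword argument` when unpacked into the constructor.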