yongqiang commited on
Commit ·
1ed9a31
1
Parent(s): 028f9bb
Initialize the repository
Browse files. This view is limited to 50 files because it contains too many changes. See the raw diff.
- .gitattributes +5 -0
- .gitignore +4 -0
- README.md +44 -0
- assets/demo.png +3 -0
- config.py +23 -0
- gui.py +65 -0
- index/docs.index +0 -0
- index/docs.pkl +3 -0
- llm_api.py +142 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/model.embed_tokens.weight.bfloat16.bin +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/model.embed_tokens.weight.float32.bin +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/model.embed_tokens.weight.npy +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l0_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l10_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l11_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l12_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l13_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l14_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l15_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l16_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l17_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l18_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l19_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l1_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l20_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l21_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l22_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l23_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l24_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l25_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l26_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l27_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l2_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l3_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l4_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l5_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l6_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l7_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l8_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l9_together.axmodel +3 -0
- models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_post.axmodel +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/model.embed_tokens.weight.bfloat16.bin +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/model.embed_tokens.weight.float32.bin +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/model.embed_tokens.weight.npy +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l0_together.axmodel +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l10_together.axmodel +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l11_together.axmodel +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l12_together.axmodel +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l13_together.axmodel +3 -0
- models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l14_together.axmodel +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.axmodel filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
examples/red-panda.mp4 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
build-output/
|
| 3 |
+
tmp/
|
| 4 |
+
*.safetensors
|
README.md
CHANGED
|
@@ -1,3 +1,47 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
---
|
| 4 |
+
|
| 5 |
+
# RAG.AXERA DEMO
|
| 6 |
+
|
| 7 |
+

|
| 8 |
+
|
| 9 |
+
## 项目说明
|
| 10 |
+
|
| 11 |
+
```sh
|
| 12 |
+
(hf) ➜ rag.axera git:(main) ✗ tree -L 2
|
| 13 |
+
.
|
| 14 |
+
├── assets
|
| 15 |
+
│ └── demo.png
|
| 16 |
+
├── config.py # 配置 axmodel, tokenizer 文件路径
|
| 17 |
+
├── data
|
| 18 |
+
├── gui.py # RAG 交互式程序
|
| 19 |
+
├── index # 文档编码向量索引保存位置
|
| 20 |
+
│ ├── docs.index
|
| 21 |
+
│ └── docs.pkl
|
| 22 |
+
├── llm_api.py # llm 主程序
|
| 23 |
+
├── models # axmodel 模型存储位置
|
| 24 |
+
│ ├── Qwen2.5-1.5B-Instruct_axmodel
|
| 25 |
+
│ └── Qwen3-Embedding-0.6B_axmodel
|
| 26 |
+
├── pdf_sample # 示例 pdf 文件
|
| 27 |
+
│ └── introduction.pdf
|
| 28 |
+
├── rag_engine.py # 文档向量编码程序
|
| 29 |
+
├── README.md
|
| 30 |
+
├── requirements.txt
|
| 31 |
+
├── tokenizer
|
| 32 |
+
│ ├── Qwen2.5-1.5B-Instruct
|
| 33 |
+
│ └── Qwen3-Embedding-0.6B
|
| 34 |
+
└── utils
|
| 35 |
+
└── infer_func.py
|
| 36 |
+
|
| 37 |
+
11 directories, 11 files
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## 运行
|
| 41 |
+
|
| 42 |
+
在 `AXCL` 机器或 `AX650` 开发板上启动两个终端界面, 分别运行下面的命令:
|
| 43 |
+
|
| 44 |
+
```sh
|
| 45 |
+
python3 llm_api.py # 在 AX650 或 AXCL 开发板启动 llm 服务
|
| 46 |
+
python3 gui.py # 启动交互式界面
|
| 47 |
+
```
|
assets/demo.png
ADDED
|
Git LFS Details
|
config.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# config.py
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
# models and paths
|
| 8 |
+
LLM_HF_MODEL = os.getenv("LLM_HF_MODEL", "./tokenizer/Qwen2.5-1.5B-Instruct") # 只需要根据实际路径修改这里的模型路径即可
|
| 9 |
+
LLM_AX_MODEL = os.getenv("LLM_AX_MODEL", "./models/Qwen2.5-1.5B-Instruct_axmodel")
|
| 10 |
+
EMBED_HF_MODEL = os.getenv("EMBED_HF_MODEL", "./tokenizer/Qwen3-Embedding-0.6B")
|
| 11 |
+
EMBED_AX_MODEL = os.getenv("EMBED_AX_MODEL", "./models/Qwen3-Embedding-0.6B_axmodel")
|
| 12 |
+
|
| 13 |
+
# API URL
|
| 14 |
+
LLM_API_PORT = int(os.getenv("LLM_API_PORT", "8000"))
|
| 15 |
+
LLM_API_URL = os.getenv("LLM_API_URL", f"http://127.0.0.1:{LLM_API_PORT}/generate") # 具体函数名字需要对应修改
|
| 16 |
+
PORT = int(os.getenv("PORT", "7860"))
|
| 17 |
+
|
| 18 |
+
# Index paths
|
| 19 |
+
INDEX_DIR = os.getenv("INDEX_DIR", "index")
|
| 20 |
+
INDEX_FILE = os.path.join(INDEX_DIR, "docs.index")
|
| 21 |
+
EMBEDDINGS_FILE = os.path.join(INDEX_DIR, "docs.pkl")
|
| 22 |
+
|
| 23 |
+
os.makedirs(INDEX_DIR, exist_ok=True)
|
gui.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# gui.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from rag_engine import ask_question, build_index, stream_answer
|
| 4 |
+
from config import PORT
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
chat_history = []
|
| 8 |
+
|
| 9 |
+
def handle_upload(file):
|
| 10 |
+
if file is None:
|
| 11 |
+
return "❌ 请上传文件", ""
|
| 12 |
+
try:
|
| 13 |
+
result = build_index(file.name) # file.name 是本地路径
|
| 14 |
+
return result, ""
|
| 15 |
+
except Exception as e:
|
| 16 |
+
return f"❌ 构建索引失败:{str(e)}", ""
|
| 17 |
+
|
| 18 |
+
def handle_chat(message, history):
|
| 19 |
+
history = history or []
|
| 20 |
+
if not message.strip():
|
| 21 |
+
return "", history
|
| 22 |
+
try:
|
| 23 |
+
# 使用流式响应
|
| 24 |
+
history.append((message, ""))
|
| 25 |
+
full_response = ""
|
| 26 |
+
|
| 27 |
+
# 获取流式响应生成器
|
| 28 |
+
answer_generator = stream_answer(message)
|
| 29 |
+
|
| 30 |
+
# 逐个token添加到聊天历史
|
| 31 |
+
for token in answer_generator:
|
| 32 |
+
full_response += token
|
| 33 |
+
history[-1] = (message, full_response)
|
| 34 |
+
yield "", history
|
| 35 |
+
time.sleep(0.02) # 添加微小延迟使输出更平滑
|
| 36 |
+
|
| 37 |
+
# 流结束后添加一点停顿
|
| 38 |
+
time.sleep(0.1)
|
| 39 |
+
yield "", history
|
| 40 |
+
|
| 41 |
+
except Exception as e:
|
| 42 |
+
history.append((message, f"⚠️ 出错了:{str(e)}"))
|
| 43 |
+
return "", history
|
| 44 |
+
|
| 45 |
+
with gr.Blocks(title="RAG 文档问答系统") as demo:
|
| 46 |
+
gr.Markdown("## 🤖 AXERA RAG 文档问答\n请上传 PDF 或 TXT 文件并提问")
|
| 47 |
+
|
| 48 |
+
with gr.Row():
|
| 49 |
+
with gr.Column(scale=1):
|
| 50 |
+
file_input = gr.File(label="📄 上传文件", file_types=[".pdf", ".txt"])
|
| 51 |
+
upload_btn = gr.Button("📥 上传并构建索引")
|
| 52 |
+
upload_status = gr.Textbox(label="", interactive=False)
|
| 53 |
+
|
| 54 |
+
with gr.Column(scale=2):
|
| 55 |
+
chatbot = gr.Chatbot(height=400, label="🧠 问答对话")
|
| 56 |
+
with gr.Row():
|
| 57 |
+
message = gr.Textbox(placeholder="请输入你的问题,按 Shift + Enter 发送", show_label=False, lines=2)
|
| 58 |
+
send_btn = gr.Button("🚀 发送")
|
| 59 |
+
|
| 60 |
+
upload_btn.click(fn=handle_upload, inputs=[file_input], outputs=[upload_status, message])
|
| 61 |
+
send_btn.click(fn=handle_chat, inputs=[message, chatbot], outputs=[message, chatbot])
|
| 62 |
+
message.submit(fn=handle_chat, inputs=[message, chatbot], outputs=[message, chatbot])
|
| 63 |
+
|
| 64 |
+
# 启用队列并启动
|
| 65 |
+
demo.queue().launch(server_port=PORT)
|
index/docs.index
ADDED
|
Binary file (24.6 kB). View file
|
|
|
index/docs.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad92dcae43938a9ca8afbfff7c9cf6c671a2290e4839f0199a6f834eefdfceac
|
| 3 |
+
size 5705
|
llm_api.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# llm_api.py
|
| 2 |
+
from fastapi import FastAPI, HTTPException
|
| 3 |
+
from fastapi.responses import StreamingResponse
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from typing import Optional
|
| 6 |
+
import uvicorn
|
| 7 |
+
import numpy as np
|
| 8 |
+
import os
|
| 9 |
+
import torch
|
| 10 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
| 11 |
+
from config import LLM_HF_MODEL, LLM_AX_MODEL, LLM_API_PORT
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
app = FastAPI(title="Fast-API", description="本地推理接口")
|
| 15 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 16 |
+
|
| 17 |
+
"""
|
| 18 |
+
axengine 相关
|
| 19 |
+
"""
|
| 20 |
+
from ml_dtypes import bfloat16
|
| 21 |
+
from utils.infer_func import InferManager
|
| 22 |
+
|
| 23 |
+
# 定义全局变量,但先不初始化
|
| 24 |
+
tokenizer = None
|
| 25 |
+
imer = None
|
| 26 |
+
embeds = None
|
| 27 |
+
|
| 28 |
+
def init_model():
|
| 29 |
+
global tokenizer, imer, embeds
|
| 30 |
+
if tokenizer is None: # 防止重复初始化
|
| 31 |
+
cfg = AutoConfig.from_pretrained(LLM_HF_MODEL)
|
| 32 |
+
imer = InferManager(cfg, LLM_AX_MODEL, model_type="qwen2")
|
| 33 |
+
embeds = np.load(os.path.join(LLM_AX_MODEL, "model.embed_tokens.weight.npy"))
|
| 34 |
+
# 加载 tokenizer
|
| 35 |
+
tokenizer = AutoTokenizer.from_pretrained(LLM_HF_MODEL, trust_remote_code=True)
|
| 36 |
+
print("✅ 模型加载完成。")
|
| 37 |
+
|
| 38 |
+
# 添加 FastAPI 的启动事件
|
| 39 |
+
@app.on_event("startup")
|
| 40 |
+
async def startup_event():
|
| 41 |
+
init_model()
|
| 42 |
+
|
| 43 |
+
class GenRequest(BaseModel):
|
| 44 |
+
prompt: str
|
| 45 |
+
max_tokens: Optional[int] = 1024
|
| 46 |
+
temperature: Optional[float] = 0.6
|
| 47 |
+
top_p: Optional[float] = 0.9
|
| 48 |
+
|
| 49 |
+
class GenResponse(BaseModel):
|
| 50 |
+
text: str
|
| 51 |
+
|
| 52 |
+
@app.post("/generate", response_model=GenResponse)
|
| 53 |
+
def generate_text(req: GenRequest):
|
| 54 |
+
try:
|
| 55 |
+
# input_ids = tokenizer(req.prompt, return_tensors="pt").input_ids.to(device)
|
| 56 |
+
|
| 57 |
+
# with torch.no_grad():
|
| 58 |
+
# output_ids = model.generate(
|
| 59 |
+
# input_ids=input_ids,
|
| 60 |
+
# max_new_tokens=req.max_tokens,
|
| 61 |
+
# temperature=req.temperature,
|
| 62 |
+
# top_p=req.top_p,
|
| 63 |
+
# # do_sample=True,
|
| 64 |
+
# eos_token_id=tokenizer.eos_token_id
|
| 65 |
+
# )
|
| 66 |
+
|
| 67 |
+
# response_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 68 |
+
messages = [
|
| 69 |
+
{"role": "system", "content": "你的名字叫做 [AXERA-RAG 助手]. 你是一个高效、精准的问答助手. 你可以根据上下文内容, 回答用户提出的问题, 回答时不要提及多余的、无用的内容, 且仅输出你的回答."},
|
| 70 |
+
{"role": "user", "content": req.prompt}
|
| 71 |
+
]
|
| 72 |
+
text = tokenizer.apply_chat_template(
|
| 73 |
+
messages,
|
| 74 |
+
tokenize=False,
|
| 75 |
+
add_generation_prompt=True
|
| 76 |
+
)
|
| 77 |
+
model_inputs = tokenizer([text], return_tensors="pt").to(device)
|
| 78 |
+
|
| 79 |
+
"""
|
| 80 |
+
axengine 框架模型推理
|
| 81 |
+
"""
|
| 82 |
+
input_ids = model_inputs['input_ids']
|
| 83 |
+
inputs_embeds = np.take(embeds, input_ids.cpu().numpy(), axis=0)
|
| 84 |
+
prefill_data = inputs_embeds
|
| 85 |
+
prefill_data = prefill_data.astype(bfloat16)
|
| 86 |
+
token_ids = input_ids[0].cpu().numpy().tolist()
|
| 87 |
+
generated_text = ""
|
| 88 |
+
|
| 89 |
+
def generate_stream():
|
| 90 |
+
nonlocal token_ids, generated_text
|
| 91 |
+
token_ids = imer.prefill(tokenizer, token_ids, prefill_data[0], slice_len=128)
|
| 92 |
+
generated_text += tokenizer.decode(token_ids[-1], skip_special_tokens=True)
|
| 93 |
+
|
| 94 |
+
# response_text = imer.decode(tokenizer, token_ids, embeds, slice_len=128)
|
| 95 |
+
# 去掉 prompt 的前缀, 只保留生成部分
|
| 96 |
+
# generated_text = response_text[len(req.prompt):].strip()
|
| 97 |
+
# generated_text = response_text
|
| 98 |
+
# return GenResponse(text=generated_text)
|
| 99 |
+
|
| 100 |
+
# 流式输出控制
|
| 101 |
+
prefill_word = tokenizer.decode(token_ids[-1], skip_special_tokens=True)
|
| 102 |
+
prefill_word = prefill_word.strip().replace("\n", "\\n").replace("\"", "\\\"")
|
| 103 |
+
|
| 104 |
+
seq_len = len(token_ids) - 1
|
| 105 |
+
prefill_len = 128
|
| 106 |
+
for step_idx in range(imer.max_seq_len):
|
| 107 |
+
if prefill_len > 0 and step_idx < seq_len:
|
| 108 |
+
continue
|
| 109 |
+
token_ids, next_token_id = imer.decode_next_token(tokenizer, token_ids, embeds, slice_len=128, step_idx=step_idx)
|
| 110 |
+
if next_token_id == tokenizer.eos_token_id and next_token_id > seq_len:
|
| 111 |
+
break
|
| 112 |
+
try:
|
| 113 |
+
if next_token_id is not None:
|
| 114 |
+
word = tokenizer.decode([next_token_id], skip_special_tokens=True)
|
| 115 |
+
generated_text += word
|
| 116 |
+
if prefill_word is not None:
|
| 117 |
+
word = prefill_word + word
|
| 118 |
+
prefill_word = None
|
| 119 |
+
# 以适合前端处理的 SSE 格式输出
|
| 120 |
+
# 处理特殊字符
|
| 121 |
+
word = word.strip().replace("\n", "\\n").replace("\"", "\\\"")
|
| 122 |
+
# import pdb; pdb.set_trace()
|
| 123 |
+
yield f"data: {{\"token\": \"{word}\"}}\n\n"
|
| 124 |
+
except Exception as e:
|
| 125 |
+
print(f"Error decoding token {next_token_id}: {e}")
|
| 126 |
+
|
| 127 |
+
return StreamingResponse(
|
| 128 |
+
generate_stream(),
|
| 129 |
+
media_type="text/event-stream", # 必须使用SSE格式
|
| 130 |
+
headers={
|
| 131 |
+
"Cache-Control": "no-cache",
|
| 132 |
+
"Connection": "keep-alive",
|
| 133 |
+
"X-Accel-Buffering": "no" # 禁用Nginx缓冲
|
| 134 |
+
}
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
except Exception as e:
|
| 138 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
if __name__ == "__main__":
|
| 142 |
+
uvicorn.run(app, host="0.0.0.0", port=LLM_API_PORT, reload=False)
|
models/Qwen2.5-1.5B-Instruct_axmodel/model.embed_tokens.weight.bfloat16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2c594e5f910978ef413824340261a6055c5bb905dcfefceed9d30dd2b80637e
|
| 3 |
+
size 466747392
|
models/Qwen2.5-1.5B-Instruct_axmodel/model.embed_tokens.weight.float32.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2517b46ffa436067fc5e2bd7e191107c2c5c9f29892358e8639957b0057a287
|
| 3 |
+
size 933494784
|
models/Qwen2.5-1.5B-Instruct_axmodel/model.embed_tokens.weight.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fcec2b0923f51df5891df10e715e3204e3bb039f26780112ade2f1b7da997bef
|
| 3 |
+
size 933494912
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l0_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63ef37adbe925889dc0be44e9ba35aefec5eb8f1824f3c785510765f1a1bc6fe
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l10_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe006a7dd1096a5d8c68ef9d70ed49f9f1ed4769f122080a735da8d04f025e02
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l11_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e350949ca5683b3084ae4241bdd1fddcaebe5ef8e70c0d8aef339a3b5e693c2
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l12_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39501bd3523287d5fa67973abf11a91f4a67c79ae096c0d31c7e07fe01427690
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l13_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:627aa4354f2c7327bf2c053d2a5f6acecde55a406ebd0713cd7c7feffa6855c0
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l14_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8d04ec0cb7ba6170c18c6fb6ccdec8586d3568717d93de562ac31b032fd34d5
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l15_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d0ab59dc0bf214354b6ed6b9b155d5ff4fb3e7cc10975cc0c1652e39a660f7a
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l16_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd4e7800e81bbed49d15c4354ad763a699af9f8ec3413dd9d1cc9a6b49d9c6fb
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l17_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b8d5ec8f6dafda2b90ede8a630a98e865c3b6012995c17e6a6b5b606f269237
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l18_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79b37891f1710e559ba39604b83badad68c4ec971fb6883733e963afff250c1b
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l19_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3863cbeb6191f43dfc7e3d31c3bfe4bf4dccba3ea9769003c52754ec2fb4be12
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l1_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4755cfff48189833c5062bc07683e551443424b985ed1575b8db4a2297ca0894
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l20_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f34c1d1589a2f2545f73eb03dc231e2fd434e26fb7015b0ae4e78cb9ab6329c2
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l21_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64df1cce0340b34a586363b5d86158289501895836667e084ca172c81ae5dbcf
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l22_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2c60464b0a790a9e39934a35f7bdc4b1948903cb74c93c3321313a4f9d2e9b6b
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l23_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a71d4d9806fc91f4eeddc7c336074f26fc10a436163537c1f6fb5f92ab0c13c2
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l24_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:678177517e5efaa54688a2debe1402c83a056285062c00d119ec0f53f80b22ff
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l25_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea4077e32a81594ed2eab85edca2579aaa07d6610c38bae722d50519db54f7c5
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l26_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e23d6bb592bb43cc68f2cee5d95be7f97401be13f8bb882bfe5ca23d5f7a5a1
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l27_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6cd266fdd498697b600309ffd89c3f0e43d8dbb7aaf3f4f0413520c9500e7ab6
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l2_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:beae1c79a0455b1d66747e0078a2057d299360558a4a4f7c81ea4a9062a8fa1b
|
| 3 |
+
size 67102542
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l3_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:716ea68fe92de87bb162f8033bfbe57448b4eca36ce25af0c5b74078240190ee
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l4_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47d97c2421ccbb5c64e50337a1f7b4615f5f51ad2c50a16f2f9afef951e33056
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l5_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eda5ef5b2eae8c023fa47f13ffc1757c921b0cecfcfee2d3dedf9f3ea8079d73
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l6_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:693e656bfc8c54beb76a194be5839ffe6535691b5fc3c74e6a7ad8ea291e9c89
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l7_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d842213daf2da92c1f4df7d981233f5faa64344f4ddec8009c9874a54fa650f
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l8_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:957387916555920f33d3207db940e1918f64c7c389c31f808d324d2e3d9d0d95
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_p128_l9_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3da1f60bc8b971ef8ef6098cf11e4b44cbc9c819cc6cfeb3b646941e50d13a9
|
| 3 |
+
size 67100526
|
models/Qwen2.5-1.5B-Instruct_axmodel/qwen2_post.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1d25088c38ccf35bd82c7d871878ab1cbe8512e00130ca871eb5e9601768a94
|
| 3 |
+
size 254449571
|
models/Qwen3-Embedding-0.6B_axmodel/model.embed_tokens.weight.bfloat16.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a55b140d86852835bd18d8200222a9f302340730f0670eb7e23a4895e5489033
|
| 3 |
+
size 310618112
|
models/Qwen3-Embedding-0.6B_axmodel/model.embed_tokens.weight.float32.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7a027c062fb61cd505e046bc832345be155e1eb2fab629675cebe7973646c85
|
| 3 |
+
size 621236224
|
models/Qwen3-Embedding-0.6B_axmodel/model.embed_tokens.weight.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6bbdc47aee1b4cdb97a42a255306d4e0a1cb52f797bfdc32f94469eb0cd0744e
|
| 3 |
+
size 621236352
|
models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l0_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:025e17a92f3f19d58a36ef119294598073c4ccdc794aa9d4a2845a99b0c6b53d
|
| 3 |
+
size 28019747
|
models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l10_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95d809d6cc6889517b1aad7a4e62e51ffbf75580dda5ceafb667dbd5ac10ba6e
|
| 3 |
+
size 28019779
|
models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l11_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7fb730805877eceea4aa037694bc2abb830fa960c6666d24b976d7ae35c058d0
|
| 3 |
+
size 28018723
|
models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l12_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eef7266dff6af522a0a63095067a1c7823a9a1213e7bd498bcdb97f2814523ba
|
| 3 |
+
size 28019427
|
models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l13_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9322be28dad0729b006238364297659594e2193a97a37556fb06f63d3fec9fa0
|
| 3 |
+
size 28019459
|
models/Qwen3-Embedding-0.6B_axmodel/qwen3_p128_l14_together.axmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5a8f239e4a4e793e0bdf86226c08c2089f5199118bd38d2be4957f9b7023dda
|
| 3 |
+
size 28018723
|