Spaces:

VectorSQL
/

Text2VectorSQL

Runtime error

App Files Files Community

Text2VectorSQL / app.py

dongwenyao

vllm inference

b824726 about 1 month ago

raw

history blame contribute delete

6.16 kB

	import gradio as gr
	import torch
	# 导入 vllm 相关的库
	from vllm import LLM, SamplingParams
	# 保持 AutoTokenizer 用于处理聊天模板
	from transformers import AutoTokenizer
	from modelscope.hub.snapshot_download import snapshot_download
	import os
	import traceback

	# --- 关键配置 ---

	# !!! (重要) 请在此处填入您在 ModelScope 上的 "已合并" 模型的 ID
	# vLLM 将直接加载这个完整的模型
	#
	# 我使用了您原始的基础模型 ID 作为示例，但您必须将其替换为
	# 已经合并了 LoRA (UniVectorSQL) 的那个模型的 ID。
	MERGED_MODEL_ID = "zrwang/UniVectorSQL-7B-LoRA-merged"
	# 示例：如果您的合并后模型叫 "risemds/UniVectorSQL-7B-Merged"，请修改上面一行

	# -----------------


	# --- vLLM 模型加载 ---

	# 1. 下载模型
	# 我们将忽略 .md 文件和 training_args.bin 文件
	print(f"开始下载已合并的模型: {MERGED_MODEL_ID}")
	model_dir = snapshot_download(
	MERGED_MODEL_ID,
	revision='master',
	ignore_patterns=[".md", "training_args.bin", "checkpoint-"]
	)
	print(f"模型下载完成，路径: {model_dir}")


	# 2. 加载 Tokenizer
	# 我们需要 Tokenizer 来应用聊天模板
	try:
	tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
	print("从模型目录加载 Tokenizer 成功。")
	except Exception as e:
	print(f"从模型目录加载 Tokenizer 失败: {e}")
	# 如果失败，程序无法继续，因为 vLLM 需要知道 EOT token
	raise e

	# --- (保留) 修复 Tokenizer 的 pad_token_id ---
	# Qwen1.5 基础模型可能没有设置 pad_token_id
	if tokenizer.pad_token_id is None:
	print("Tokenizer 未设置 pad_token_id，将其设置为 eos_token_id。")
	tokenizer.pad_token_id = tokenizer.eos_token_id
	# (注意：我们不再需要修复 model.config，因为 vLLM 会处理)
	# ----------------------------------------


	# 3. 加载 vLLM 模型
	print(f"开始加载 vLLM 模型: {model_dir}")

	# 自动检测 GPU 数量以设置 tensor_parallel_size
	if torch.cuda.is_available():
	gpu_count = torch.cuda.device_count()
	print(f"检测到 {gpu_count} 个 CUDA GPU。")
	else:
	print("警告: 未检测到 CUDA GPU。vLLM 强烈建议在 GPU 上运行。")
	gpu_count = 1 # 假设至少为 1，vLLM 0.4.0+ 也支持 CPU (但很慢)

	llm = LLM(
	model=model_dir,
	trust_remote_code=True,
	tensor_parallel_size=gpu_count, # 自动使用所有可用的 GPU
	dtype="auto" # vLLM 会自动选择 (例如 bfloat16 或 float16)
	# max_model_len=4096 # (可选) 如果需要，设置最大上下文长度
	)
	print("vLLM 模型加载完成。")


	# 4. 定义推理函数 (* 已修改为使用 vLLM *)

	# --- 提前准备 EOT token 和 SamplingParams ---
	# 1. 获取 EOT (End of Text) token IDs (用于停止生成)
	# Qwen 系列使用 <\|im_end\|> (151645) 和/或 <\|endoftext\|> (151643)
	eot_ids = [tokenizer.eos_token_id]

	im_end_token_id = tokenizer.convert_tokens_to_ids("<\|im_end\|>")
	if im_end_token_id != tokenizer.unk_token_id and im_end_token_id not in eot_ids:
	eot_ids.append(im_end_token_id)

	print(f"vLLM 将使用 stop_token_ids: {eot_ids} (eos_token_id: {tokenizer.eos_token_id})")

	# 2. 创建 vLLM SamplingParams
	# (原 HFace generate 参数：max_new_tokens=2048, eos_token_id=eot_ids)
	sampling_params = SamplingParams(
	max_tokens=2048, # 对应 HFace 的 max_new_tokens
	stop_token_ids=eot_ids, # <--- 关键：告诉 vLLM 何时停止
	temperature=0.0, # 对于 SQL 生成，使用贪婪采样 (0.0) 通常是最好的
	top_p=1.0, # (配合 temperature=0.0)
	)
	# ----------------------------------------

	def inference(text_input):
	print(f"收到输入: {text_input}")
	try:
	# 1. 构建 Qwen (ChatML) 对话格式
	messages = [
	{"role": "user", "content": text_input}
	]

	# 2. 应用对话模板
	# vLLM 需要的是字符串，而不是 token IDs
	# tokenize=False 会返回格式化后的字符串
	# add_generation_prompt=True 会在末尾添加 <\|im_start\|>assistant\n
	prompt_str = tokenizer.apply_chat_template(
	messages,
	tokenize=False, # <--- 关键：返回字符串
	add_generation_prompt=True # <--- 关键：添加 assistant 提示
	)

	# 3. vLLM 生成
	# llm.generate 接受一个 prompt 列表
	outputs = llm.generate([prompt_str], sampling_params)

	# 4. 提取结果
	# outputs 是一个列表，对应输入的 prompt 列表
	# outputs[0] 是第一个 prompt 的 RequestOutput
	# outputs[0].outputs[0] 是第一个 (best_of=1) 的生成结果
	# .text 包含新生成的文本 (不包含 prompt)
	result = outputs[0].outputs[0].text.strip()

	print(f"生成结果 (仅新内容): {result}")
	return result

	except Exception as e:
	print(f"vLLM 推理时出错: {e}")
	# 打印更详细的 vLLM 错误
	traceback.print_exc()
	return f"错误: {e}"

	# ----------------------------------------------------

	# (示例保持不变)
	example = """You are a senior SQL engineer. Your task is to generate a single, correct, and executable SQL query to answer the user's question based on the provided database context.
	... (示例内容和之前一样) ...
	Let's think step by step!
	"""
	# (为了简洁，省略了示例的完整内容，使用您原始的 'example' 变量即可)
	examples = [example]


	# 6. 创建 Gradio 界面 (保持不变)
	iface = gr.Interface(
	fn=inference,
	inputs=gr.Textbox(lines=10, label="输入查询 (Input Query)"),
	outputs=gr.Textbox(lines=10, label="模型输出 (Model Output)"),
	title="UniVectorSQL-7B (vLLM 推理)",
	# description="这是一个 Text-to-SQL 模型。请输入您的问题 (Question) 和数据库模式 (Schema)。点击下方示例尝试。",
	examples=examples
	)
	# ---------------------

	print("启动 Gradio 界面...")
	# 在 Hugging Face 或 ModelScope Space 中，share=True 不是必需的
	iface.launch()