Spaces:

bztxb
/

shiluInfer

Sleeping

bztxb

完善提示信息

e562522 about 1 month ago

4.84 kB

	import os
	import json
	from typing import Dict, List

	import gradio as gr
	import torch
	from huggingface_hub import hf_hub_download
	from transformers import AutoTokenizer, pipeline


	# Hub repository that is used when no local model files are found.
	MODEL_ID = "bztxb/shiluBERT"

	# Runtime settings; each one can be overridden through an environment variable.
	LOCAL_MODEL_DIR = os.environ.get("LOCAL_MODEL_DIR", ".")
	MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "512"))
	THRESHOLD_DEFAULT = float(os.environ.get("THRESHOLD_DEFAULT", "0.5"))

	# Number of tokens shared between consecutive windows (0 = no overlap).
	STRIDE = 0

	# Run on the first CUDA device when one is available, otherwise on the CPU.
	# DEVICE_INDEX is the value handed to the pipeline's ``device`` argument.
	DEVICE, DEVICE_INDEX = ("cuda", 0) if torch.cuda.is_available() else ("cpu", -1)

	# Text pre-filled in the input box when the page opens.
	DEFAULT_SAMPLE_TEXT = "○嚴私鹽之禁時戶部奏在京各衙門遣官吏人等於長蘆運司關支食鹽有將批文不投運司照買私鹽裝載各處販賣一二次者又有夾帶私鹽沿途發賣者及中鹽客啇支鹽不循舊例每包添私鹽至三四百斤者請令沿途巡檢司批驗所等處務要拘驗鹽批及鹽引數目嚴加盤詰秤掣若有批文違限夾帶私鹽者依律入官官吏人等如例送問仍行巡鹽御史通行嚴禁從之"


	# Module-level state filled in by the model-loading step further down.
	# ``load_error`` stays None unless loading fails.
	tokenizer = None
	classifier = None
	load_error = None
	label_list: List[str] = []


	def pick_model_source() -> str:
	    """Return where the model should be loaded from.

	    The local directory ``LOCAL_MODEL_DIR`` is chosen when it contains a
	    ``config.json``; otherwise the Hub repository id ``MODEL_ID`` is returned.
	    """
	    config_path = os.path.join(LOCAL_MODEL_DIR, "config.json")
	    return LOCAL_MODEL_DIR if os.path.exists(config_path) else MODEL_ID


	def load_label_list(model_source: str) -> List[str]:
	    """Load the ordered label names stored in ``label_map.json``.

	    Args:
	        model_source: Either a local model directory or a Hub repository id.

	    Returns:
	        The list found under the ``"labels"`` key when the file holds a dict,
	        the file content itself when it holds a list, and an empty list for
	        any other content or when a local model directory has no
	        ``label_map.json``.

	    Raises:
	        Whatever ``hf_hub_download``, ``open`` or ``json.load`` raise when the
	        file cannot be fetched, read or parsed.
	    """
	    local_path = os.path.join(model_source, "label_map.json")
	    if os.path.exists(local_path):
	        file_path = local_path
	    elif os.path.isdir(model_source):
	        # A local directory is not a Hub repository id, so asking the Hub for
	        # it can only fail. The label map is optional (callers fall back to
	        # the raw label names), so report "no labels" instead.
	        return []
	    else:
	        file_path = hf_hub_download(repo_id=model_source, filename="label_map.json")

	    with open(file_path, "r", encoding="utf-8") as file:
	        data = json.load(file)

	    # Accept both {"labels": [...]} and a bare [...] layout.
	    labels = data.get("labels") if isinstance(data, dict) else data
	    return labels if isinstance(labels, list) else []


	def map_label_name(raw_label: str) -> str:
	    """Translate a generic ``LABEL_<n>`` name into its entry in ``label_list``.

	    Args:
	        raw_label: Label name as reported by the classifier.

	    Returns:
	        ``str(label_list[n])`` when ``raw_label`` is ``LABEL_<n>`` and ``n`` is
	        a valid index into ``label_list``; otherwise ``raw_label`` unchanged.
	    """
	    prefix = "LABEL_"
	    if not raw_label.startswith(prefix):
	        return raw_label
	    try:
	        idx = int(raw_label[len(prefix):])
	    except ValueError:
	        # The suffix is not an integer, so there is nothing to look up.
	        return raw_label
	    if 0 <= idx < len(label_list):
	        return str(label_list[idx])
	    return raw_label

	# Load everything once at import time. Any failure is recorded in
	# ``load_error`` rather than raised, so the module still imports.
	try:
	    model_source = pick_model_source()
	    tokenizer = AutoTokenizer.from_pretrained(model_source, use_fast=True)
	    _pipeline_kwargs = {
	        "task": "text-classification",
	        "model": model_source,
	        "tokenizer": tokenizer,
	        "top_k": None,
	        "device": DEVICE_INDEX,
	    }
	    classifier = pipeline(**_pipeline_kwargs)
	    label_list = load_label_list(model_source)
	except Exception as err:
	    load_error = str(err)


	def split_windows(text: str) -> List[str]:
	    """Split ``text`` into pieces of at most ``MAX_LENGTH`` tokens each.

	    The text is tokenized with overflow enabled, so every window produced by
	    the tokenizer becomes one piece. With a fast tokenizer the pieces are
	    exact substrings of ``text``, cut at the character offsets the tokenizer
	    reports. Decoding token ids back to text is lossy (special tokens are
	    skipped and the decoder controls spacing), so it is used only as a
	    fallback when offsets are unavailable.

	    Args:
	        text: The raw input text.

	    Returns:
	        The non-blank pieces in order, or ``[text]`` when the tokenizer
	        yields nothing usable.
	    """
	    # Offset mappings are only implemented by fast tokenizers.
	    use_offsets = bool(getattr(tokenizer, "is_fast", False))
	    enc = tokenizer(
	        text,
	        truncation=True,
	        max_length=MAX_LENGTH,
	        stride=STRIDE,
	        return_overflowing_tokens=True,
	        return_offsets_mapping=use_offsets,
	        padding=False,
	        return_tensors=None,
	    )
	    input_ids_batch = enc.get("input_ids", [])
	    if not input_ids_batch:
	        return [text]

	    offsets_batch = enc.get("offset_mapping") if use_offsets else None
	    if offsets_batch:
	        windows = []
	        for offsets in offsets_batch:
	            # Tokens with an empty span (e.g. special tokens) cover no input text.
	            spans = [(start, end) for start, end in offsets if end > start]
	            if spans:
	                first = min(start for start, _ in spans)
	                last = max(end for _, end in spans)
	                windows.append(text[first:last])
	    else:
	        windows = [
	            tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
	            for ids in input_ids_batch
	        ]
	    return [window for window in windows if window.strip()] or [text]


	def normalize_outputs(outputs):
	    """Return classifier output in a uniform "list of per-input lists" shape.

	    Empty or missing output becomes ``[]``. A flat list of result dicts (the
	    result for a single input) is wrapped in an outer list. Anything else is
	    returned unchanged.
	    """
	    if not outputs:
	        return []
	    is_single_result = isinstance(outputs, list) and isinstance(outputs[0], dict)
	    return [outputs] if is_single_result else outputs


	def predict(text: str, threshold: float) -> Dict[str, float]:
	    """Classify ``text`` and return the labels scoring at least ``threshold``.

	    The text is split into windows, every window is classified, and each
	    label keeps the highest score it received in any window.

	    Args:
	        text: Text to classify.
	        threshold: Minimum score a label needs to be included.

	    Returns:
	        A mapping of label to score (rounded to 6 decimals), ordered from
	        highest to lowest score. Instead of scores, a single-entry mapping is
	        returned under ``"error"`` when the model failed to load or the text
	        is blank, and under ``"info"`` when no label reaches the threshold.
	    """
	    if load_error is not None:
	        return {"error": load_error}
	    if not (text and text.strip()):
	        return {"error": "请输入文本。"}

	    raw_results = classifier(split_windows(text), truncation=True, max_length=MAX_LENGTH)

	    # Keep the best score seen for each label across all windows.
	    best: Dict[str, float] = {}
	    for per_window in normalize_outputs(raw_results):
	        for entry in per_window:
	            name = map_label_name(str(entry.get("label", "UNKNOWN")))
	            value = float(entry.get("score", 0.0))
	            best[name] = max(value, best.get(name, 0.0))

	    ranked = sorted(best.items(), key=lambda kv: kv[1], reverse=True)
	    passing = {name: round(value, 6) for name, value in ranked if value >= threshold}
	    if passing:
	        return passing
	    return {"info": f"无标签达到当前阈值 {threshold:.2f}，请尝试降低阈值以查看更多结果。"}


	# Input widgets: the text to classify and the score threshold.
	_text_input = gr.Textbox(
	    lines=8,
	    label="输入文本后，可调整阈值以选择不同置信度水平下的标签",
	    placeholder="请输入待分类文本...",
	    value=DEFAULT_SAMPLE_TEXT,
	)
	_threshold_input = gr.Slider(
	    minimum=0.0,
	    maximum=1.0,
	    value=THRESHOLD_DEFAULT,
	    step=0.01,
	    label="阈值",
	)

	# Web UI wiring: both inputs feed ``predict`` and the result is shown as JSON.
	app = gr.Interface(
	    fn=predict,
	    inputs=[_text_input, _threshold_input],
	    outputs=gr.JSON(label="预测结果（标签:置信度）"),
	    title="明/清实录多标签分类推理",
	    # The examples panel is left disabled:
	    # examples=[[DEFAULT_SAMPLE_TEXT, THRESHOLD_DEFAULT]],
	)


	# Start the web server only when the file is executed as a script.
	if __name__ == "__main__":
	    app.launch()