import os
import time
import argparse

import numpy as np
import torch
import librosa
import soundfile as sf
from ml_dtypes import bfloat16
from loguru import logger
from transformers import AutoConfig, AutoTokenizer

from model import SinusoidalPositionEncoder
from funasr.tokenizer.sentencepiece_tokenizer import SentencepiecesTokenizer
from utils.ax_model_bin import AX_SenseVoiceSmall
from utils.ax_vad_bin import AX_Fsmn_vad
from utils.vad_utils import merge_vad
from utils.ax_cam_bin import (
    AX_SpeakerEmbeddingInference,
    chunk,
    distribute_spk,
    do_clustering,
    get_trans_sentence_sensevoice,
)
from utils.infer_func import InferManager

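# Paths to the GPTQ-Int4 Qwen3 LLM (axmodel) and its HuggingFace tokenizer;
# `embeds` is the pre-extracted token embedding table used to build LLM prefill inputs.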
llm_axmodel_path = "./ax_model/Qwen3-4B-Instruct-2507-GPTQ-Int4_8k_axmodel"
llm_hf_tokenizer_path = "./tokenizer_qwen3_int4"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embeds = np.load(os.path.join(llm_axmodel_path, "model.embed_tokens.weight.npy"))


def parse_args():
    parser = argparse.ArgumentParser(description="SenseVoice inference script")
    parser.add_argument("--output_dir", type=str, default="./output_dir", help="Output directory")
    parser.add_argument("--seq_len", type=int, default=132, help="Sequence length for model")
    parser.add_argument("--wav_file", type=str, default="wav/vad_example.wav", help="Input wav file")
    return parser.parse_args()


def run_model(prompt, max_seq_len=8191, slice_len=256, max_prefill_len=4095):
    """Summarize one chunk of the meeting transcript with the on-device LLM."""
    messages = [
        {
            "role": "system",
            # System prompt (Chinese): "Your name is Xiaohui. You are a professional
            # meeting-minutes analysis assistant, skilled at extracting key information
            # from meeting records (speakers' remarks logged in chronological order) and
            # producing a fitting summary. Based on the meeting record below, after
            # careful thought, summarize the participants and the content of the meeting."
            "content": "你叫小惠, 你是一个专业的会议记录分析助手, 善于从会议记录(按照时间先后记录不同人物的发言)中提取关键信息并生成合适的总结. \n 请你基于以下会议记录, 在深度思考后, 总结这段会议记录的参会人员以及内容摘要. ",
        },
        {
            "role": "user",
            "content": prompt,
        },
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    input_ids = model_inputs.input_ids

    token_ids = input_ids[0].cpu().numpy().tolist()
    token_len = len(token_ids)

    assert token_len <= max_prefill_len, f"Input token length {token_len} exceeds max prefill length {max_prefill_len}"

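    # Gather the embedding rows for the prompt tokens and cast to bfloat16 for prefill.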
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

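    # Prefill the prompt, then decode autoregressively on the axmodel runtime.
    # `cfg` and `eos_token_id` are globals assigned in the __main__ block; `embeds` is loaded at module level.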
    imer = InferManager(cfg, llm_axmodel_path, max_seq_len=max_seq_len, max_prefill_len=max_prefill_len)
    token_ids = imer.prefill(tokenizer, token_ids, prefill_data, slice_len=slice_len)
    imer.decode(tokenizer, token_ids, embeds, slice_len=slice_len, eos_token_id=eos_token_id)

    print(f"\nInput tokens: {token_len}")
    print(f"Output tokens: {len(token_ids) - token_len}")
    print(f"KV cache total length: {max_seq_len}")

    output_text = tokenizer.decode(token_ids[token_len:], skip_special_tokens=True)
    return output_text


if __name__ == "__main__":
    args = parse_args()
    seq_len = args.seq_len
    model_path = args.output_dir
    os.makedirs(model_path, exist_ok=True)

    print("Initializing model...")

    ax_model_dir = "ax_model"
    total_inference_start = time.time()

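    # Load the VAD, speaker-embedding, and SenseVoice ASR models from the axmodel
    # directory, and precompute the positional encoding for SenseVoice.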
    model_vad = AX_Fsmn_vad(ax_model_dir)
    speaker_model = AX_SpeakerEmbeddingInference(model_dir=ax_model_dir)
    embed = SinusoidalPositionEncoder()
    position_encoding = embed.get_position_encoding(torch.randn(1, seq_len, 560)).numpy()
    model_bin = AX_SenseVoiceSmall(ax_model_dir, seq_len=seq_len)

    print("Loading tokenizer...")
    tokenizer_path = os.path.join(ax_model_dir, "chn_jpn_yue_eng_ko_spectok.bpe.model")
    tokenizer = SentencepiecesTokenizer(bpemodel=tokenizer_path)

    wav_file = args.wav_file

    print("Running inference on example file...")

    withitn = True
    norm_type = "withitn" if withitn else "woitn"
    print(f"\nProcessing with text normalization: {norm_type}")

    language = "auto"
    print(f"\n--- Processing language: {language} ---")
    print(f"Processing file: {wav_file}")
    inference_start = time.time()

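    # Load the input audio and resample to 16 kHz if necessary.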
    speech, fs = librosa.load(wav_file, sr=None)
    if fs != 16000:
        print(f"Resampling audio from {fs}Hz to 16000Hz")
        speech = librosa.resample(y=speech, orig_sr=fs, target_sr=16000)
        fs = 16000
    audio_duration = librosa.get_duration(y=speech, sr=fs)
    speech_lengths = len(speech)

    try:
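        # Voice activity detection; merge adjacent segments up to 15 s for ASR.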
        vad_start_time = time.time()
        res_vad = model_vad(speech)[0]
        vad_segments = merge_vad(res_vad, 15 * 1000)
        vad_time_cost = time.time() - vad_start_time
        print(f"VAD processing time: {vad_time_cost:.2f} seconds")

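        # Convert VAD segments from milliseconds to seconds and chunk them for
        # speaker-embedding extraction.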
        vad_time = [[vad_t[0] / 1000, vad_t[1] / 1000] for vad_t in res_vad]
        chunks = [c for (st, ed) in vad_time for c in chunk(st, ed)]

        print("Extracting speaker embeddings...")
        speaker_start_time = time.time()

        embeddings = speaker_model(speech, fs, chunks=chunks)
        speaker_time_cost = time.time() - speaker_start_time
        print(f"Speaker embedding extraction time: {speaker_time_cost:.2f} seconds")
        print(f"Generated embeddings shape: {embeddings.shape}")

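        # Cluster the embeddings to estimate the number of speakers and assign a
        # speaker label to every chunk (speaker_num=None lets the clustering decide).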
        clustering_start_time = time.time()
        speaker_num, diar_results = do_clustering(chunks, embeddings, speaker_num=None)
        clustering_time_cost = time.time() - clustering_start_time
        print(f"Speaker clustering time: {clustering_time_cost:.2f} seconds")

        all_results = []
        all_metadata = {}

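        # Run SenseVoice ASR on each merged VAD segment; each segment is written to a
        # temporary wav file before being passed to the model.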
        asr_start_time = time.time()
        for i, segment in enumerate(vad_segments):
            segment_start, segment_end = segment

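            # Slice the segment out of the full waveform (segment times are in ms).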
            start_sample = int(segment_start / 1000 * fs)
            end_sample = min(int(segment_end / 1000 * fs), speech_lengths)
            segment_speech = speech[start_sample:end_sample]

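            # ASR timestamps are relative to the segment; keep its offset in the recording.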
            time_offset_sec = segment_start / 1000.0

            segment_filename = f"temp_segment_{i}.wav"
            sf.write(segment_filename, segment_speech, fs)

            try:
                segment_res, segment_meta = model_bin(
                    segment_filename,
                    language,
                    withitn,
                    position_encoding,
                    tokenizer=tokenizer,
                    output_timestamp=True,
                    ban_emo_unk=False,
                    output_dir=model_path,
                    key=[f"{os.path.basename(wav_file)}_segment_{i}"],
                )

                if "merged_words" in segment_meta:
                    if "merged_words" not in all_metadata:
                        all_metadata["merged_words"] = []
                    all_metadata["merged_words"].extend(segment_meta["merged_words"])

                if "merged_timestamps" in segment_meta:
                    if "merged_timestamps" not in all_metadata:
                        all_metadata["merged_timestamps"] = []
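                    # Shift segment-relative timestamps to absolute time in the
                    # recording, clamped to the total audio duration.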
                    adjusted_timestamps = [
                        [min(ts[0] + time_offset_sec, audio_duration), min(ts[1] + time_offset_sec, audio_duration)]
                        for ts in segment_meta["merged_timestamps"]
                    ]
                    all_metadata["merged_timestamps"].extend(adjusted_timestamps)

                if os.path.exists(segment_filename):
                    os.remove(segment_filename)

            except Exception:
                # Remove the temporary segment file before re-raising the error.
                if os.path.exists(segment_filename):
                    os.remove(segment_filename)
                raise

        output_asr = {
            "merged_words": all_metadata.get("merged_words", []),
            "merged_timestamps": all_metadata.get("merged_timestamps", []),
        }

    except Exception:
        raise

    asr_time_cost = time.time() - asr_start_time
    print(f"ASR processing time: {asr_time_cost:.2f} seconds")

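    # Build sentence-level transcripts from the ASR output and attach the
    # diarization speaker labels to each sentence.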
    asr_timestamps = get_trans_sentence_sensevoice(output_asr)
    sentence_info_with_spk = distribute_spk(asr_timestamps, diar_results)
    inference_time_cost = time.time() - inference_start
    inference_time_cost_all = time.time() - total_inference_start
    rtf = inference_time_cost / audio_duration
    print(f"Inference time for {wav_file}: {inference_time_cost:.2f} seconds")
    print(f"Load model + inference time for {wav_file}: {inference_time_cost_all:.2f} seconds")
    print(f"Audio duration: {audio_duration:.2f} seconds")
    print(f"RTF: {rtf:.2f}")

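    # Write the speaker-attributed transcript, one sentence per line.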
    output_trans_path = os.path.join(args.output_dir, f"{os.path.basename(wav_file)}.txt")

    try:
        with open(output_trans_path, 'w', encoding='utf-8') as f:
            for text_string, timeinterval, spk in sentence_info_with_spk:
                f.write(f'Speaker_{spk}: [{timeinterval[0]:.3f} {timeinterval[1]:.3f}] {text_string}\n')
    except Exception as e:
        logger.warning(f"Failed to write transcript to {output_trans_path}: {e}")

    logger.info("Starting LLM Inference for meeting summary...")

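    # Rebind the global `tokenizer` (previously the SentencePiece ASR tokenizer) to the
    # LLM's HuggingFace tokenizer used by run_model, and load the LLM config.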
    tokenizer = AutoTokenizer.from_pretrained(llm_hf_tokenizer_path)
    cfg = AutoConfig.from_pretrained(llm_hf_tokenizer_path, trust_remote_code=True)

    # Pass the EOS token id list to decode only when the config provides more than one.
    eos_token_id = None
    if isinstance(cfg.eos_token_id, list) and len(cfg.eos_token_id) > 1:
        eos_token_id = cfg.eos_token_id

    with open(output_trans_path, 'r', encoding='utf-8') as file:
        prompt = file.read()

    prompt_list = prompt.split('\n')

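    # Feed the transcript to the LLM in chunks of roughly 5000 characters and
    # collect a partial summary for each chunk.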
    output_text_l = []
    cur_prompt = ""
    for idx, prompt in enumerate(prompt_list):
        cur_prompt += prompt + "\n"

        if len(cur_prompt) < 5000 and idx != len(prompt_list) - 1:
            continue

        output_text = run_model(cur_prompt)
        output_text_l.append(output_text)
        print("\n")

        cur_prompt = ""

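    # Join the chunk summaries and save them as a Markdown file.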
    output_summary_path = os.path.join(args.output_dir, f"{os.path.basename(wav_file)}_summary.md")
    output_text_combined = "\n\n".join(output_text_l)
    with open(output_summary_path, 'w', encoding='utf-8') as f:
        f.write(output_text_combined)

    logger.info(f"LLM Inference completed. Summary saved to {output_summary_path}")