---
license: mit
library_name: transformers
datasets:
- Fintech-Dreamer/FinSynth_data
language:
- en
- zh
base_model:
- deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
pipeline_tag: text-generation
tags:
- finance
---

# 金融欺诈检测机器人

## 使用方式

```python
import json
import os
import re
import traceback

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Sets KMP_DUPLICATE_LIB_OK for the process; presumably this works around the
# "duplicate OpenMP runtime" abort seen on some installs — TODO confirm.
# NOTE(review): this runs after `torch` has already been imported above; if
# the variable is only read when the OpenMP runtime loads, it may need to be
# set before that import — confirm on an affected machine.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Model identifier handed to `from_pretrained` for both tokenizer and model.
MODEL_PATH = "Fintech-Dreamer/FinSynth_model_fraud"


def generate_response(model, tokenizer, instruction, input_text, max_length=2048):
    """Generate the model's answer for one instruction/input pair.

    Args:
        model: Loaded language model; must expose ``device`` and ``generate``.
        tokenizer: Tokenizer that belongs to ``model``.
        instruction: Task description, placed first in the user turn.
        input_text: Content to analyse, appended after ``instruction`` on a
            new line.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (default 2048). Per the Hugging Face docs this bounds prompt
            plus generated tokens, not the generated part alone.

    Returns:
        A ``(prompt, response)`` tuple: the exact prompt string that was
        tokenized, and the decoded, whitespace-stripped text generated
        after it.
    """
    # Single-turn chat layout: begin-of-sentence marker, the user turn, then
    # the assistant marker at which the model is expected to start answering.
    prompt = f"<|begin of sentence|><|User|>{instruction}\n{input_text}<|Assistant|>"

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
    inputs = inputs.to(model.device)

    # Remember how many tokens belong to the prompt so they can be cut off the
    # output by position rather than by searching the decoded text.
    prompt_len = inputs["input_ids"].shape[1]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,  # a single candidate answer
            do_sample=True,  # sample instead of greedy decoding
            temperature=0.6,
            top_p=0.95,  # nucleus sampling threshold
            pad_token_id=tokenizer.eos_token_id,  # pad with the EOS token
            use_cache=True,
        )

    # Assumes a decoder-only model whose output sequence starts with the
    # prompt tokens (the script loads it via AutoModelForCausalLM). Slicing by
    # length keeps the prompt out of the answer even when the role markers do
    # not survive decoding with skip_special_tokens=True.
    generated = outputs[0][prompt_len:]
    response = tokenizer.decode(
        generated,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # If the model starts another assistant turn on its own, keep only the
    # first answer (same cut-off the previous text-based split applied).
    response = response.split("<|Assistant|>", 1)[0].strip()

    return prompt, response


def _predict_label(response):
    """Map the model's free-text answer to the printed fraud label.

    The prompt asks for a plain "yes" or "no", so the first standalone
    yes/no word decides. Answers phrased as a sentence fall back to the
    older heuristic: "fraudulent" present and not negated.

    Args:
        response: Decoded model output.

    Returns:
        ``"欺诈"`` (fraud) or ``"非欺诈"`` (not fraud).
    """
    # NOTE(review): the base model is an R1 distill, which presumably emits
    # its reasoning before a closing </think> tag; only the text after the
    # last such tag is judged. Harmless when the tag is absent — confirm
    # against real outputs.
    answer = response.rsplit("</think>", 1)[-1].lower()

    verdict = re.search(r"\b(yes|no)\b", answer)
    if verdict:
        return "欺诈" if verdict.group(1) == "yes" else "非欺诈"

    # Word-boundary match so that "note" or "another" do not count as "not".
    negated = re.search(r"\bnot\b|n't", answer) is not None
    if "fraudulent" in answer and not negated:
        return "欺诈"
    return "非欺诈"


def process_test_data():
    """Load the model, run one hard-coded sample through it and print the result.

    Steps:
        - load the tokenizer and model named by ``MODEL_PATH``
        - build a fixed instruction and transaction description
        - generate a prediction with ``generate_response``
        - print the prompt, the raw answer, a derived label and a JSON record

    Any exception raised while handling the sample is printed together with
    its traceback and is not re-raised.

    Returns:
        None; all results are printed.
    """
    print(f"加载模型: {MODEL_PATH}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",  # let the loader choose the device placement
        torch_dtype=torch.bfloat16,  # load weights in bfloat16
        use_cache=True,
    )

    # Switch the model to evaluation mode before inference.
    model.eval()

    print("开始生成预测...")

    try:
        # Task description and the transaction to be judged.
        instruction = "Combine your financial knowledge to carefully analyze whether this is a fraudulent transaction,just answer yes or no"
        input_text = "The transaction involves a 20-year-old customer with an income of $0.1, who has been at their current address for 46 months. The request was made 0.03 days ago, and the intended amount is $48.49. The payment type is 'AA', and the email used is from a free provider. The customer's bank account is 6 months old, and they do not have other credit cards. The proposed credit limit is $1500, and the request was made via the internet. The session lasted 3.36 minutes, and the device used had 1 distinct email in the last 8 weeks. The device has no prior fraud records."

        print("\n正在生成预测...")

        full_prompt, response = generate_response(model, tokenizer, instruction, input_text)

        # Full, untruncated prompt and answer.
        print("\n===== 完整输入输出 =====")
        print(f"提示词: {full_prompt}")
        print(f"\n预测结果: {response}")

        # Short summary: derived label and answer length in characters.
        print("\n===== 简要结果 =====")
        print(f"预测标签: {_predict_label(response)}")
        print(f"输出长度: {len(response)} 字符")

        # Emit real JSON (not a dict repr) so the line can be parsed later;
        # ensure_ascii=False keeps non-ASCII text readable.
        print("\n===== JSON格式 =====")
        result_json = {"prompt": full_prompt, "predict": response}
        print(json.dumps(result_json, ensure_ascii=False))

    except Exception as e:
        # Report the failure with its traceback instead of crashing the script.
        print(f"\n处理样本时出错: {str(e)}")
        traceback.print_exc()

    print("\n预测完成!")
    return None


def main():
    """Script entry point: print a banner, then run the prediction demo."""
    print("===== 模型调用 =====")
    process_test_data()


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
```

## 数据集参考

[Fintech-Dreamer/FinSynth_data · Datasets at Hugging Face](https://huggingface.co/datasets/Fintech-Dreamer/FinSynth_data)

## 前端框架参考

[Fintech-Dreamer/FinSynth](https://github.com/Fintech-Dreamer/FinSynth)

## 数据处理方式参考

[Fintech-Dreamer/FinSynth-Data-Processing](https://github.com/Fintech-Dreamer/FinSynth-Data-Processing)