Fintech-Dreamer
/

FinSynth_model_fraud

 pipeline_tag: text-generation
 tags:
 - finance
+---
+# 金融欺诈检测机器人
+## 使用方式
+```python
+import os
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+# NOTE(review): presumably set so the process tolerates more than one OpenMP
+# runtime being loaded — confirm it is still required on the target platform.
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+# Model identifier handed to the from_pretrained() loaders below.
+MODEL_PATH = "Fintech-Dreamer/FinSynth_model_fraud"
+def generate_response(model, tokenizer, instruction, input_text, max_length=2048):
+    """
+    使用模型生成回答
+    参数：
+        model: 加载的语言模型实例
+        tokenizer: 模型对应的分词器
+        instruction: 指令部分文本，一般是任务描述
+        input_text: 输入文本，一般是需要分析的内容
+        max_length: 生成文本的最大长度，默认为2048个token
+    返回：
+        prompt: 完整的输入提示词
+        response: 模型生成的回答
+    """
+    # 构造提示词格式 - 使用特殊标记组织对话形式
+    # <｜begin of sentence｜>标记句子开始，<｜User｜>和<｜Assistant｜>分别标记用户和助手角色
+    prompt = f"<｜begin of sentence｜><｜User｜>{instruction}\n{input_text}<｜Assistant｜>"
+    # 编码输入，将文本转换为模型可以理解的token序列
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
+    # 将输入移动到模型所在的设备（CPU或GPU）
+    inputs = inputs.to(model.device)
+    # 使用torch.no_grad()避免计算梯度，节省内存并加速推理过程
+    with torch.no_grad():
+        # 调用模型的generate方法生成回答
+        outputs = model.generate(
+            **inputs,
+            max_length=max_length,  # 设置生成文本的最大长度
+            num_return_sequences=1,  # 只返回一个生成序列
+            do_sample=True,  # 使用采样策略，增加多样性
+            temperature=0.6,  # 温度参数，控制生成文本的随机性（较低的值使输出更确定）
+            top_p=0.95,  # 使用nucleus sampling，只考虑概率和超过0.95的token
+            pad_token_id=tokenizer.eos_token_id,  # 将填充标记设置为结束标记
+            use_cache=True,  # 使用缓存加速生成过程
+        )
+    # 将生成的token序列解码为文本
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    # 提取回答部分（去除提示词部分）
+    if "<｜Assistant｜>" in response:
+        response = response.split("<｜Assistant｜>")[1].strip()
+    return prompt, response
+def process_test_data():
+    """
+    处理测试数据集并生成预测结果
+    功能：
+    - 加载测试数据集
+    - 初始化模型和分词器
+    - 对每个测试样本进行预测
+    - 输出预测结果
+    返回：
+        None，结果直接打印
+    """
+    # 加载测试数据
+    # 加载模型和分词器
+    print(f"加载模型: {MODEL_PATH}")
+    # 加载预训练的分词器，用于将文本转换为token
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+    # 加载预训练的语言模型，设置为自动选择设备，使用bfloat16精度以提高性能
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        device_map="auto",  # 自动选择可用的设备（CPU/GPU）
+        torch_dtype=torch.bfloat16,  # 使用bfloat16精度，在保持准确性的同时减少内存占用
+        use_cache=True,  # 启用缓存以提高生成速度
+    )
+    # 设置模型为评估模式，关闭dropout等训练特性，提高推理性能
+    model.eval()
+    # 处理每个测试样本
+    print("开始生成预测...")
+    try:
+        # 提取指令和输入文本
+        instruction = "Combine your financial knowledge to carefully analyze whether this is a fraudulent transaction，just answer yes or no"  # 指令部分，告诉模型要执行的任务
+        input_text = "The transaction involves a 20-year-old customer with an income of $0.1, who has been at their current address for 46 months. The request was made 0.03 days ago, and the intended amount is $48.49. The payment type is 'AA', and the email used is from a free provider. The customer's bank account is 6 months old, and they do not have other credit cards. The proposed credit limit is $1500, and the request was made via the internet. The session lasted 3.36 minutes, and the device used had 1 distinct email in the last 8 weeks. The device has no prior fraud records."  # 输入文本，需要分析的内容
+        print("\n正在生成预测...")
+        # 调用generate_response函数生成预测
+        full_prompt, response = generate_response(model, tokenizer, instruction, input_text)
+        # 打印完整的预测结果，不截断
+        print("\n===== 完整输入输出 =====")
+        print(f"提示词: {full_prompt}")
+        print(f"\n预测结果: {response}")
+        # 简要展示关键结果
+        print("\n===== 简要结果 =====")
+        # 简单的欺诈判断逻辑：检查输出中是否包含"fraudulent"且不包含"not"
+        print(f"预测标签: {'欺诈' if 'fraudulent' in response.lower() and 'not' not in response.lower() else '非欺诈'}")
+        print(f"输出长度: {len(response)} 字符")
+        # 以JSON格式输出结果，便于后续处理或保存
+        print("\n===== JSON格式 =====")
+        result_json = {"prompt": full_prompt, "predict": response}
+        print(result_json)
+    except Exception as e:
+        # 异常处理，确保一个样本的错误不会导致整个程序崩溃
+        print(f"\n处理样本时出错: {str(e)}")
+        import traceback
+        traceback.print_exc()  # 打印详细错误信息，便于调试
+    print("\n预测完成！")
+    return None
+def main():
+    """
+    主函数，程序入口点
+    功能：
+    - 启动测试数据处理和预测流程
+    """
+    print("===== 模型调用 =====")
+    process_test_data()
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+```
+## 数据集参考
+[Fintech-Dreamer/FinSynth_data · Datasets at Hugging Face](https://huggingface.co/datasets/Fintech-Dreamer/FinSynth_data)
+## 前端框架参考
+[Fintech-Dreamer/FinSynth](https://github.com/Fintech-Dreamer/FinSynth)
+## 数据处理方式参考
+[Fintech-Dreamer/FinSynth-Data-Processing](https://github.com/Fintech-Dreamer/FinSynth-Data-Processing)