fengxb30
/

FinGPT_TaskII_Compliance

Safetensors

qwen3

Model card Files Files and versions

xet

Community

fengxb30 committed on Nov 12, 2025

Commit

776d8e5

verified ·

1 Parent(s): 56db8f3

Delete FinGPT_TaskII_Submission/scripts/task_2_finetune.py

Browse files

Files changed (1) hide show

FinGPT_TaskII_Submission/scripts/task_2_finetune.py +0 -361

FinGPT_TaskII_Submission/scripts/task_2_finetune.py DELETED Viewed

@@ -1,361 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-# # FinAI Contest Task 2 – LoRA Fine-Tuning Walk-through
-#
-# (Note: content modified to use Fin-o1-8B instead of Llama-3.1)
-#
-# This notebook walks through **data preparation, fine-tuning, and inference** for FinAI Task 2 using the [FinLoRA](https://github.com/Open-Finance-Lab/FinLoRA) framework.
-#
-# **Target Tasks:**
-# - CFA exams
-# - BloombergGPT public benchmarks
-# - XBRL tasks
-# ## 1. Environment Setup
-#
-# **Prerequisites:**
-# - NVIDIA GPU with ≥ 24 GB VRAM (8-bit) or ≥ 16 GB VRAM (4-bit)
-# - CUDA ≥ 11.8
-# - Alternatively, use runpod.io (see FinLoRA docs for instructions on using it: https://finlora-docs.readthedocs.io/en/latest/tutorials/setup.html)
-import subprocess
-import os
-import json
-import random
-from pathlib import Path
-# Libraries required by step 5 (adapter loading and inference).
-# They are imported up front so that missing dependencies are reported early.
-try:
-    from transformers import AutoTokenizer, AutoModelForCausalLM
-    from peft import PeftModel
-    import torch
-except ImportError:
-    # Only warn here: steps 1-4 do not reference these names, so the script
-    # can still run up to the inference step without them.
-    print("Warning: 'transformers', 'peft', or 'torch' not found.")
-    print("Please ensure the environment is set up correctly before reaching step 5.\n")
-def step_1_setup_environment():
-    """
-    克隆 FinLoRA 仓库并安装依赖。
-    """
-    print("--- 1. Environment Setup ---")
-    try:
-        # Clone FinLoRA and install dependencies
-        print("Cloning FinLoRA...")
-        subprocess.run(["git", "clone", "https://github.com/Open-Finance-Lab/FinLoRA.git"], check=True)
-        print("Changing directory to FinLoRA...")
-        os.chdir("FinLoRA")
-        # Option A - bash script
-        print("Running setup.sh...")
-        # 使用 shell=True 来处理 '&&' 链式命令
-        subprocess.run("chmod +x setup.sh && ./setup.sh", shell=True, check=True)
-        # Option B - conda (alternative)
-        # # !conda env create -f environment.yml && conda activate finenv
-        print("\n--- Environment Setup Complete ---")
-        # *** 已移除 ***
-        # Fin-o1-8B 不是受控模型，不需要 huggingface-cli login
-    except FileNotFoundError as e:
-        print(f"Error: Command not found. Is git installed? {e}")
-    except subprocess.CalledProcessError as e:
-        print(f"An error occurred during environment setup: {e}")
-    except Exception as e:
-        print(f"An unexpected error occurred: {e}")
-    print("----------------------------\n")
-# ## 2. Data Preparation
-#
-# **Data Sources to Collect:**
-# - CFA mock-exam PDFs or CSVs
-# - BloombergGPT benchmark datasets (FPB, FiQA SA, Headline, NER, ConvFinQA)
-# - XBRL corpora for tag/value/formula tasks
-#
-# **Required Format:** JSONL with `{"context": "<question>", "target": "<answer>"}`
-def step_2_prepare_data():
-    """
-    将原始数据文件分割为训练集和测试集。
-    """
-    print("--- 2. Data Preparation ---")
-    # Assume you have collected raw Q&A pairs in 'finai_raw.jsonl'
-    # Each line should be: {"context": "question", "target": "answer"}
-    # Read raw data
-    raw_file = Path('data/finai_raw.jsonl')  # Update path as needed
-    if raw_file.exists():
-        print(f"Reading raw data from {raw_file}...")
-        with open(raw_file, 'r', encoding='utf-8') as f:
-            lines = f.read().splitlines()
-        # Shuffle for random split
-        random.seed(42)
-        random.shuffle(lines)
-        # 80/20 split
-        n = len(lines)
-        n_train = int(0.8 * n)
-        train_lines = lines[:n_train]
-        test_lines = lines[n_train:]
-        # Create directories
-        Path('data/train').mkdir(parents=True, exist_ok=True)
-        Path('data/test').mkdir(parents=True, exist_ok=True)
-        train_path = Path('data/train/finai_train.jsonl')
-        test_path = Path('data/test/finai_test.jsonl')
-        # Save splits
-        with open(train_path, 'w', encoding='utf-8') as f:
-            f.write('\n'.join(train_lines) + '\n')
-        with open(test_path, 'w', encoding='utf-8') as f:
-            f.write('\n'.join(test_lines) + '\n')
-        print(f"Split {n} examples into {len(train_lines)} (train) and {len(test_lines)} (test)")
-        print(f"Train data saved to: {train_path}")
-        print(f"Test data saved to: {test_path}")
-    else:
-        print(f"Warning: Raw data file not found at {raw_file}")
-        print("Please create this file with your collected Q&A pairs first.")
-        print("Skipping data split.")
-    print("----------------------------\n")
-# ## 3. Configure Fine-Tuning
-#
-# Add configuration to `finetune_configs.json` for your FinAI model.
-def step_3_configure_finetuning():
-    """
-    向 lora/finetune_configs.json 添加此任务的配置。
-    """
-    print("--- 3. Configure Fine-Tuning ---")
-    config_file = Path('lora/finetune_configs.json')
-    # *** 已更改 ***
-    # 更新配置名称以反映新模型
-    config_name = "finai_fino1_8b_8bits_r8_lora"
-    # 路径保持不变，使用第 2 步中创建的数据
-    dataset_path = "../data/train/finai_train.jsonl"
-    if not config_file.exists():
-        print(f"Error: Config file not found at {config_file}")
-        print("Please ensure you are in the 'FinLoRA' directory (created in step 1).")
-        return False
-    try:
-        # Read existing config
-        with open(config_file, 'r') as f:
-            configs = json.load(f)
-        # Add competition fine-tuned model configuration
-        configs[config_name] = {
-            #change
-            "base_model": "The-FinAI/Fin-o1-8B",
-            "dataset_path": dataset_path,
-            "lora_r": 8,
-            "quant_bits": 8,
-            "learning_rate": 1e-4,
-            "num_epochs": 4,
-            "batch_size": 2,
-            "gradient_accumulation_steps": 2
-        }
-        # Save updated config
-        with open(config_file, 'w') as f:
-            json.dump(configs, f, indent=2)
-        print(f"Successfully updated {config_file} with config: '{config_name}'")
-        print("----------------------------\n")
-        return True
-    except Exception as e:
-        print(f"An error occurred while updating config file: {e}")
-        print("----------------------------\n")
-        return False
-# ## 4. Run Fine-Tuning
-#
-# This will take some time depending on your dataset size and GPU setup.
-def step_4_run_finetuning():
-    """
-    执行微调脚本。
-    """
-    print("--- 4. Run Fine-Tuning ---")
-    # *** 已更改 ***
-    # 使用在第 3 步中定义的新配置名称
-    config_name = "finai_fino1_8b_8bits_r8_lora"
-    try:
-        print("Changing directory to 'lora'...")
-        os.chdir("lora")
-        # Fetch DeepSpeed configs
-        print("Fetching DeepSpeed configs...")
-        subprocess.run(["axolotl", "fetch", "deepspeed_configs"], check=True)
-        # Run the fine-tuning
-        print(f"\nRunning fine-tuning for config: {config_name}...")
-        print("This may take a long time...")
-        subprocess.run(["python", "finetune.py", config_name], check=True)
-        print("\nFine-tuning complete.")
-    except FileNotFoundError:
-        print("Error: 'lora' directory not found or 'axolotl'/'python' command not found.")
-        print("Please ensure you are in the 'FinLoRA' directory and setup was successful.")
-    except subprocess.CalledProcessError as e:
-        print(f"An error occurred during the fine-tuning process: {e}")
-    except Exception as e:
-        print(f"An unexpected error occurred: {e}")
-    finally:
-        # 无论成功与否，都尝试返回上一级目录，以便第 5 步可以正常运行
-        if Path.cwd().name == "lora":
-            print("Changing directory back to FinLoRA root...")
-            os.chdir("..")
-    print("----------------------------\n")
-# ## 5. Load Adapter & Run Inference
-#
-# Once fine-tuning is complete, you can run inferences as follows.
-def step_5_run_inference():
-    """
-    加载微调后的适配器并运行推理测试。
-    """
-    print("--- 5. Load Adapter & Run Inference ---")
-    try:
-        # Load base model and tokenizer
-        # *** 已更改 ***
-        base_model_name = "The-FinAI/Fin-o1-8B"
-        # *** 已更改 ***
-        # 适配器路径 (adapter_path) 取决于配置名称。
-        config_name_for_path = "finai_fino1_8b_8bits_r8_lora"
-        adapter_path = Path(f"axolotl-output/{config_name_for_path}")
-        if not adapter_path.exists():
-            print(f"Error: Adapter path not found: {adapter_path}")
-            print("This likely means the fine-tuning step (Step 4) failed or was skipped.")
-            return
-        print(f"Loading base model: {base_model_name}...")
-        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
-        base_model = AutoModelForCausalLM.from_pretrained(
-            base_model_name,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            trust_remote_code=True
-        )
-        print(f"Loading LoRA adapter from: {adapter_path}...")
-        # Load and apply the LoRA adapter
-        model = PeftModel.from_pretrained(base_model, str(adapter_path))
-        print("\nAdapter loaded successfully. Running inference test...")
-        # Test with sample questions
-        test_questions = [
-            "What is the primary purpose of a cash flow hedge under IFRS?",
-            "Explain the concept of economic value added (EVA).",
-            "How do you calculate the price-to-earnings ratio?"
-        ]
-        for question in test_questions:
-            print(f"\nQuestion: {question}")
-            inputs = tokenizer(question, return_tensors="pt").to(model.device)
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=100,  # 添加 max_new_tokens 以免生成过长
-                    pad_token_id=tokenizer.eos_token_id
-                )
-            # 响应包含原始问题，我们将其剥离
-            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            answer = response[len(question):].strip()
-            print(f"Answer: {answer}")
-            print("-" * 80)
-    except ImportError:
-        print("Error: 'transformers', 'peft', or 'torch' not found.")
-        print("Please ensure the environment setup in step 1 was successful.")
-    except Exception as e:
-        print(f"An error occurred during inference: {e}")
-    print("----------------------------\n")
-def main():
-    """
-    按顺序执行 FinLoRA 微调的所选步骤。
-    """
-    # 检查是否在 FinLoRA 目录中
-    if Path.cwd().name == "FinLoRA":
-        print("Already in FinLoRA directory. Skipping Step 1 (Setup).")
-        # 假设环境已设置，直接从数据准备开始
-        step_2_prepare_data()
-        if step_3_configure_finetuning():
-            step_4_run_finetuning()
-            step_5_run_inference()
-        else:
-            print("Configuration failed. Aborting.")
-    elif Path("FinLoRA").exists():
-        print("Found 'FinLoRA' directory. Changing directory and proceeding.")
-        os.chdir("FinLoRA")
-        # *** 已移除 ***
-        # Fin-o1-8B 不需要登录
-        step_2_prepare_data()
-        if step_3_configure_finetuning():
-            step_4_run_finetuning()
-            step_5_run_inference()
-        else:
-            print("Configuration failed. Aborting.")
-    else:
-        # 运行完整的设置步骤
-        print("Starting from scratch...")
-        step_1_setup_environment()
-        # 检查设置是否成功（即 FinLoRA 目录现在是否存在）
-        if Path.cwd().name == "FinLoRA":
-            step_2_prepare_data()
-            if step_3_configure_finetuning():
-                step_4_run_finetuning()
-                step_5_run_inference()
-            else:
-                print("Configuration failed. Aborting.")
-        else:
-            print("Step 1 (Setup) seems to have failed. Aborting.")
-    print("Script finished.")
-if __name__ == "__main__":
-    main()