fengxb30 committed on
Commit
776d8e5
·
verified ·
1 Parent(s): 56db8f3

Delete FinGPT_TaskII_Submission/scripts/task_2_finetune.py

Browse files
FinGPT_TaskII_Submission/scripts/task_2_finetune.py DELETED
@@ -1,361 +0,0 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- # # FinAI Contest Task 2 – LoRA Fine-Tuning Walk-through
5
- #
6
- # (内容已修改:使用 Fin-o1-8B 替代 Llama-3.1)
7
- #
8
- # This notebook walks through **data preparation, fine-tuning, and inference** for FinAI Task 2 using the [FinLoRA](https://github.com/Open-Finance-Lab/FinLoRA) framework.
9
- #
10
- # **Target Tasks:**
11
- # - CFA exams
12
- # - BloombergGPT public benchmarks
13
- # - XBRL tasks
14
-
15
- # ## 1. Environment Setup
16
- #
17
- # **Prerequisites:**
18
- # - NVIDIA GPU with ≥ 24 GB VRAM (8-bit) or ≥ 16 GB VRAM (4-bit)
19
- # - CUDA ≥ 11.8
20
- # - Alternatively, use runpod.io (see FinLoRA docs for instructions on using it: https://finlora-docs.readthedocs.io/en/latest/tutorials/setup.html)
21
-
22
- import subprocess
23
- import os
24
- import json
25
- import random
26
- from pathlib import Path
27
-
28
- # 导入第 5 步所需的库
29
- # 提前导入,以便用户可以先检查依赖
30
- try:
31
- from transformers import AutoTokenizer, AutoModelForCausalLM
32
- from peft import PeftModel
33
- import torch
34
- except ImportError:
35
- print("Warning: 'transformers', 'peft', or 'torch' not found.")
36
- print("Please ensure the environment is set up correctly before reaching step 5.\n")
37
-
38
-
39
def step_1_setup_environment():
    """Clone the FinLoRA repository and run its ``setup.sh`` script.

    On success the process working directory is left inside the freshly
    cloned ``FinLoRA`` checkout, because the later steps use paths that are
    relative to it.  On any failure the working directory that was current
    when the function was called is restored, so a caller that inspects
    ``Path.cwd()`` cannot mistake a half-finished setup (clone succeeded,
    ``setup.sh`` failed) for a successful one.

    All errors are reported on stdout rather than raised.

    Returns:
        bool: ``True`` when both the clone and the setup script succeeded,
        ``False`` otherwise.
    """
    print("--- 1. Environment Setup ---")
    start_dir = Path.cwd()
    succeeded = False
    try:
        # Clone FinLoRA and install dependencies
        print("Cloning FinLoRA...")
        subprocess.run(
            ["git", "clone", "https://github.com/Open-Finance-Lab/FinLoRA.git"],
            check=True,
        )

        print("Changing directory to FinLoRA...")
        os.chdir("FinLoRA")

        # Option A - bash script.
        # shell=True is needed for the '&&' chain; the command is a fixed
        # literal, so no untrusted input reaches the shell.
        print("Running setup.sh...")
        subprocess.run("chmod +x setup.sh && ./setup.sh", shell=True, check=True)

        # Option B - conda (alternative):
        #   conda env create -f environment.yml && conda activate finenv

        print("\n--- Environment Setup Complete ---")

        # No `huggingface-cli login` step: Fin-o1-8B is not a gated model.
        succeeded = True
    except FileNotFoundError as e:
        print(f"Error: Command not found. Is git installed? {e}")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during environment setup: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        if not succeeded:
            # Undo the chdir (if it happened) so the failure stays visible
            # to callers that look at the working directory.
            os.chdir(start_dir)
    print("----------------------------\n")
    return succeeded
72
-
73
-
74
- # ## 2. Data Preparation
75
- #
76
- # **Data Sources to Collect:**
77
- # - CFA mock-exam PDFs or CSVs
78
- # - BloombergGPT benchmark datasets (FPB, FiQA SA, Headline, NER, ConvFinQA)
79
- # - XBRL corpora for tag/value/formula tasks
80
- #
81
- # **Required Format:** JSONL with `{\"context\": \"<question>\", \"target\": \"<answer>\"}`
82
-
83
def step_2_prepare_data(raw_file=Path('data/finai_raw.jsonl'), train_ratio=0.8, seed=42):
    """Shuffle the raw JSONL examples and write a train/test split.

    Each non-blank line of ``raw_file`` is treated as one example (expected
    form: ``{"context": "question", "target": "answer"}``; the content is
    not parsed here).  The examples are shuffled deterministically and
    written to ``data/train/finai_train.jsonl`` and
    ``data/test/finai_test.jsonl`` relative to the working directory.

    Args:
        raw_file: Path of the raw JSONL file to split.
        train_ratio: Fraction of examples, between 0 and 1 inclusive, that
            goes to the training split; the rest goes to the test split.
        seed: Seed for the shuffle, so the split is reproducible.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    print("--- 2. Data Preparation ---")

    raw_file = Path(raw_file)
    if not raw_file.exists():
        print(f"Warning: Raw data file not found at {raw_file}")
        print("Please create this file with your collected Q&A pairs first.")
        print("Skipping data split.")
        print("----------------------------\n")
        return

    print(f"Reading raw data from {raw_file}...")
    with open(raw_file, 'r', encoding='utf-8') as f:
        # Iterate the file instead of using str.splitlines(): splitlines()
        # also breaks on characters such as U+2028 or U+0085, which may
        # legally appear unescaped inside a JSON string.  Blank lines are
        # dropped so they do not turn into empty records.
        lines = [line.rstrip('\n') for line in f if line.strip()]

    if not lines:
        print(f"Warning: No examples found in {raw_file}")
        print("Skipping data split.")
        print("----------------------------\n")
        return

    # A private generator keeps the split reproducible without reseeding
    # the module-level random state for the rest of the program.
    random.Random(seed).shuffle(lines)

    n = len(lines)
    n_train = int(train_ratio * n)
    train_lines = lines[:n_train]
    test_lines = lines[n_train:]

    train_path = Path('data/train/finai_train.jsonl')
    test_path = Path('data/test/finai_test.jsonl')

    for split_path, split_lines in ((train_path, train_lines), (test_path, test_lines)):
        split_path.parent.mkdir(parents=True, exist_ok=True)
        with open(split_path, 'w', encoding='utf-8') as f:
            # One newline-terminated record per line; an empty split yields
            # an empty file rather than a file holding one blank line.
            f.writelines(f"{line}\n" for line in split_lines)

    print(f"Split {n} examples into {len(train_lines)} (train) and {len(test_lines)} (test)")
    print(f"Train data saved to: {train_path}")
    print(f"Test data saved to: {test_path}")
    if not train_lines or not test_lines:
        print("Warning: one of the splits is empty; add more examples to the raw file.")

    print("----------------------------\n")
133
-
134
-
135
- # ## 3. Configure Fine-Tuning
136
- #
137
- # Add configuration to `finetune_configs.json` for your FinAI model.
138
-
139
def step_3_configure_finetuning(config_name="finai_fino1_8b_8bits_r8_lora",
                                dataset_path="../data/train/finai_train.jsonl"):
    """Register this task's fine-tuning entry in ``lora/finetune_configs.json``.

    The existing JSON object is loaded, the entry ``config_name`` is added
    (or overwritten), and the file is written back.  Must be run from the
    FinLoRA repository root, since the config path is relative.

    Args:
        config_name: Key under which the configuration is stored.
        dataset_path: Training-data path recorded in the entry.  The
            default is relative to the ``lora`` directory.

    Returns:
        bool: ``True`` if the file was updated, ``False`` if it is missing,
        unreadable, not valid JSON, or its top level is not a JSON object.
    """
    print("--- 3. Configure Fine-Tuning ---")

    config_file = Path('lora/finetune_configs.json')

    if not config_file.exists():
        print(f"Error: Config file not found at {config_file}")
        print("Please ensure you are in the 'FinLoRA' directory (created in step 1).")
        return False

    try:
        # Read existing config
        with open(config_file, 'r', encoding='utf-8') as f:
            configs = json.load(f)

        if not isinstance(configs, dict):
            raise ValueError(
                f"expected a JSON object at the top level, got {type(configs).__name__}"
            )

        # Add competition fine-tuned model configuration
        configs[config_name] = {
            "base_model": "The-FinAI/Fin-o1-8B",
            "dataset_path": dataset_path,
            "lora_r": 8,
            "quant_bits": 8,
            "learning_rate": 1e-4,
            "num_epochs": 4,
            "batch_size": 2,
            "gradient_accumulation_steps": 2,
        }

        # Serialise fully before opening the file for writing, so that a
        # serialisation error cannot leave a truncated config behind.
        payload = json.dumps(configs, indent=2)
        with open(config_file, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (OSError, ValueError, TypeError) as e:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"An error occurred while updating config file: {e}")
        print("----------------------------\n")
        return False

    print(f"Successfully updated {config_file} with config: '{config_name}'")
    print("----------------------------\n")
    return True
189
-
190
-
191
- # ## 4. Run Fine-Tuning
192
- #
193
- # This will take some time depending on your dataset size and GPU setup.
194
-
195
def step_4_run_finetuning(config_name="finai_fino1_8b_8bits_r8_lora"):
    """Run FinLoRA's ``finetune.py`` for the given configuration.

    Changes into the ``lora`` directory, fetches the DeepSpeed configs via
    ``axolotl``, launches ``finetune.py`` and then always returns to the
    directory that was current on entry, so the following step can rely on
    relative paths.  Errors are reported on stdout rather than raised.

    Args:
        config_name: Name of the entry in ``finetune_configs.json`` to train.

    Returns:
        bool: ``True`` if both subprocesses completed successfully,
        ``False`` otherwise.
    """
    import sys  # local import: only needed to locate the running interpreter

    print("--- 4. Run Fine-Tuning ---")

    start_dir = Path.cwd()
    succeeded = False

    try:
        print("Changing directory to 'lora'...")
        os.chdir("lora")

        # Fetch DeepSpeed configs
        print("Fetching DeepSpeed configs...")
        subprocess.run(["axolotl", "fetch", "deepspeed_configs"], check=True)

        # Run the fine-tuning with the interpreter that is executing this
        # script, so the child sees the same installed packages instead of
        # whatever "python" happens to resolve to on PATH.
        print(f"\nRunning fine-tuning for config: {config_name}...")
        print("This may take a long time...")
        subprocess.run([sys.executable or "python", "finetune.py", config_name], check=True)

        print("\nFine-tuning complete.")
        succeeded = True

    except FileNotFoundError:
        print("Error: 'lora' directory not found or 'axolotl'/'python' command not found.")
        print("Please ensure you are in the 'FinLoRA' directory and setup was successful.")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during the fine-tuning process: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    finally:
        # Whether or not training succeeded, go back to where we started so
        # that step 5 runs from the expected directory.  Restoring the saved
        # path is more reliable than matching the directory name and
        # stepping to "..".
        if Path.cwd() != start_dir:
            print("Changing directory back to FinLoRA root...")
            os.chdir(start_dir)

    print("----------------------------\n")
    return succeeded
234
-
235
-
236
- # ## 5. Load Adapter & Run Inference
237
- #
238
- # Once fine-tuning is complete, you can run inferences as follows.
239
-
240
def step_5_run_inference(config_name="finai_fino1_8b_8bits_r8_lora", max_new_tokens=100):
    """Load the fine-tuned LoRA adapter and answer a few sample questions.

    The adapter is looked up at ``axolotl-output/<config_name>`` relative to
    the working directory.  If it is missing, the function reports that and
    returns without loading any model.  Otherwise the base model is loaded
    in float16, the adapter is applied, and each sample question is
    answered and printed.  Errors are reported on stdout rather than raised.

    Args:
        config_name: Fine-tuning configuration name; it determines the
            adapter directory.
        max_new_tokens: Upper bound on generated tokens per question, to
            keep the answers short.

    Returns:
        bool: ``True`` if inference ran for all sample questions, ``False``
        otherwise.
    """
    print("--- 5. Load Adapter & Run Inference ---")

    try:
        base_model_name = "The-FinAI/Fin-o1-8B"

        # The adapter location depends on the configuration name.
        # NOTE(review): assumes finetune.py writes its output here relative
        # to the FinLoRA root - confirm against the training script.
        adapter_path = Path(f"axolotl-output/{config_name}")

        if not adapter_path.exists():
            print(f"Error: Adapter path not found: {adapter_path}")
            print("This likely means the fine-tuning step (Step 4) failed or was skipped.")
            return False

        # Import here instead of relying on the guarded module-level
        # import: if those packages are missing, the module-level names are
        # simply undefined and would surface as NameError, never reaching
        # the ImportError handler below.
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, AutoTokenizer

        print(f"Loading base model: {base_model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(base_model_name)
        # NOTE: trust_remote_code=True allows code shipped in the model
        # repository to execute locally; only use it with trusted models.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        print(f"Loading LoRA adapter from: {adapter_path}...")
        # Load and apply the LoRA adapter
        model = PeftModel.from_pretrained(base_model, str(adapter_path))

        print("\nAdapter loaded successfully. Running inference test...")

        # Test with sample questions
        test_questions = [
            "What is the primary purpose of a cash flow hedge under IFRS?",
            "Explain the concept of economic value added (EVA).",
            "How do you calculate the price-to-earnings ratio?",
        ]

        for question in test_questions:
            print(f"\nQuestion: {question}")
            inputs = tokenizer(question, return_tensors="pt").to(model.device)
            prompt_length = inputs["input_ids"].shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    pad_token_id=tokenizer.eos_token_id,
                )

            # The output sequence starts with the prompt tokens.  Drop them
            # by token count before decoding; slicing the decoded text by
            # len(question) is wrong whenever decoding does not reproduce
            # the prompt character-for-character.
            generated_ids = outputs[0][prompt_length:]
            answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            print(f"Answer: {answer}")
            print("-" * 80)

    except ImportError:
        print("Error: 'transformers', 'peft', or 'torch' not found.")
        print("Please ensure the environment setup in step 1 was successful.")
        print("----------------------------\n")
        return False
    except Exception as e:
        print(f"An error occurred during inference: {e}")
        print("----------------------------\n")
        return False

    print("----------------------------\n")
    return True
308
-
309
-
310
def _run_pipeline():
    """Run steps 2-5 (data split, configuration, training, inference).

    Must be called with the FinLoRA checkout as the working directory.
    Training and inference only run if the configuration step succeeded.
    """
    step_2_prepare_data()
    if step_3_configure_finetuning():
        step_4_run_finetuning()
        step_5_run_inference()
    else:
        print("Configuration failed. Aborting.")


def main():
    """Run the FinLoRA fine-tuning workflow, skipping setup when possible.

    Three starting situations are handled:

    * already inside a directory named ``FinLoRA``: setup is skipped;
    * a ``FinLoRA`` directory exists below the working directory: change
      into it and skip setup;
    * otherwise: run the full environment setup (step 1) first.
    """
    if Path.cwd().name == "FinLoRA":
        print("Already in FinLoRA directory. Skipping Step 1 (Setup).")
        # The environment is assumed to be set up; start at data preparation.
        _run_pipeline()

    elif Path("FinLoRA").is_dir():
        # is_dir() rather than exists(): a regular file with this name must
        # not be treated as a checkout, or os.chdir would raise.
        print("Found 'FinLoRA' directory. Changing directory and proceeding.")
        os.chdir("FinLoRA")
        # No Hugging Face login needed: Fin-o1-8B is not a gated model.
        _run_pipeline()

    else:
        # Run the complete setup first.
        print("Starting from scratch...")
        step_1_setup_environment()

        # Success is inferred from the working directory: step 1 changes
        # into the checkout once the clone has succeeded.
        if Path.cwd().name == "FinLoRA":
            _run_pipeline()
        else:
            print("Step 1 (Setup) seems to have failed. Aborting.")

    print("Script finished.")


if __name__ == "__main__":
    main()