Kahrhoff committed on
Commit
6f5c468
·
verified ·
1 Parent(s): 137c491

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +282 -282
app.py CHANGED
@@ -1,283 +1,283 @@
1
- #!/usr/bin/env python3
2
- """
3
- OpenFinancial Chatbot - Hugging Face Space Trainer
4
- ==================================================
5
-
6
- This script is designed to run directly in a Hugging Face Space.
7
- Upload this file along with your training data to a HF Space and it will:
8
- 1. Load your training data automatically
9
- 2. Train the model using available hardware (GPU/CPU)
10
- 3. Save the trained model to the space's file system
11
- 4. Provide a simple interface to monitor progress
12
-
13
- Instructions:
14
- 1. Create a new HF Space (Gradio SDK)
15
- 2. Upload this file as app.py
16
- 3. Upload your training CSV files to the space
17
- 4. Open the Space and click 'Start Training' to begin training
18
- """
19
-
20
- import os
21
- import json
22
- import time
23
- import pandas as pd
24
- from datasets import Dataset
25
- from transformers import (
26
- AutoModelForCausalLM,
27
- AutoTokenizer,
28
- Trainer,
29
- TrainingArguments,
30
- DataCollatorForLanguageModeling
31
- )
32
- import torch
33
- from huggingface_hub import login
34
- import gradio as gr
35
-
36
# Configuration
# Hub identifier of the checkpoint that gets fine-tuned.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory that receives the fine-tuned model and tokenizer.
OUTPUT_MODEL_DIR = "./trained_model"
# Preferred CSV names, tried in this order before any other CSV.
TRAINING_DATA_FILES = ["trainingData.csv", "training_data.csv", "data.csv"]  # Try multiple names


def find_training_data():
    """Locate a training CSV in the current working directory.

    The names in ``TRAINING_DATA_FILES`` are tried first, in list order.
    When none of them exists, the alphabetically first ``*.csv`` file in the
    current directory is used instead.

    Returns:
        The file name of the CSV to train on, or ``None`` when no CSV file
        is present.
    """
    print("🔍 Looking for training data files...")

    # Well-known names take priority; isfile() skips directories that
    # merely happen to carry one of these names.
    for filename in TRAINING_DATA_FILES:
        if os.path.isfile(filename):
            print(f"Found training data: {filename}")
            return filename

    # os.listdir() yields entries in arbitrary order, so sort them to make
    # the fallback choice reproducible between runs and machines.
    csv_files = sorted(
        entry
        for entry in os.listdir('.')
        if entry.endswith('.csv') and os.path.isfile(entry)
    )
    if csv_files:
        print(f"Found CSV files: {csv_files}")
        return csv_files[0]  # Alphabetically first CSV

    print("No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
    return None
59
-
60
def load_training_data(filename):
    """Read a question/answer CSV and convert it into training examples.

    Column detection is case-insensitive and substring based: the first
    column whose name contains "question", "prompt" or "input" supplies the
    questions; the first column whose name contains "answer", "response" or
    "output" supplies the answers.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``{"text": ...}`` dicts, one per usable row. Each text is
        a "### Question: ..." line followed by a "### Answer: ..." line that
        ends with the literal "<|endoftext|>". Rows whose question or answer
        is missing or blank are skipped. ``None`` is returned when the file
        cannot be read or no matching columns are found.
    """
    print(f"📊 Loading training data from {filename}...")

    question_keys = ("question", "prompt", "input")
    answer_keys = ("answer", "response", "output")

    try:
        # Read CSV file
        df = pd.read_csv(filename)
        print(f"Raw data shape: {df.shape}")

        # Check for required columns (flexible naming)
        question_cols = [
            col for col in df.columns
            if any(key in col.lower() for key in question_keys)
        ]
        answer_cols = [
            col for col in df.columns
            if any(key in col.lower() for key in answer_keys)
        ]

        if not question_cols or not answer_cols:
            print(f"Available columns: {list(df.columns)}")
            raise ValueError("Could not find Question/Answer columns")

        question_col = question_cols[0]
        answer_col = answer_cols[0]

        print(f"Using columns: {question_col} -> {answer_col}")

        # Empty cells arrive as NaN; filter them with a real null check
        # instead of comparing their string form against 'nan'.
        usable = df[question_col].notna() & df[answer_col].notna()

        # Create training format
        training_data = []
        for raw_question, raw_answer in zip(
            df.loc[usable, question_col], df.loc[usable, answer_col]
        ):
            question = str(raw_question).strip()
            answer = str(raw_answer).strip()

            # Whitespace-only cells carry no usable content either.
            if question and answer:
                # Format as conversation
                text = f"### Question: {question}\n### Answer: {answer}<|endoftext|>"
                training_data.append({"text": text})

        print(f"Processed {len(training_data)} valid training examples")
        return training_data

    except Exception as e:
        # Deliberate catch-all: the caller treats a None result as
        # "could not load" and reports that to the user.
        print(f"Error loading data: {e}")
        return None
99
-
100
def train_model(training_data):
    """Fine-tune ``BASE_MODEL`` on the prepared examples and save the result.

    Args:
        training_data: List of ``{"text": ...}`` dicts, as returned by
            ``load_training_data``.

    Returns:
        A status string: a success summary, or ``"Training failed: ..."``
        when training or saving raised an exception.

    Side effects:
        Writes checkpoints to ``./results`` and the final model and tokenizer
        to ``OUTPUT_MODEL_DIR``. Marker files from earlier runs are removed
        before training, then exactly one of ``training_complete.txt`` or
        ``training_error.txt`` is written for this run.
    """
    print("🚀 Starting model training...")

    # Check hardware
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f"💻 Using device: {device}")
    if use_cuda:
        print(f"🔥 GPU: {torch.cuda.get_device_name(0)}")

    # Create dataset
    dataset = Dataset.from_list(training_data)
    print(f"📊 Dataset size: {len(dataset)} examples")

    # Load tokenizer and model
    print("🔧 Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Keep the weights in float32 even on GPU. fp16=True below turns on
    # mixed-precision training, whose gradient scaler rejects float16
    # parameters ("Attempting to unscale FP16 gradients").
    # NOTE(review): float32 weights need more GPU memory - confirm the
    # Space hardware can hold the model plus optimizer state.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        device_map="auto" if use_cuda else None,
    )

    # Tokenize dataset
    # NOTE(review): load_training_data ends every example with the literal
    # text "<|endoftext|>"; verify that this matches the tokenizer's actual
    # end-of-sequence token for BASE_MODEL.
    print("🔄 Tokenizing dataset...")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=False,  # padding is applied per batch by the collator
            max_length=512,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )

    # Training arguments: smaller batches with more accumulation on CPU
    batch_size = 4 if use_cuda else 2
    gradient_steps = 4 if use_cuda else 8

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_steps,
        warmup_steps=50,
        learning_rate=2e-5,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        remove_unused_columns=False,
        dataloader_num_workers=0,  # Avoid multiprocessing issues
        fp16=use_cuda,
        # The string "none" disables wandb and other reporters; passing
        # None selects the library default instead.
        report_to="none",
    )

    # Data collator (causal language modelling, so no masked-LM objective)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Create trainer
    # NOTE(review): newer transformers releases rename the `tokenizer`
    # argument to `processing_class` - confirm against the pinned version.
    print("⚙️ Initializing trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Remove markers left by an earlier run so the status files can only
    # describe the run that is about to start.
    for stale_marker in ("training_complete.txt", "training_error.txt"):
        if os.path.exists(stale_marker):
            os.remove(stale_marker)

    # Train the model
    print("🔥 Starting training...")
    start_time = time.time()

    try:
        trainer.train()

        end_time = time.time()
        training_duration = (end_time - start_time) / 60

        # Save the model
        print("💾 Saving trained model...")
        trainer.save_model(OUTPUT_MODEL_DIR)
        tokenizer.save_pretrained(OUTPUT_MODEL_DIR)

        # Create a completion marker
        with open("training_complete.txt", "w") as f:
            f.write(f"Training completed successfully!\nDuration: {training_duration:.1f} minutes\nModel saved to: {OUTPUT_MODEL_DIR}")

        return f"Training completed in {training_duration:.1f} minutes!\n\nModel saved to: {OUTPUT_MODEL_DIR}\n\nYou can now download the trained_model folder."

    except Exception as e:
        # Catch-all boundary: the failure is reported through the returned
        # status text and the error marker instead of propagating.
        error_msg = f"Training failed: {str(e)}"
        print(error_msg)

        # Create error marker
        with open("training_error.txt", "w") as f:
            f.write(error_msg)

        return error_msg
208
-
209
def create_interface():
    """Build the Gradio Blocks UI for starting training and viewing status.

    Returns:
        The ``gr.Blocks`` application; the caller is expected to launch it.
    """

    def refresh_status():
        # The completion marker takes precedence over the error marker.
        for marker in ("training_complete.txt", "training_error.txt"):
            if os.path.exists(marker):
                with open(marker, "r") as marker_file:
                    return marker_file.read()
        return "🚀 Ready to start training..."

    def start_training():
        # Find and load data, then hand over to the trainer.
        csv_name = find_training_data()
        if not csv_name:
            return "No training data found. Please upload a CSV file with Question and Answer columns."

        examples = load_training_data(csv_name)
        if not examples:
            return "Failed to load training data. Check the CSV format."

        return train_model(examples)

    # Show the outcome of a previous run, if any, when the page first loads.
    initial_status = refresh_status()

    with gr.Blocks(title="OpenFinancial Chatbot Trainer") as demo:
        gr.Markdown("# 🤖 OpenFinancial Chatbot - Cloud Trainer")
        gr.Markdown("Upload your training CSV file and click 'Start Training' to begin.")

        status_output = gr.Textbox(
            label="Training Status",
            value=initial_status,
            lines=10,
            max_lines=20,
        )

        with gr.Row():
            start_btn = gr.Button("🚀 Start Training", variant="primary")
            refresh_btn = gr.Button("🔄 Refresh Status", variant="secondary")

        # File download section
        gr.Markdown("## 📥 Download Trained Model")
        gr.Markdown("After training completes, download the files below:")

        # Both buttons write their result into the status box.
        start_btn.click(start_training, outputs=status_output)
        refresh_btn.click(refresh_status, outputs=status_output)

    return demo
268
-
269
if __name__ == "__main__":
    # Script entry point: optional Hugging Face login, then launch the UI.
    print("🤖 OpenFinancial Chatbot - HF Space Trainer")
    print("=" * 50)

    # Auto-login if a non-empty token is available
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
            print("Hugging Face authentication successful")
        except Exception as exc:
            # Login is best-effort, so keep going - but say why it failed.
            # `except Exception` lets KeyboardInterrupt/SystemExit through,
            # unlike a bare `except:`.
            print(f"⚠️ HF authentication failed (optional): {exc}")

    # Launch interface
    interface = create_interface()
    interface.launch()
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OpenFinancial Chatbot - Hugging Face Space Trainer
4
+ ==================================================
5
+
6
+ This script is designed to run directly in a Hugging Face Space.
7
+ Upload this file along with your training data to a HF Space and it will:
8
+ 1. Load your training data automatically
9
+ 2. Train the model using available hardware (GPU/CPU)
10
+ 3. Save the trained model to the space's file system
11
+ 4. Provide a simple interface to monitor progress
12
+
13
+ Instructions:
14
+ 1. Create a new HF Space (Gradio SDK)
15
+ 2. Upload this file as app.py
16
+ 3. Upload your training CSV files to the space
17
+ 4. The space will automatically start training when it loads
18
+ """
19
+
20
+ import os
21
+ import json
22
+ import time
23
+ import pandas as pd
24
+ from datasets import Dataset
25
+ from transformers import (
26
+ AutoModelForCausalLM,
27
+ AutoTokenizer,
28
+ Trainer,
29
+ TrainingArguments,
30
+ DataCollatorForLanguageModeling
31
+ )
32
+ import torch
33
+ from huggingface_hub import login
34
+ import gradio as gr
35
+
36
# Configuration
# Hub identifier of the checkpoint that gets fine-tuned.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Directory that receives the fine-tuned model and tokenizer.
OUTPUT_MODEL_DIR = "./trained_model"
# Preferred CSV names, tried in this order before any other CSV.
TRAINING_DATA_FILES = [  # Try multiple names
    "customer_service_conversations.csv",
    "financial_conversations.csv",
    "financial_qa_conversations.csv",
    "trainingData.csv",
]


def find_training_data():
    """Locate a training CSV in the current working directory.

    The names in ``TRAINING_DATA_FILES`` are tried first, in list order.
    When none of them exists, the alphabetically first ``*.csv`` file in the
    current directory is used instead.

    Returns:
        The file name of the CSV to train on, or ``None`` when no CSV file
        is present.
    """
    print("🔍 Looking for training data files...")

    # Well-known names take priority; isfile() skips directories that
    # merely happen to carry one of these names.
    for filename in TRAINING_DATA_FILES:
        if os.path.isfile(filename):
            print(f"Found training data: {filename}")
            return filename

    # os.listdir() yields entries in arbitrary order, so sort them to make
    # the fallback choice reproducible between runs and machines.
    csv_files = sorted(
        entry
        for entry in os.listdir('.')
        if entry.endswith('.csv') and os.path.isfile(entry)
    )
    if csv_files:
        print(f"Found CSV files: {csv_files}")
        return csv_files[0]  # Alphabetically first CSV

    print("No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
    return None
59
+
60
def load_training_data(filename):
    """Read a question/answer CSV and convert it into training examples.

    Column detection is case-insensitive and substring based: the first
    column whose name contains "question", "prompt" or "input" supplies the
    questions; the first column whose name contains "answer", "response" or
    "output" supplies the answers.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``{"text": ...}`` dicts, one per usable row. Each text is
        a "### Question: ..." line followed by a "### Answer: ..." line that
        ends with the literal "<|endoftext|>". Rows whose question or answer
        is missing or blank are skipped. ``None`` is returned when the file
        cannot be read or no matching columns are found.
    """
    print(f"📊 Loading training data from {filename}...")

    question_keys = ("question", "prompt", "input")
    answer_keys = ("answer", "response", "output")

    try:
        # Read CSV file
        df = pd.read_csv(filename)
        print(f"Raw data shape: {df.shape}")

        # Check for required columns (flexible naming)
        question_cols = [
            col for col in df.columns
            if any(key in col.lower() for key in question_keys)
        ]
        answer_cols = [
            col for col in df.columns
            if any(key in col.lower() for key in answer_keys)
        ]

        if not question_cols or not answer_cols:
            print(f"Available columns: {list(df.columns)}")
            raise ValueError("Could not find Question/Answer columns")

        question_col = question_cols[0]
        answer_col = answer_cols[0]

        print(f"Using columns: {question_col} -> {answer_col}")

        # Empty cells arrive as NaN; filter them with a real null check
        # instead of comparing their string form against 'nan'.
        usable = df[question_col].notna() & df[answer_col].notna()

        # Create training format
        training_data = []
        for raw_question, raw_answer in zip(
            df.loc[usable, question_col], df.loc[usable, answer_col]
        ):
            question = str(raw_question).strip()
            answer = str(raw_answer).strip()

            # Whitespace-only cells carry no usable content either.
            if question and answer:
                # Format as conversation
                text = f"### Question: {question}\n### Answer: {answer}<|endoftext|>"
                training_data.append({"text": text})

        print(f"Processed {len(training_data)} valid training examples")
        return training_data

    except Exception as e:
        # Deliberate catch-all: the caller treats a None result as
        # "could not load" and reports that to the user.
        print(f"Error loading data: {e}")
        return None
99
+
100
def train_model(training_data):
    """Fine-tune ``BASE_MODEL`` on the prepared examples and save the result.

    Args:
        training_data: List of ``{"text": ...}`` dicts, as returned by
            ``load_training_data``.

    Returns:
        A status string: a success summary, or ``"Training failed: ..."``
        when training or saving raised an exception.

    Side effects:
        Writes checkpoints to ``./results`` and the final model and tokenizer
        to ``OUTPUT_MODEL_DIR``. Marker files from earlier runs are removed
        before training, then exactly one of ``training_complete.txt`` or
        ``training_error.txt`` is written for this run.
    """
    print("Starting model training...")

    # Check hardware
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    print(f"Using device: {device}")
    if use_cuda:
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    # Create dataset
    dataset = Dataset.from_list(training_data)
    print(f"Dataset size: {len(dataset)} examples")

    # Load tokenizer and model
    print("Loading model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Keep the weights in float32 even on GPU. fp16=True below turns on
    # mixed-precision training, whose gradient scaler rejects float16
    # parameters ("Attempting to unscale FP16 gradients").
    # NOTE(review): float32 weights need more GPU memory - confirm the
    # Space hardware can hold the model plus optimizer state.
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float32,
        device_map="auto" if use_cuda else None,
    )

    # Tokenize dataset
    # NOTE(review): load_training_data ends every example with the literal
    # text "<|endoftext|>"; verify that this matches the tokenizer's actual
    # end-of-sequence token for BASE_MODEL.
    print("Tokenizing dataset...")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=False,  # padding is applied per batch by the collator
            max_length=512,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )

    # Training arguments: smaller batches with more accumulation on CPU
    batch_size = 4 if use_cuda else 2
    gradient_steps = 4 if use_cuda else 8

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_steps,
        warmup_steps=50,
        learning_rate=2e-5,
        logging_steps=10,
        save_steps=500,
        save_total_limit=2,
        remove_unused_columns=False,
        dataloader_num_workers=0,  # Avoid multiprocessing issues
        fp16=use_cuda,
        # The string "none" disables wandb and other reporters; passing
        # None selects the library default instead.
        report_to="none",
    )

    # Data collator (causal language modelling, so no masked-LM objective)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Create trainer
    # NOTE(review): newer transformers releases rename the `tokenizer`
    # argument to `processing_class` - confirm against the pinned version.
    print("Initializing trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Remove markers left by an earlier run so the status files can only
    # describe the run that is about to start.
    for stale_marker in ("training_complete.txt", "training_error.txt"):
        if os.path.exists(stale_marker):
            os.remove(stale_marker)

    # Train the model
    print("Starting training...")
    start_time = time.time()

    try:
        trainer.train()

        end_time = time.time()
        training_duration = (end_time - start_time) / 60

        # Save the model
        print("Saving trained model...")
        trainer.save_model(OUTPUT_MODEL_DIR)
        tokenizer.save_pretrained(OUTPUT_MODEL_DIR)

        # Create a completion marker
        with open("training_complete.txt", "w") as f:
            f.write(f"Training completed successfully!\nDuration: {training_duration:.1f} minutes\nModel saved to: {OUTPUT_MODEL_DIR}")

        return f"Training completed in {training_duration:.1f} minutes!\n\nModel saved to: {OUTPUT_MODEL_DIR}\n\nYou can now download the trained_model folder."

    except Exception as e:
        # Catch-all boundary: the failure is reported through the returned
        # status text and the error marker instead of propagating.
        error_msg = f"Training failed: {str(e)}"
        print(error_msg)

        # Create error marker
        with open("training_error.txt", "w") as f:
            f.write(error_msg)

        return error_msg
208
+
209
def create_interface():
    """Build the Gradio Blocks UI for starting training and viewing status.

    Returns:
        The ``gr.Blocks`` application; the caller is expected to launch it.
    """

    def refresh_status():
        # The completion marker takes precedence over the error marker.
        for marker in ("training_complete.txt", "training_error.txt"):
            if os.path.exists(marker):
                with open(marker, "r") as marker_file:
                    return marker_file.read()
        return "Ready to start training..."

    def start_training():
        # Find and load data, then hand over to the trainer.
        csv_name = find_training_data()
        if not csv_name:
            return "No training data found. Please upload a CSV file with Question and Answer columns."

        examples = load_training_data(csv_name)
        if not examples:
            return "Failed to load training data. Check the CSV format."

        return train_model(examples)

    # Show the outcome of a previous run, if any, when the page first loads.
    initial_status = refresh_status()

    with gr.Blocks(title="OpenFinancial Chatbot Trainer") as demo:
        gr.Markdown("# OpenFinancial Chatbot - Cloud Trainer")
        gr.Markdown("Upload your training CSV file and click 'Start Training' to begin.")

        status_output = gr.Textbox(
            label="Training Status",
            value=initial_status,
            lines=10,
            max_lines=20,
        )

        with gr.Row():
            start_btn = gr.Button("Start Training", variant="primary")
            refresh_btn = gr.Button("Refresh Status", variant="secondary")

        # File download section
        gr.Markdown("## Download Trained Model")
        gr.Markdown("After training completes, download the files below:")

        # Both buttons write their result into the status box.
        start_btn.click(start_training, outputs=status_output)
        refresh_btn.click(refresh_status, outputs=status_output)

    return demo
268
+
269
if __name__ == "__main__":
    # Script entry point: optional Hugging Face login, then launch the UI.
    print("OpenFinancial Chatbot - HF Space Trainer")
    print("=" * 50)

    # Auto-login if a non-empty token is available
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        try:
            login(token=hf_token)
            print("Hugging Face authentication successful")
        except Exception as exc:
            # Login is best-effort, so keep going - but say why it failed.
            # `except Exception` lets KeyboardInterrupt/SystemExit through,
            # unlike a bare `except:`.
            print(f"HF authentication failed (optional): {exc}")

    # Launch interface
    interface = create_interface()
    interface.launch()