Kahrhoff committed on
Commit
bb4659c
·
verified ·
1 Parent(s): 6f5c468

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -27
app.py CHANGED
@@ -40,18 +40,15 @@ TRAINING_DATA_FILES = ["customer_service_conversations.csv", "financial_conversa
40
 
41
  def find_training_data():
42
  """Find training data files in the space"""
43
- print("🔍 Looking for training data files...")
44
 
45
  # Check for CSV files
46
  for filename in TRAINING_DATA_FILES:
47
  if os.path.exists(filename):
48
- print(f"Found training data: {filename}")
49
  return filename
50
 
51
  # Check all CSV files in current directory
52
  csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
53
  if csv_files:
54
- print(f"Found CSV files: {csv_files}")
55
  return csv_files[0] # Use the first one
56
 
57
  print("No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
@@ -59,26 +56,21 @@ def find_training_data():
59
 
60
  def load_training_data(filename):
61
  """Load and prepare training data"""
62
- print(f"📊 Loading training data from {filename}...")
63
 
64
  try:
65
  # Read CSV file
66
  df = pd.read_csv(filename)
67
- print(f"Raw data shape: {df.shape}")
68
 
69
  # Check for required columns (flexible naming)
70
- question_cols = [col for col in df.columns if 'question' in col.lower() or 'prompt' in col.lower() or 'input' in col.lower()]
71
- answer_cols = [col for col in df.columns if 'answer' in col.lower() or 'response' in col.lower() or 'output' in col.lower()]
72
 
73
  if not question_cols or not answer_cols:
74
- print(f"Available columns: {list(df.columns)}")
75
  raise ValueError("Could not find Question/Answer columns")
76
 
77
  question_col = question_cols[0]
78
  answer_col = answer_cols[0]
79
 
80
- print(f"Using columns: {question_col} -> {answer_col}")
81
-
82
  # Create training format
83
  training_data = []
84
  for _, row in df.iterrows():
@@ -94,7 +86,6 @@ def load_training_data(filename):
94
  return training_data
95
 
96
  except Exception as e:
97
- print(f"Error loading data: {e}")
98
  return None
99
 
100
  def train_model(training_data):
@@ -109,10 +100,8 @@ def train_model(training_data):
109
 
110
  # Create dataset
111
  dataset = Dataset.from_list(training_data)
112
- print(f"Dataset size: {len(dataset)} examples")
113
 
114
  # Load tokenizer and model
115
- print("Loading model and tokenizer...")
116
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
117
  if tokenizer.pad_token is None:
118
  tokenizer.pad_token = tokenizer.eos_token
@@ -124,7 +113,6 @@ def train_model(training_data):
124
  )
125
 
126
  # Tokenize dataset
127
- print("Tokenizing dataset...")
128
  def tokenize_function(examples):
129
  return tokenizer(
130
  examples["text"],
@@ -166,7 +154,6 @@ def train_model(training_data):
166
  )
167
 
168
  # Create trainer
169
- print("Initializing trainer...")
170
  trainer = Trainer(
171
  model=model,
172
  args=training_args,
@@ -176,7 +163,6 @@ def train_model(training_data):
176
  )
177
 
178
  # Train the model
179
- print("Starting training...")
180
  start_time = time.time()
181
 
182
  try:
@@ -186,7 +172,6 @@ def train_model(training_data):
186
  training_duration = (end_time - start_time) / 60
187
 
188
  # Save the model
189
- print("Saving trained model...")
190
  trainer.save_model(OUTPUT_MODEL_DIR)
191
  tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
192
 
@@ -267,16 +252,6 @@ def create_interface():
267
  return demo
268
 
269
  if __name__ == "__main__":
270
- print("OpenFinancial Chatbot - HF Space Trainer")
271
- print("=" * 50)
272
-
273
- # Auto-login if token is available
274
- if "HF_TOKEN" in os.environ:
275
- try:
276
- login(token=os.environ["HF_TOKEN"])
277
- print("Hugging Face authentication successful")
278
- except:
279
- print("HF authentication failed (optional)")
280
 
281
  # Launch interface
282
  interface = create_interface()
 
40
 
41
  def find_training_data():
42
  """Find training data files in the space"""
 
43
 
44
  # Check for CSV files
45
  for filename in TRAINING_DATA_FILES:
46
  if os.path.exists(filename):
 
47
  return filename
48
 
49
  # Check all CSV files in current directory
50
  csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
51
  if csv_files:
 
52
  return csv_files[0] # Use the first one
53
 
54
  print("No training data found. Please upload a CSV file with 'Question' and 'Answer' columns.")
 
56
 
57
  def load_training_data(filename):
58
  """Load and prepare training data"""
 
59
 
60
  try:
61
  # Read CSV file
62
  df = pd.read_csv(filename)
 
63
 
64
  # Check for required columns (flexible naming)
65
+ question_cols = [col for col in df.columns if 'question' in col.lower()]
66
+ answer_cols = [col for col in df.columns if 'answer' in col.lower()]
67
 
68
  if not question_cols or not answer_cols:
 
69
  raise ValueError("Could not find Question/Answer columns")
70
 
71
  question_col = question_cols[0]
72
  answer_col = answer_cols[0]
73
 
 
 
74
  # Create training format
75
  training_data = []
76
  for _, row in df.iterrows():
 
86
  return training_data
87
 
88
  except Exception as e:
 
89
  return None
90
 
91
  def train_model(training_data):
 
100
 
101
  # Create dataset
102
  dataset = Dataset.from_list(training_data)
 
103
 
104
  # Load tokenizer and model
 
105
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
106
  if tokenizer.pad_token is None:
107
  tokenizer.pad_token = tokenizer.eos_token
 
113
  )
114
 
115
  # Tokenize dataset
 
116
  def tokenize_function(examples):
117
  return tokenizer(
118
  examples["text"],
 
154
  )
155
 
156
  # Create trainer
 
157
  trainer = Trainer(
158
  model=model,
159
  args=training_args,
 
163
  )
164
 
165
  # Train the model
 
166
  start_time = time.time()
167
 
168
  try:
 
172
  training_duration = (end_time - start_time) / 60
173
 
174
  # Save the model
 
175
  trainer.save_model(OUTPUT_MODEL_DIR)
176
  tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
177
 
 
252
  return demo
253
 
254
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
255
 
256
  # Launch interface
257
  interface = create_interface()