# chatbot / basic_training_data.py
# Author: markobinario
# Last commit: "Update basic_training_data.py" (531b743, verified)
import pandas as pd
import numpy as np
from database_connection import DatabaseConnection
def create_basic_training_data():
    """Deprecated entry point for the course recommender's basic training data.

    This function no longer produces any data. It prints a deprecation
    warning to stdout and then unconditionally raises, directing callers to
    the student feedback data exposed by the ``/student_feedback_counts``
    endpoint.

    A synthetic-data generator used to follow the ``raise``. It has been
    removed because it was unreachable, and because it read a ``courses``
    name that this function never defined, so it would have failed with
    ``NameError`` even if the ``raise`` had been lifted.

    Raises:
        ValueError: Always; basic training data is no longer supported.
    """
    # Kept as a plain print so the warning still appears on stdout exactly
    # as it did before, ahead of the exception.
    print("WARNING: Basic training data is deprecated. Use student feedback data instead.")
    raise ValueError("Basic training data is no longer used. Please use student feedback data from /student_feedback_counts endpoint.")
def save_basic_data():
    """Build the basic training data and write it to ``basic_training_data.csv``.

    The frame returned by ``create_basic_training_data()`` is written to the
    current working directory without its index column, a one-line summary
    with the row count is printed, and the same frame is returned.

    NOTE: ``create_basic_training_data()`` in this file raises ``ValueError``
    unconditionally (it is deprecated), and that exception propagates out of
    this function before anything is written.
    """
    frame = create_basic_training_data()
    frame.to_csv('basic_training_data.csv', index=False)

    sample_count = len(frame)
    print(f"Basic training data saved with {sample_count} samples")
    return frame
# Script entry point: only runs when the file is executed directly, not when
# it is imported. NOTE: save_basic_data() calls create_basic_training_data(),
# which raises ValueError unconditionally, so running this script ends with
# that exception.
if __name__ == "__main__":
    save_basic_data()