"""Deprecated entry points for the course recommender's basic training data.

The synthetic "basic" training set has been retired in favour of student
feedback data. The functions below are kept so that existing callers get an
explicit error instead of an ImportError/AttributeError.
"""
import pandas as pd
import numpy as np

from database_connection import DatabaseConnection


def create_basic_training_data():
    """Create basic training data for the course recommender - DEPRECATED.

    This function no longer produces any data. It prints a deprecation
    warning to stdout and then always raises.

    The former synthetic-data generation code that followed the ``raise``
    was unreachable (and referenced an undefined ``courses`` name), so it
    has been removed.

    Raises:
        ValueError: Always; the message points callers at the student
            feedback data exposed by the /student_feedback_counts endpoint.
    """
    print("WARNING: Basic training data is deprecated. Use student feedback data instead.")
    raise ValueError(
        "Basic training data is no longer used. "
        "Please use student feedback data from /student_feedback_counts endpoint."
    )


def save_basic_data():
    """Save basic training data to CSV.

    Calls ``create_basic_training_data()`` and writes the result to
    ``basic_training_data.csv`` (without the index column), prints the
    sample count, and returns the frame.

    NOTE: ``create_basic_training_data()`` is deprecated and always raises,
    so this function currently propagates that ``ValueError`` before any
    file is written.

    Raises:
        ValueError: Propagated from ``create_basic_training_data()``.
    """
    df = create_basic_training_data()
    df.to_csv('basic_training_data.csv', index=False)
    print(f"Basic training data saved with {len(df)} samples")
    return df


if __name__ == "__main__":
    save_basic_data()