ShiroOnigami23 committed on
Commit
d533652
·
verified ·
1 Parent(s): dc19554

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +28 -0
train.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build the "ghost brain" lookup file from the Rosetta Code dataset.

Reads ``rosetta_code_dataset.csv``, fits a TF-IDF vectorizer on its
``prompt`` column and pickles the tuple
``(vectorizer, tfidf_matrix, code_values)`` to ``ghost_brain.pkl``.
Exits with status 1 if the dataset cannot be read or lacks a needed column.
"""
import os
import pickle
import sys

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load the dataset
print("👻 Loading Rosetta Stone Dataset...")
try:
    df = pd.read_csv("rosetta_code_dataset.csv")
except FileNotFoundError:
    print("Error: Could not find rosetta_code_dataset.csv")
    # sys.exit is always available (the bare exit() helper comes from the
    # optional `site` module) and a non-zero status signals the failure.
    sys.exit(1)
except (OSError, ValueError) as err:
    # pandas' parser/empty-data errors and decode errors are ValueErrors;
    # report them as what they are instead of claiming the file is missing.
    print(f"Error: Could not read rosetta_code_dataset.csv: {err}")
    sys.exit(1)
else:
    print(f" -> Loaded {len(df)} examples.")

# Fail early with a readable message rather than a KeyError further down.
missing_columns = [col for col in ("prompt", "code") if col not in df.columns]
if missing_columns:
    print(
        "Error: rosetta_code_dataset.csv is missing column(s): "
        + ", ".join(missing_columns)
    )
    sys.exit(1)

# 2. Train the Brain (TF-IDF Vectorizer)
# This converts English text ("fibonacci in java") into numeric vectors.
print("🧠 Training the Ghost Engine...")
vectorizer = TfidfVectorizer()
# astype('U') coerces every prompt to text, so a missing value becomes the
# literal string "nan" instead of breaking the vectorizer.
tfidf_matrix = vectorizer.fit_transform(df['prompt'].values.astype('U'))

# 3. Save the Brain file
# We save the Vectorizer (translator), Matrix (memory), and Code (answers)
output_file = "ghost_brain.pkl"
with open(output_file, "wb") as f:
    pickle.dump((vectorizer, tfidf_matrix, df['code'].values), f)

print(f"✅ SUCCESS! Brain saved as '{output_file}'")
print(f" Size: {os.path.getsize(output_file) / 1024:.2f} KB (Tiny!)")
print(" Copy this file + ghost_coder.py to your USB stick.")