psychology-tutor-engine / compute_embeddings.py
adfras's picture
Initial commit: Psychology tutor engine and data pipelines
1da14e1
# compute_embeddings.py
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
# --- Configuration ---
# Both the input and the output parquet files live in the same directory.
_PROCESSED_DIR = "data/processed"
# Input: normalized question data (produced by normalize_psych_data.py).
DATA_FILE = f"{_PROCESSED_DIR}/ALL_PSYCHOLOGY_DATA_normalized.parquet"
# Output: the input rows plus one `embed_<i>` column per embedding dimension.
OUTPUT_FILE = f"{_PROCESSED_DIR}/psychology_data_with_embeddings.parquet"
# Sentence-transformer model: small, fast, and effective for sentence-level tasks.
MODEL_NAME = "all-MiniLM-L6-v2"
def main() -> int:
    """Embed every question in DATA_FILE and save the result to OUTPUT_FILE.

    Reads the parquet file at DATA_FILE, encodes its 'question' column with
    the sentence-transformer named by MODEL_NAME, appends one 'embed_<i>'
    column per embedding dimension, and writes the combined dataframe to
    OUTPUT_FILE. Progress is reported on stdout.

    Returns:
        0 on success; 1 when the input file or its 'question' column is
        missing, so that shell scripts and CI jobs can detect the failure.
    """
    print("--- Starting Embedding Computation ---")

    if not os.path.exists(DATA_FILE):
        print(f"FATAL: Data file not found at {DATA_FILE}. Please run normalize_psych_data.py first.")
        return 1

    df = pd.read_parquet(DATA_FILE)
    if "question" not in df.columns:
        # Fail here, before the slow model load, instead of with a bare
        # KeyError once the model has already been loaded.
        print(f"FATAL: Column 'question' not found in {DATA_FILE}.")
        return 1

    print(f"Loading sentence-transformer model: '{MODEL_NAME}'...")
    # This will download the model on the first run
    model = SentenceTransformer(MODEL_NAME)

    print(f"Computing embeddings for {len(df)} questions... (This may take a while)")
    # .encode() takes a list of strings and returns one embedding per string.
    embeddings = model.encode(df["question"].tolist(), show_progress_bar=True)

    # Store each embedding dimension as its own column. Reusing df.index keeps
    # the rows aligned for the concat below, and the 'embed_' prefix turns the
    # default integer column labels into string names.
    embedding_df = pd.DataFrame(embeddings, index=df.index).add_prefix("embed_")

    # Combine the original dataframe with the new embedding columns
    df_with_embeddings = pd.concat([df, embedding_df], axis=1)

    print(f"Saving new dataframe with embeddings to '{OUTPUT_FILE}'...")
    df_with_embeddings.to_parquet(OUTPUT_FILE)

    print("\nSUCCESS: Embeddings computed and saved.")
    print(f"New dataframe shape: {df_with_embeddings.shape}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())