Spaces:

adfras
/

psychology-tutor-engine

Runtime error

App Files Files Community

psychology-tutor-engine / compute_embeddings.py

adfras

Initial commit: Psychology tutor engine and data pipelines

1da14e1 10 months ago

raw

history blame contribute delete

1.75 kB

	# compute_embeddings.py
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	import os

	# --- Configuration ---
	DATA_FILE = "data/processed/ALL_PSYCHOLOGY_DATA_normalized.parquet"
	OUTPUT_FILE = "data/processed/psychology_data_with_embeddings.parquet"
	# This model is small, fast, and effective for sentence-level tasks.
	MODEL_NAME = 'all-MiniLM-L6-v2'


	def main(data_file: str = DATA_FILE,
	         output_file: str = OUTPUT_FILE,
	         model_name: str = MODEL_NAME) -> int:
	    """Compute sentence embeddings for every question and save them.

	    Reads the parquet file at ``data_file``, encodes its ``question``
	    column with the sentence-transformer model ``model_name``, appends the
	    embedding as ``embed_<i>`` columns, and writes the combined dataframe
	    to ``output_file``.

	    Args:
	        data_file: Path of the normalized input parquet file.
	        output_file: Path the dataframe with embeddings is written to.
	        model_name: Name of the sentence-transformer model to load.

	    Returns:
	        0 on success, 1 when the input file is missing or has no
	        ``question`` column. The value is used as the process exit code.
	    """
	    print("--- Starting Embedding Computation ---")

	    # Guard clause: report the problem AND signal failure to the caller,
	    # so a surrounding pipeline does not treat a missing input as success.
	    if not os.path.exists(data_file):
	        print(f"FATAL: Data file not found at {data_file}. Please run normalize_psych_data.py first.")
	        return 1

	    df = pd.read_parquet(data_file)

	    # Validate the schema before the (potentially slow) model load.
	    if 'question' not in df.columns:
	        print(f"FATAL: Column 'question' not found in {data_file}.")
	        return 1

	    print(f"Loading sentence-transformer model: '{model_name}'...")
	    # This will download the model on the first run
	    model = SentenceTransformer(model_name)

	    print(f"Computing embeddings for {len(df)} questions... (This may take a while)")
	    # Make sure every item handed to encode() is a str: missing values
	    # become empty strings instead of being passed through as NaN/None.
	    questions = df['question'].fillna("").astype(str).tolist()
	    # encode() yields one embedding row per input string.
	    embeddings = model.encode(questions, show_progress_bar=True)

	    # The embedding is a 384-dimensional vector for this model.
	    # We'll store it as separate columns in the dataframe, reusing df's
	    # index so the rows line up when the two frames are concatenated.
	    embedding_df = pd.DataFrame(embeddings, index=df.index).add_prefix('embed_')

	    # Combine the original dataframe with the new embedding columns
	    df_with_embeddings = pd.concat([df, embedding_df], axis=1)

	    print(f"Saving new dataframe with embeddings to '{output_file}'...")
	    # Create the destination directory if it does not exist yet.
	    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
	    df_with_embeddings.to_parquet(output_file)

	    print("\nSUCCESS: Embeddings computed and saved.")
	    print(f"New dataframe shape: {df_with_embeddings.shape}")
	    return 0


	if __name__ == "__main__":
	    # Propagate main()'s status as the process exit code.
	    raise SystemExit(main())