Spaces:
Runtime error
Runtime error
| # compute_embeddings.py | |
| import pandas as pd | |
| from sentence_transformers import SentenceTransformer | |
| import os | |
# --- Configuration ---
# Both parquet files live in the same processed-data directory.
_PROCESSED_DIR = "data/processed"

# Input: the normalized question dataset. Output: the same rows plus embeddings.
DATA_FILE = f"{_PROCESSED_DIR}/ALL_PSYCHOLOGY_DATA_normalized.parquet"
OUTPUT_FILE = f"{_PROCESSED_DIR}/psychology_data_with_embeddings.parquet"

# Sentence-transformer checkpoint; chosen as a small, fast model that works
# well for sentence-level tasks.
MODEL_NAME = "all-MiniLM-L6-v2"
def main():
    """Embed every question in DATA_FILE and save the result to OUTPUT_FILE.

    Reads the parquet file at DATA_FILE, encodes its ``question`` column with
    the sentence-transformer named by MODEL_NAME, appends one ``embed_<i>``
    column per embedding dimension, and writes the combined frame to
    OUTPUT_FILE as parquet. Progress is reported on stdout.

    Returns:
        int: 0 on success; 1 when the input file or its ``question`` column
        is missing, so that calling shells/pipelines can detect the failure.
    """
    print("--- Starting Embedding Computation ---")

    # Guard clause: without the normalized input there is nothing to embed.
    if not os.path.exists(DATA_FILE):
        print(
            f"FATAL: Data file not found at {DATA_FILE}. "
            "Please run normalize_psych_data.py first."
        )
        return 1

    df = pd.read_parquet(DATA_FILE)

    # Fail with a clear message instead of a bare KeyError further down.
    if "question" not in df.columns:
        print(f"FATAL: Column 'question' not found in {DATA_FILE}.")
        return 1

    print(f"Loading sentence-transformer model: '{MODEL_NAME}'...")
    # Constructing the model downloads it on the first run.
    model = SentenceTransformer(MODEL_NAME)

    print(f"Computing embeddings for {len(df)} questions... (This may take a while)")
    # encode() takes the list of question strings and returns one embedding
    # vector per string.
    embeddings = model.encode(df["question"].tolist(), show_progress_bar=True)

    # Spread each embedding across its own columns (embed_0, embed_1, ...),
    # reusing df's index so the rows line up when concatenated. The original
    # note states the vectors are 384-dimensional for this model.
    embedding_df = pd.DataFrame(embeddings, index=df.index).add_prefix("embed_")

    # Original columns first, embedding columns appended on the right.
    df_with_embeddings = pd.concat([df, embedding_df], axis=1)

    print(f"Saving new dataframe with embeddings to '{OUTPUT_FILE}'...")
    df_with_embeddings.to_parquet(OUTPUT_FILE)

    print("\nSUCCESS: Embeddings computed and saved.")
    print(f"New dataframe shape: {df_with_embeddings.shape}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())