# NOTE(review): the three lines that were here ("Spaces:", "No application file" x2)
# were non-Python residue from a hosting-platform export, not source code; kept as
# this comment so the file parses.
import ast
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
| # Load environment variables | |
| load_dotenv() | |
def setup_credentials() -> str:
    """Resolve and validate the Google Cloud service-account key file.

    Reads GOOGLE_APPLICATION_CREDENTIALS from the environment (populated by
    load_dotenv()), resolves a relative path against this file's directory,
    verifies the file exists, and re-exports the absolute path so the
    google-cloud client libraries pick it up.

    Returns:
        Absolute path to the credentials file, as a string.

    Raises:
        ValueError: if GOOGLE_APPLICATION_CREDENTIALS is not set.
        FileNotFoundError: if the referenced file does not exist.
    """
    raw = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if not raw:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

    creds = Path(raw)
    # A relative path in .env is interpreted relative to this script, not the
    # process CWD, so the script works regardless of where it is launched from.
    if not creds.is_absolute():
        creds = Path(__file__).resolve().parent / creds

    if not creds.exists():
        raise FileNotFoundError(f"Credentials file not found at: {creds}")

    creds_path = str(creds)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return creds_path
def _category_text(raw):
    """Flatten the `category` column value into whitespace-joined text.

    The column appears to hold a stringified Python list, e.g.
    "['shoes', 'running']" -- TODO confirm against the BigQuery table schema.
    Uses ast.literal_eval instead of eval(): eval() executes arbitrary code
    and must never be applied to values pulled from an external data store.
    Falls back to the raw value when it is not a parseable literal.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (TypeError, ValueError, SyntaxError):
        return str(raw)
    if isinstance(parsed, (list, tuple)):
        return " ".join(str(item) for item in parsed)
    return str(parsed)


def main():
    """Build and persist product text embeddings.

    Pipeline: validate GCP credentials -> pull ACTIVE products from BigQuery
    -> combine name/description/brand/category into one text per product ->
    encode with a sentence-transformer -> pickle the id/metadata/embedding
    arrays to models/product_embeddings.pkl next to this script.
    """
    # Setup credentials (raises early with a clear message if misconfigured).
    creds_path = setup_credentials()
    print(f"Using credentials from: {creds_path}")

    # Fail fast if the table coordinates are missing instead of silently
    # interpolating the string "None" into the SQL.
    project = os.getenv("BIGQUERY_PROJECT_ID")
    dataset = os.getenv("BIGQUERY_DATASET")
    table = os.getenv("BIGQUERY_TABLE")
    if not all((project, dataset, table)):
        raise ValueError(
            "BIGQUERY_PROJECT_ID, BIGQUERY_DATASET and BIGQUERY_TABLE "
            "must all be set in the .env file"
        )

    # BigQuery setup. Identifiers cannot be bound as query parameters, so the
    # table name is interpolated; the values come from our own .env, not users.
    client = bigquery.Client()
    query = f"""
        SELECT
            product_id,
            product_name,
            description,
            category,
            brand,
            price
        FROM `{project}.{dataset}.{table}`
        WHERE status = 'ACTIVE'
    """
    try:
        df = client.query(query).to_dataframe()
    except Exception as e:
        # Best-effort CLI tool: report and stop rather than traceback.
        print(f"Error querying BigQuery: {e}")
        return

    if df.empty:
        # Guard: np.asarray/vstack on zero rows would produce a useless or
        # crashing artifact downstream.
        print("No ACTIVE products found; nothing to embed.")
        return

    # Prepare product descriptions for embedding: one free-text blob per row.
    df["combined_text"] = df.apply(
        lambda row: (
            f"{row['product_name']} {row['description']} "
            f"{row['brand']} {_category_text(row['category'])}"
        ),
        axis=1,
    )

    # Load pre-trained NLP model.
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings in one batched call -- encode() accepts a list and
    # batches internally, which is far faster than one call per row and
    # returns the same (n_products, dim) matrix.
    print("Generating embeddings...")
    embeddings = np.asarray(model.encode(df["combined_text"].tolist()))

    # Collect everything the serving side needs, column-aligned by position.
    embeddings_dict = {
        "product_ids": df["product_id"].tolist(),
        "product_names": df["product_name"].tolist(),
        "descriptions": df["description"].tolist(),
        "brands": df["brand"].tolist(),
        "prices": df["price"].tolist(),
        "categories": df["category"].tolist(),
        "embeddings": embeddings,
    }

    # Persist next to this script. NOTE: pickle is only safe to load from
    # trusted sources; this artifact is consumed by our own code.
    output_dir = os.path.join(os.path.dirname(__file__), "models")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "product_embeddings.pkl")
    with open(output_path, "wb") as f:
        pickle.dump(embeddings_dict, f)

    print(f"✅ Model trained and embeddings saved to: {output_path}")
    print(f"Total products processed: {len(df)}")
| if __name__ == "__main__": | |
| main() | |