# NOTE(review): the three lines that were here ("Spaces:", "No application file" x2)
# were non-Python residue from a hosting-platform export, not source code; kept as
# this comment so the file parses.
import ast
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from google.cloud import bigquery
from sentence_transformers import SentenceTransformer
| # Load environment variables | |
| load_dotenv() | |
def setup_credentials() -> str:
    """Resolve and validate the Google Cloud service-account key file.

    Reads GOOGLE_APPLICATION_CREDENTIALS from the environment (populated by
    load_dotenv()), resolves a relative path against this file's directory,
    verifies the file exists, and re-exports the absolute path so the
    google-cloud client libraries pick it up.

    Returns:
        Absolute path to the credentials file, as a string.

    Raises:
        ValueError: if GOOGLE_APPLICATION_CREDENTIALS is not set.
        FileNotFoundError: if the referenced file does not exist.
    """
    raw = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    if not raw:
        raise ValueError("GOOGLE_APPLICATION_CREDENTIALS not set in .env file")

    creds = Path(raw)
    # A relative path in .env is interpreted relative to this script, not the
    # process CWD, so the script works regardless of where it is launched from.
    if not creds.is_absolute():
        creds = Path(__file__).resolve().parent / creds

    if not creds.exists():
        raise FileNotFoundError(f"Credentials file not found at: {creds}")

    creds_path = str(creds)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return creds_path
def _category_text(raw):
    """Flatten the `category` column value into whitespace-joined text.

    The column appears to hold a stringified Python list, e.g.
    "['shoes', 'running']" -- TODO confirm against the BigQuery table schema.
    Uses ast.literal_eval instead of eval(): eval() executes arbitrary code
    and must never be applied to values pulled from an external data store.
    Falls back to the raw value when it is not a parseable literal.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (TypeError, ValueError, SyntaxError):
        return str(raw)
    if isinstance(parsed, (list, tuple)):
        return " ".join(str(item) for item in parsed)
    return str(parsed)


def main():
    """Build and persist product text embeddings.

    Pipeline: validate GCP credentials -> pull ACTIVE products from BigQuery
    -> combine name/description/brand/category into one text per product ->
    encode with a sentence-transformer -> pickle the id/metadata/embedding
    arrays to models/product_embeddings.pkl next to this script.
    """
    # Setup credentials (raises early with a clear message if misconfigured).
    creds_path = setup_credentials()
    print(f"Using credentials from: {creds_path}")

    # Fail fast if the table coordinates are missing instead of silently
    # interpolating the string "None" into the SQL.
    project = os.getenv("BIGQUERY_PROJECT_ID")
    dataset = os.getenv("BIGQUERY_DATASET")
    table = os.getenv("BIGQUERY_TABLE")
    if not all((project, dataset, table)):
        raise ValueError(
            "BIGQUERY_PROJECT_ID, BIGQUERY_DATASET and BIGQUERY_TABLE "
            "must all be set in the .env file"
        )

    # BigQuery setup. Identifiers cannot be bound as query parameters, so the
    # table name is interpolated; the values come from our own .env, not users.
    client = bigquery.Client()
    query = f"""
        SELECT
            product_id,
            product_name,
            description,
            category,
            brand,
            price
        FROM `{project}.{dataset}.{table}`
        WHERE status = 'ACTIVE'
    """
    try:
        df = client.query(query).to_dataframe()
    except Exception as e:
        # Best-effort CLI tool: report and stop rather than traceback.
        print(f"Error querying BigQuery: {e}")
        return

    if df.empty:
        # Guard: np.asarray/vstack on zero rows would produce a useless or
        # crashing artifact downstream.
        print("No ACTIVE products found; nothing to embed.")
        return

    # Prepare product descriptions for embedding: one free-text blob per row.
    df["combined_text"] = df.apply(
        lambda row: (
            f"{row['product_name']} {row['description']} "
            f"{row['brand']} {_category_text(row['category'])}"
        ),
        axis=1,
    )

    # Load pre-trained NLP model.
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate embeddings in one batched call -- encode() accepts a list and
    # batches internally, which is far faster than one call per row and
    # returns the same (n_products, dim) matrix.
    print("Generating embeddings...")
    embeddings = np.asarray(model.encode(df["combined_text"].tolist()))

    # Collect everything the serving side needs, column-aligned by position.
    embeddings_dict = {
        "product_ids": df["product_id"].tolist(),
        "product_names": df["product_name"].tolist(),
        "descriptions": df["description"].tolist(),
        "brands": df["brand"].tolist(),
        "prices": df["price"].tolist(),
        "categories": df["category"].tolist(),
        "embeddings": embeddings,
    }

    # Persist next to this script. NOTE: pickle is only safe to load from
    # trusted sources; this artifact is consumed by our own code.
    output_dir = os.path.join(os.path.dirname(__file__), "models")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "product_embeddings.pkl")
    with open(output_path, "wb") as f:
        pickle.dump(embeddings_dict, f)

    print(f"✅ Model trained and embeddings saved to: {output_path}")
    print(f"Total products processed: {len(df)}")
| if __name__ == "__main__": | |
| main() | |