# petProject / enriching_description.py
# Enrich a dog-food product dataset with LLM-generated marketing descriptions
# (English and Spanish) via the OpenAI API, traced with LangSmith.
import logging
from typing import Dict, Any
import time
from tqdm import tqdm
import openai
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import os
import glob
from langsmith import traceable
from langsmith import Client
from langsmith.wrappers import wrap_openai
# Configure logging once at import time so every function in this module
# shares the same timestamped format.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Load environment variables
#_ = load_dotenv(find_dotenv())
# Get the API keys from the Hugging Face Spaces secrets (disabled here;
# credentials are expected to come from the process environment instead)
#openai.api_key = st.secrets["OPENAI_API_KEY"].strip()
#os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
#os.environ["LANGCHAIN_TRACING_V2"] = st.secrets["LANGCHAIN_TRACING_V2"]
# Initialize LangSmith client (reads LANGCHAIN_API_KEY from the environment)
langsmith_client = Client()
# Wrap OpenAI client with LangSmith so every completion call is traced.
# NOTE(review): wrap_openai is documented for OpenAI *client* instances;
# passing the module object appears to work here but should be confirmed.
openai = wrap_openai(openai)
@traceable(run_type="chain")
def create_product_prompt(row: pd.Series, language: str) -> str:
    """
    Create a detailed prompt for product description generation.

    Args:
        row: DataFrame row containing product information. Must provide the
            keys ``brand``, ``product_name``, ``dog_type``, ``food_type``,
            ``weight`` and ``price`` (``price`` must be numeric, it is
            formatted with ``:.2f``).
        language: Target language ('en' or 'es')

    Returns:
        str: Formatted prompt for the LLM

    Raises:
        ValueError: If *language* is not a supported language code.
    """
    base_prompts = {
        'en': """Create a compelling and detailed marketing description for a premium dog food product.
Include the following information and expand with your knowledge:
• Brand: {brand}
• Product Name: {product_name}
• Specifically designed for: {dog_type}
• Type: {food_type}
• Package Size: {weight} kg
• Price Point: ${price:.2f}
Focus on:
1. Key nutritional benefits
2. Quality of ingredients
3. Health advantages
4. Why it's perfect for the specified dog type
5. Value proposition
Make it engaging and persuasive while maintaining accuracy.""",
        'es': """Crea una descripción comercial detallada y convincente para un producto premium de alimentación canina.
Incluye la siguiente información y expándela con tu conocimiento:
• Marca: {brand}
• Nombre del Producto: {product_name}
• Diseñado específicamente para: {dog_type}
• Tipo: {food_type}
• Tamaño del Paquete: {weight} kg
• Precio: ${price:.2f}
Enfócate en:
1. Beneficios nutricionales clave
2. Calidad de los ingredientes
3. Ventajas para la salud
4. Por qué es perfecto para el tipo de perro especificado
5. Propuesta de valor
Hazlo atractivo y persuasivo mientras mantienes la precisión."""
    }
    # Fail with an explicit message instead of a cryptic KeyError on
    # typos such as 'EN' or unsupported languages.
    if language not in base_prompts:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(base_prompts)}"
        )
    return base_prompts[language].format(**row.to_dict())
@traceable(run_type="chain")
def generate_description(row: pd.Series, language: str, retry_attempts: int = 3) -> str:
    """
    Generate a product description using OpenAI's API with retry logic.

    Args:
        row: DataFrame row containing product information
        language: Target language ('en' or 'es')
        retry_attempts: Number of retry attempts on failure

    Returns:
        str: Generated description, or an error message if all attempts fail.
             Never returns None, even when ``retry_attempts`` is < 1.
    """
    prompt = create_product_prompt(row, language)
    last_error: Exception | None = None
    for attempt in range(retry_attempts):
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",  # small, inexpensive model; 150 tokens suffice here
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150,
                temperature=0.7,
                presence_penalty=0.3,
                frequency_penalty=0.3
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            last_error = e
            logging.error(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt == retry_attempts - 1:
                return f"Error generating {language} description: {e}"
            time.sleep(2 ** attempt)  # Exponential backoff
    # Reached only when retry_attempts < 1: keep the declared str contract
    # instead of silently falling through and returning None.
    return f"Error generating {language} description: {last_error}"
def save_checkpoint(df: pd.DataFrame, batch_num: int, checkpoint_dir: str = 'checkpoints') -> None:
    """
    Persist the working DataFrame to disk so a later run can resume.

    Args:
        df: DataFrame to checkpoint
        batch_num: Current batch number (embedded in the file name)
        checkpoint_dir: Directory to store checkpoints (created on demand)
    """
    target = os.path.join(checkpoint_dir, f'checkpoint_batch_{batch_num}.pkl')
    # Ensure the destination directory exists before writing.
    os.makedirs(checkpoint_dir, exist_ok=True)
    df.to_pickle(target)
    logging.info(f"Saved checkpoint at batch {batch_num}")
def load_latest_checkpoint(checkpoint_dir: str = 'checkpoints') -> tuple[pd.DataFrame | None, int]:
"""
Load the most recent checkpoint if it exists.
Args:
checkpoint_dir: Directory containing checkpoints
Returns:
tuple: (DataFrame or None, last completed batch number)
"""
if not os.path.exists(checkpoint_dir):
return None, 0
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, 'checkpoint_batch_*.pkl'))
if not checkpoint_files:
return None, 0
latest_checkpoint = max(checkpoint_files)
batch_num = int(latest_checkpoint.split('_')[-1].split('.')[0])
logging.info(f"Loading checkpoint from batch {batch_num}")
return pd.read_pickle(latest_checkpoint), batch_num
@traceable(run_type="chain")
def enrich_descriptions(df: pd.DataFrame, batch_size: int = 10, checkpoint_frequency: int = 5) -> pd.DataFrame:
    """
    Enrich DataFrame with product descriptions in both languages.

    Args:
        df: Input DataFrame
        batch_size: Number of items to process in each batch
        checkpoint_frequency: Number of batches between checkpoints

    Returns:
        pd.DataFrame: Enriched DataFrame with 'description_en' and
        'description_es' columns populated.

    Raises:
        ValueError: If the row count changes during enrichment.
    """
    logging.info("Starting description generation process...")
    initial_row_count = len(df)
    df = df.copy()
    # Try to load from checkpoint
    checkpoint_df, last_batch = load_latest_checkpoint()
    if checkpoint_df is not None:
        df = checkpoint_df
        # Batch numbers are 1-based: batch k covers rows (k-1)*batch_size
        # .. k*batch_size-1, so the first unprocessed row after completing
        # batch `last_batch` is last_batch * batch_size. The previous code
        # used (last_batch + 1) * batch_size, silently skipping one whole
        # batch of rows on every resume.
        start_idx = last_batch * batch_size
        logging.info(f"Resuming from batch {last_batch + 1}")
    else:
        start_idx = 0
    for batch_num, i in enumerate(tqdm(range(start_idx, len(df), batch_size)), start=last_batch + 1):
        batch = df.iloc[i:i + batch_size]
        df.loc[batch.index, 'description_en'] = batch.apply(
            lambda row: generate_description(row, 'en'), axis=1
        )
        df.loc[batch.index, 'description_es'] = batch.apply(
            lambda row: generate_description(row, 'es'), axis=1
        )
        if batch_num % checkpoint_frequency == 0:
            save_checkpoint(df, batch_num)
        time.sleep(1)  # Rate limiting between batches
    # Validate row counts and description completeness
    final_row_count = len(df)
    if final_row_count != initial_row_count:
        raise ValueError(f"Row count mismatch: Started with {initial_row_count} rows, ended with {final_row_count} rows")
    # Check for missing descriptions
    missing_en = df['description_en'].isna().sum()
    missing_es = df['description_es'].isna().sum()
    if missing_en > 0 or missing_es > 0:
        logging.warning(f"Missing descriptions detected: English: {missing_en}, Spanish: {missing_es}")
    return df
def main():
    """Load the product dataset, enrich it with descriptions, and save it."""
    try:
        source = '2nd_clean_comida_dogs_filtered.pkl'
        frame = pd.read_pickle(source)
        n_before = len(frame)
        logging.info(f"Loaded dataset with {n_before} records")

        # Generate the bilingual descriptions.
        enriched = enrich_descriptions(frame)

        # Guard against accidental row loss/duplication before persisting.
        if len(enriched) != n_before:
            raise ValueError(f"Row count mismatch: Original had {n_before} rows, enriched has {len(enriched)} rows")

        destination = '3rd_clean_comida_dogs_enriched_multilingual_2.pkl'
        enriched.to_pickle(destination)
        logging.info(f"Enriched dataset saved to {destination}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise


if __name__ == "__main__":
    main()