# petProject / enriching_description.py
# Enrich a dog-food product dataset with LLM-generated marketing descriptions
# (English and Spanish) via the OpenAI API, traced with LangSmith.
import logging
from typing import Dict, Any
import time
from tqdm import tqdm
import openai
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import os
import glob
from langsmith import traceable
from langsmith import Client
from langsmith.wrappers import wrap_openai
# Configure logging once at import time so every function in this module
# shares the same timestamped format.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Load environment variables
#_ = load_dotenv(find_dotenv())
# Get the API keys from the Hugging Face Spaces secrets (disabled here;
# credentials are expected to come from the process environment instead)
#openai.api_key = st.secrets["OPENAI_API_KEY"].strip()
#os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
#os.environ["LANGCHAIN_TRACING_V2"] = st.secrets["LANGCHAIN_TRACING_V2"]
# Initialize LangSmith client (reads LANGCHAIN_API_KEY from the environment)
langsmith_client = Client()
# Wrap OpenAI client with LangSmith so every completion call is traced.
# NOTE(review): wrap_openai is documented for OpenAI *client* instances;
# passing the module object appears to work here but should be confirmed.
openai = wrap_openai(openai)
@traceable(run_type="chain")
def create_product_prompt(row: pd.Series, language: str) -> str:
    """
    Create a detailed prompt for product description generation.

    Args:
        row: DataFrame row containing product information. Must provide the
            keys ``brand``, ``product_name``, ``dog_type``, ``food_type``,
            ``weight`` and ``price`` (``price`` must be numeric, it is
            formatted with ``:.2f``).
        language: Target language ('en' or 'es')

    Returns:
        str: Formatted prompt for the LLM

    Raises:
        ValueError: If *language* is not a supported language code.
    """
    base_prompts = {
        'en': """Create a compelling and detailed marketing description for a premium dog food product.
Include the following information and expand with your knowledge:
• Brand: {brand}
• Product Name: {product_name}
• Specifically designed for: {dog_type}
• Type: {food_type}
• Package Size: {weight} kg
• Price Point: ${price:.2f}
Focus on:
1. Key nutritional benefits
2. Quality of ingredients
3. Health advantages
4. Why it's perfect for the specified dog type
5. Value proposition
Make it engaging and persuasive while maintaining accuracy.""",
        'es': """Crea una descripción comercial detallada y convincente para un producto premium de alimentación canina.
Incluye la siguiente información y expándela con tu conocimiento:
• Marca: {brand}
• Nombre del Producto: {product_name}
• Diseñado específicamente para: {dog_type}
• Tipo: {food_type}
• Tamaño del Paquete: {weight} kg
• Precio: ${price:.2f}
Enfócate en:
1. Beneficios nutricionales clave
2. Calidad de los ingredientes
3. Ventajas para la salud
4. Por qué es perfecto para el tipo de perro especificado
5. Propuesta de valor
Hazlo atractivo y persuasivo mientras mantienes la precisión."""
    }
    # Fail with an explicit message instead of a cryptic KeyError on
    # typos such as 'EN' or unsupported languages.
    if language not in base_prompts:
        raise ValueError(
            f"Unsupported language {language!r}; expected one of {sorted(base_prompts)}"
        )
    return base_prompts[language].format(**row.to_dict())
@traceable(run_type="chain")
def generate_description(row: pd.Series, language: str, retry_attempts: int = 3) -> str:
    """
    Generate a product description using OpenAI's API with retry logic.

    Args:
        row: DataFrame row containing product information
        language: Target language ('en' or 'es')
        retry_attempts: Number of retry attempts on failure

    Returns:
        str: Generated description, or an error message if all attempts fail.
             Never returns None, even when ``retry_attempts`` is < 1.
    """
    prompt = create_product_prompt(row, language)
    last_error: Exception | None = None
    for attempt in range(retry_attempts):
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",  # small, inexpensive model; 150 tokens suffice here
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150,
                temperature=0.7,
                presence_penalty=0.3,
                frequency_penalty=0.3
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            last_error = e
            logging.error(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt == retry_attempts - 1:
                return f"Error generating {language} description: {e}"
            time.sleep(2 ** attempt)  # Exponential backoff
    # Reached only when retry_attempts < 1: keep the declared str contract
    # instead of silently falling through and returning None.
    return f"Error generating {language} description: {last_error}"
def save_checkpoint(df: pd.DataFrame, batch_num: int, checkpoint_dir: str = 'checkpoints') -> None:
    """
    Persist the working DataFrame to disk so a later run can resume.

    Args:
        df: DataFrame to checkpoint
        batch_num: Current batch number (embedded in the file name)
        checkpoint_dir: Directory to store checkpoints (created on demand)
    """
    target = os.path.join(checkpoint_dir, f'checkpoint_batch_{batch_num}.pkl')
    # Ensure the destination directory exists before writing.
    os.makedirs(checkpoint_dir, exist_ok=True)
    df.to_pickle(target)
    logging.info(f"Saved checkpoint at batch {batch_num}")
def load_latest_checkpoint(checkpoint_dir: str = 'checkpoints') -> tuple[pd.DataFrame | None, int]:
"""
Load the most recent checkpoint if it exists.
Args:
checkpoint_dir: Directory containing checkpoints
Returns:
tuple: (DataFrame or None, last completed batch number)
"""
if not os.path.exists(checkpoint_dir):
return None, 0
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, 'checkpoint_batch_*.pkl'))
if not checkpoint_files:
return None, 0
latest_checkpoint = max(checkpoint_files)
batch_num = int(latest_checkpoint.split('_')[-1].split('.')[0])
logging.info(f"Loading checkpoint from batch {batch_num}")
return pd.read_pickle(latest_checkpoint), batch_num
@traceable(run_type="chain")
def enrich_descriptions(df: pd.DataFrame, batch_size: int = 10, checkpoint_frequency: int = 5) -> pd.DataFrame:
    """
    Enrich DataFrame with product descriptions in both languages.

    Args:
        df: Input DataFrame
        batch_size: Number of items to process in each batch
        checkpoint_frequency: Number of batches between checkpoints

    Returns:
        pd.DataFrame: Enriched DataFrame with 'description_en' and
        'description_es' columns populated.

    Raises:
        ValueError: If the row count changes during enrichment.
    """
    logging.info("Starting description generation process...")
    initial_row_count = len(df)
    df = df.copy()
    # Try to load from checkpoint
    checkpoint_df, last_batch = load_latest_checkpoint()
    if checkpoint_df is not None:
        df = checkpoint_df
        # Batch numbers are 1-based: batch k covers rows (k-1)*batch_size
        # .. k*batch_size-1, so the first unprocessed row after completing
        # batch `last_batch` is last_batch * batch_size. The previous code
        # used (last_batch + 1) * batch_size, silently skipping one whole
        # batch of rows on every resume.
        start_idx = last_batch * batch_size
        logging.info(f"Resuming from batch {last_batch + 1}")
    else:
        start_idx = 0
    for batch_num, i in enumerate(tqdm(range(start_idx, len(df), batch_size)), start=last_batch + 1):
        batch = df.iloc[i:i + batch_size]
        df.loc[batch.index, 'description_en'] = batch.apply(
            lambda row: generate_description(row, 'en'), axis=1
        )
        df.loc[batch.index, 'description_es'] = batch.apply(
            lambda row: generate_description(row, 'es'), axis=1
        )
        if batch_num % checkpoint_frequency == 0:
            save_checkpoint(df, batch_num)
        time.sleep(1)  # Rate limiting between batches
    # Validate row counts and description completeness
    final_row_count = len(df)
    if final_row_count != initial_row_count:
        raise ValueError(f"Row count mismatch: Started with {initial_row_count} rows, ended with {final_row_count} rows")
    # Check for missing descriptions
    missing_en = df['description_en'].isna().sum()
    missing_es = df['description_es'].isna().sum()
    if missing_en > 0 or missing_es > 0:
        logging.warning(f"Missing descriptions detected: English: {missing_en}, Spanish: {missing_es}")
    return df
def main():
    """Load the product dataset, enrich it with descriptions, and save it."""
    try:
        source = '2nd_clean_comida_dogs_filtered.pkl'
        frame = pd.read_pickle(source)
        n_before = len(frame)
        logging.info(f"Loaded dataset with {n_before} records")

        # Generate the bilingual descriptions.
        enriched = enrich_descriptions(frame)

        # Guard against accidental row loss/duplication before persisting.
        if len(enriched) != n_before:
            raise ValueError(f"Row count mismatch: Original had {n_before} rows, enriched has {len(enriched)} rows")

        destination = '3rd_clean_comida_dogs_enriched_multilingual_2.pkl'
        enriched.to_pickle(destination)
        logging.info(f"Enriched dataset saved to {destination}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise


if __name__ == "__main__":
    main()