Spaces:
Sleeping
Sleeping
import logging
from typing import Dict, Any
import time
from tqdm import tqdm
import openai
import pandas as pd
from dotenv import load_dotenv, find_dotenv
import os
import glob
from langsmith import traceable
from langsmith import Client
from langsmith.wrappers import wrap_openai

# Configure logging: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load environment variables
#_ = load_dotenv(find_dotenv())
# Get the keys from Hugging Face secrets
#openai.api_key = st.secrets["OPENAI_API_KEY"].strip()
#os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
#os.environ["LANGCHAIN_TRACING_V2"] = st.secrets["LANGCHAIN_TRACING_V2"]

# Initialize LangSmith client (reads LANGCHAIN_API_KEY from the environment).
langsmith_client = Client()

# Wrap OpenAI client with LangSmith so completion calls are traced.
# NOTE(review): wrap_openai is documented for OpenAI client *instances*;
# here the `openai` module object is passed — confirm this proxies
# `openai.chat.completions.create` correctly in the installed versions.
openai = wrap_openai(openai)
def create_product_prompt(row: pd.Series, language: str) -> str:
    """
    Build the marketing-description prompt for one product row.

    Args:
        row: DataFrame row providing the fields referenced by the template
            (brand, product_name, dog_type, food_type, weight, price).
        language: Target language key, 'en' or 'es'.

    Returns:
        str: The template for *language* with the row's fields substituted.

    Raises:
        KeyError: If *language* is not 'en'/'es' or a template field is
            missing from *row*.
    """
    english_template = """Create a compelling and detailed marketing description for a premium dog food product.
Include the following information and expand with your knowledge:
• Brand: {brand}
• Product Name: {product_name}
• Specifically designed for: {dog_type}
• Type: {food_type}
• Package Size: {weight} kg
• Price Point: ${price:.2f}
Focus on:
1. Key nutritional benefits
2. Quality of ingredients
3. Health advantages
4. Why it's perfect for the specified dog type
5. Value proposition
Make it engaging and persuasive while maintaining accuracy."""

    spanish_template = """Crea una descripción comercial detallada y convincente para un producto premium de alimentación canina.
Incluye la siguiente información y expándela con tu conocimiento:
• Marca: {brand}
• Nombre del Producto: {product_name}
• Diseñado específicamente para: {dog_type}
• Tipo: {food_type}
• Tamaño del Paquete: {weight} kg
• Precio: ${price:.2f}
Enfócate en:
1. Beneficios nutricionales clave
2. Calidad de los ingredientes
3. Ventajas para la salud
4. Por qué es perfecto para el tipo de perro especificado
5. Propuesta de valor
Hazlo atractivo y persuasivo mientras mantienes la precisión."""

    templates = {'en': english_template, 'es': spanish_template}
    # str.format with the row's mapping; extra row fields are ignored.
    return templates[language].format(**row.to_dict())
def generate_description(row: pd.Series, language: str, retry_attempts: int = 3) -> str:
    """
    Generate a product description with the OpenAI chat API, retrying on failure.

    Args:
        row: DataFrame row containing product information.
        language: Target language ('en' or 'es').
        retry_attempts: Number of attempts before giving up.

    Returns:
        str: The generated description, or an error-message string if every
        attempt failed. (The original returned None implicitly when
        retry_attempts < 1; this version always returns a str as annotated.)
    """
    prompt = create_product_prompt(row, language)
    last_error: Exception | None = None
    for attempt in range(retry_attempts):
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150,
                temperature=0.7,
                presence_penalty=0.3,   # discourage revisiting the same topics
                frequency_penalty=0.3   # discourage verbatim repetition
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            last_error = e
            logging.error(f"Attempt {attempt + 1} failed: {str(e)}")
            # Back off only between attempts; no pointless sleep after the last one.
            if attempt < retry_attempts - 1:
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, 4s, ...
    # All attempts failed (or retry_attempts < 1): return an error marker so the
    # caller can store it in the DataFrame instead of crashing the batch run.
    return f"Error generating {language} description: {last_error}"
def save_checkpoint(df: pd.DataFrame, batch_num: int, checkpoint_dir: str = 'checkpoints') -> None:
    """
    Persist the working DataFrame so an interrupted run can be resumed.

    Args:
        df: DataFrame to checkpoint.
        batch_num: Current batch number (encoded in the checkpoint file name).
        checkpoint_dir: Directory in which checkpoints are stored.
    """
    # Make sure the target directory exists before writing.
    os.makedirs(checkpoint_dir, exist_ok=True)
    filename = f'checkpoint_batch_{batch_num}.pkl'
    df.to_pickle(os.path.join(checkpoint_dir, filename))
    logging.info(f"Saved checkpoint at batch {batch_num}")
| def load_latest_checkpoint(checkpoint_dir: str = 'checkpoints') -> tuple[pd.DataFrame | None, int]: | |
| """ | |
| Load the most recent checkpoint if it exists. | |
| Args: | |
| checkpoint_dir: Directory containing checkpoints | |
| Returns: | |
| tuple: (DataFrame or None, last completed batch number) | |
| """ | |
| if not os.path.exists(checkpoint_dir): | |
| return None, 0 | |
| checkpoint_files = glob.glob(os.path.join(checkpoint_dir, 'checkpoint_batch_*.pkl')) | |
| if not checkpoint_files: | |
| return None, 0 | |
| latest_checkpoint = max(checkpoint_files) | |
| batch_num = int(latest_checkpoint.split('_')[-1].split('.')[0]) | |
| logging.info(f"Loading checkpoint from batch {batch_num}") | |
| return pd.read_pickle(latest_checkpoint), batch_num | |
def enrich_descriptions(df: pd.DataFrame, batch_size: int = 10, checkpoint_frequency: int = 5) -> pd.DataFrame:
    """
    Enrich the DataFrame with generated descriptions in English and Spanish.

    Args:
        df: Input DataFrame; each row is one product.
        batch_size: Number of rows processed per batch.
        checkpoint_frequency: Number of batches between checkpoints.

    Returns:
        pd.DataFrame: Copy of *df* with 'description_en' and 'description_es'
        columns populated.

    Raises:
        ValueError: If the row count changed during processing.
    """
    logging.info("Starting description generation process...")
    initial_row_count = len(df)
    df = df.copy()

    # Try to resume from the latest checkpoint, if any.
    # NOTE(review): resuming assumes the same batch_size as the checkpointed
    # run — confirm before resuming with different parameters.
    checkpoint_df, last_batch = load_latest_checkpoint()
    if checkpoint_df is not None:
        df = checkpoint_df
        # Batch k (1-based) covers rows [(k-1)*batch_size, k*batch_size), and
        # the checkpoint was written AFTER batch `last_batch` completed, so the
        # next unprocessed row is last_batch * batch_size.
        # BUGFIX: the original used (last_batch + 1) * batch_size, silently
        # skipping one whole batch of rows on every resume.
        start_idx = last_batch * batch_size
        logging.info(f"Resuming from batch {last_batch + 1}")
    else:
        start_idx = 0

    total_batches = (len(df) + batch_size - 1) // batch_size
    logging.info(f"Processing up to {total_batches} batches of size {batch_size}")
    for batch_num, i in enumerate(tqdm(range(start_idx, len(df), batch_size)), start=last_batch + 1):
        batch = df.iloc[i:i + batch_size]
        df.loc[batch.index, 'description_en'] = batch.apply(
            lambda row: generate_description(row, 'en'), axis=1
        )
        df.loc[batch.index, 'description_es'] = batch.apply(
            lambda row: generate_description(row, 'es'), axis=1
        )
        # Periodic checkpoint so a crash loses at most checkpoint_frequency batches.
        if batch_num % checkpoint_frequency == 0:
            save_checkpoint(df, batch_num)
        time.sleep(1)  # Rate limiting between batches

    # Validate row counts: enrichment must never add or drop rows.
    final_row_count = len(df)
    if final_row_count != initial_row_count:
        raise ValueError(f"Row count mismatch: Started with {initial_row_count} rows, ended with {final_row_count} rows")

    # Warn (don't fail) on missing descriptions so partial output is inspectable.
    missing_en = df['description_en'].isna().sum()
    missing_es = df['description_es'].isna().sum()
    if missing_en > 0 or missing_es > 0:
        logging.warning(f"Missing descriptions detected: English: {missing_en}, Spanish: {missing_es}")
    return df
def main():
    """Load the product dataset, enrich it with descriptions, and save it."""
    try:
        # Load the source dataset.
        file_path = '2nd_clean_comida_dogs_filtered.pkl'
        data = pd.read_pickle(file_path)
        initial_count = len(data)
        logging.info(f"Loaded dataset with {initial_count} records")

        # Generate the bilingual descriptions.
        enriched_data = enrich_descriptions(data)

        # Guard against any row-count drift before writing the output.
        if len(enriched_data) != initial_count:
            raise ValueError(f"Row count mismatch: Original had {initial_count} rows, enriched has {len(enriched_data)} rows")

        # Persist the enriched dataset.
        output_path = '3rd_clean_comida_dogs_enriched_multilingual_2.pkl'
        enriched_data.to_pickle(output_path)
        logging.info(f"Enriched dataset saved to {output_path}")
    except Exception as e:
        # Log and re-raise so the process exits non-zero on failure.
        logging.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()