File size: 7,003 Bytes
8c15eb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import pandas as pd
import psycopg2
import numpy as np
from sentence_transformers import SentenceTransformer
import sys
import os
import time
import json
import random
import google.generativeai as genai
from google.api_core import exceptions

# --- (The 'generate_gemini_description_json' function remains the same as the last version with exponential backoff) ---
def generate_gemini_description_json(row, model):
    """Ask Gemini for a JSON-wrapped creative description of one location row.

    Rate-limit and unexpected API errors trigger exponential backoff with
    jitter; malformed JSON gets a short fixed pause. After all retries are
    exhausted, a plain name-plus-tags fallback string is returned.
    """
    fields = {
        'name': row.get('name', ''),
        'category': row.get('primary_category', ''),
        'tags': row.get('tags', ''),
    }
    # Only non-empty fields make it into the prompt.
    data_string = ", ".join(f"{key}: {value}" for key, value in fields.items() if value)
    prompt = f"""

    You are a data enrichment specialist. Your task is to generate a JSON object containing a creative, appealing description for a location in Busan, South Korea.

    **Instructions:**

    - Use the provided data to create an engaging, fluent paragraph between 40 and 70 words.

    - Your output MUST be a valid JSON object with a single key: "description".

    **Input Data:**

    {data_string} 

    **Output JSON:**

    """
    max_retries = 5
    base_delay = 60
    for attempt in range(max_retries):
        try:
            cfg = genai.types.GenerationConfig(response_mime_type="application/json")
            response = model.generate_content(prompt, generation_config=cfg)
            parsed = json.loads(response.text)
            return parsed['description'].strip()
        except exceptions.ResourceExhausted:
            # Quota exhausted: exponential backoff plus jitter before retrying.
            print(f"  - Rate limit hit for ID {row.get('id', 'N/A')}. Waiting... (Attempt {attempt + 1}/{max_retries})")
            wait = base_delay * (2 ** attempt) + random.uniform(0, 5)
            print(f"    ...backing off for {wait:.2f} seconds.")
            time.sleep(wait)
        except (json.JSONDecodeError, KeyError) as e:
            # Model replied but not with the expected JSON shape; brief pause.
            print(f"  - JSON Parsing Error for ID {row.get('id', 'N/A')}: {e}. Retrying...")
            time.sleep(5)
        except Exception as e:
            # Anything else from the API is treated like a rate limit: back off.
            print(f"  - Unexpected API Error for ID {row.get('id', 'N/A')}: {e}. Retrying with backoff...")
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 5))
    print(f"  - All retries failed for ID {row.get('id', 'N/A')}. Using fallback.")
    return f"{row.get('name', '')}. Tags include: {row.get('tags', '')}"


# --- THE MAIN ORCHESTRATION SCRIPT ---
def generate_and_save_embeddings(db_params, api_key):
    """Generate Gemini descriptions for every location, then embed them.

    Resumes from 'descriptions_progress.csv' when it exists, saves progress
    even on Ctrl+C, and finally writes 'location_embeddings.npy' and
    'location_ids.npy' next to the script.

    Args:
        db_params: keyword arguments forwarded to psycopg2.connect().
        api_key: Gemini API key used to configure google.generativeai.
    """
    # Configure the Gemini API
    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel('gemini-2.5-pro')

    PROGRESS_FILE = 'descriptions_progress.csv'

    # 1. Fetch ALL locations from the database first
    df_all_locations = _fetch_all_locations(db_params)
    print(f"Total locations to process: {len(df_all_locations)}")

    # 2. Check for an existing progress file
    df_progress, processed_ids = _load_progress(PROGRESS_FILE)

    # 3. Filter out the locations that are already processed
    df_to_process = df_all_locations[~df_all_locations['id'].isin(processed_ids)]
    print(f"{len(df_to_process)} locations remaining to be processed.")

    if df_to_process.empty:
        print("All locations have already been processed.")

    # 4. Generate any missing descriptions (Ctrl+C-safe, progress persisted)
    df_progress = _generate_descriptions(df_to_process, df_progress, gemini_model, PROGRESS_FILE)

    # 5. Encode everything and write the .npy artifacts
    _build_and_save_embeddings(df_all_locations, df_progress)


def _fetch_all_locations(db_params):
    """Return every location row (id, name, tags, category, meal_type), ordered by id."""
    conn = psycopg2.connect(**db_params)
    try:
        sql_query = "SELECT id, name, tags, primary_category, meal_type FROM locations ORDER BY id;"
        return pd.read_sql_query(sql_query, conn)
    finally:
        # Close the connection even if the query raises (original leaked it on error).
        conn.close()


def _load_progress(progress_file):
    """Load an existing progress CSV, or start fresh.

    Returns:
        (df_progress, processed_ids): the progress DataFrame with columns
        ['id', 'description'] and the set of ids it already covers.
    """
    if os.path.exists(progress_file):
        print(f"Found existing progress file: '{progress_file}'. Resuming...")
        df_progress = pd.read_csv(progress_file)
        processed_ids = set(df_progress['id'])
        print(f"{len(processed_ids)} locations already have descriptions.")
    else:
        print("No progress file found. Starting a new session.")
        df_progress = pd.DataFrame(columns=['id', 'description'])
        processed_ids = set()
    return df_progress, processed_ids


def _generate_descriptions(df_to_process, df_progress, gemini_model, progress_file):
    """Call Gemini for each unprocessed row and persist combined progress.

    A KeyboardInterrupt stops generation but still saves what was produced;
    the (possibly extended) progress DataFrame is returned either way.
    """
    newly_processed_data = []
    try:
        if not df_to_process.empty:
            print("\nStarting Gemini description generation...")
            for _, row in df_to_process.iterrows():
                print(f"Processing ID: {row['id']}...")
                description = generate_gemini_description_json(row, gemini_model)
                newly_processed_data.append({'id': row['id'], 'description': description})
    except KeyboardInterrupt:
        print("\n--- KeyboardInterrupt detected! Saving progress before exiting. ---")
    finally:
        if newly_processed_data:
            df_new_progress = pd.DataFrame(newly_processed_data)
            df_progress = pd.concat([df_progress, df_new_progress], ignore_index=True)
            df_progress.to_csv(progress_file, index=False)
            print(f"\nSuccessfully saved {len(newly_processed_data)} new descriptions to '{progress_file}'.")
        else:
            print("\nNo new descriptions were generated in this session.")
    return df_progress


def _build_and_save_embeddings(df_all_locations, df_progress):
    """Encode the final descriptions with SBERT and save embeddings + ids as .npy."""
    print("\n--- All descriptions are now generated. Proceeding to create embeddings. ---")

    # Merge the final descriptions with the original data to ensure correct order
    df_final = df_all_locations.merge(df_progress, on='id', how='left')

    # Check for any locations that might have been missed
    if df_final['description'].isnull().any():
        print("WARNING: Some locations are missing descriptions. Using fallback.")
        # Assign the filled column back instead of fillna(inplace=True) on a
        # column slice — inplace on a slice is deprecated chained assignment
        # in modern pandas and may silently operate on a copy.
        df_final['description'] = df_final['description'].fillna("No description available.")

    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = df_final['description'].tolist()
    print(f"Encoding {len(sentences)} final descriptions into vectors...")

    location_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    location_ids = df_final['id'].to_numpy()

    np.save('location_embeddings.npy', location_embeddings)
    np.save('location_ids.npy', location_ids)

    print("\nEmbeddings from Gemini descriptions generated successfully!")
    print(f"Embeddings matrix shape: {location_embeddings.shape}")

if __name__ == '__main__':
    # SECURITY: the original committed a live Gemini API key and the database
    # password directly in source. Credentials now come from the environment;
    # rotate any key/password that was previously checked in.
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

    if not GEMINI_API_KEY:
        print("ERROR: Set the GEMINI_API_KEY environment variable to your Gemini API key.")
    else:
        db_connection_params = {
            "host": os.environ.get("DB_HOST", "localhost"),
            "database": os.environ.get("DB_NAME", "recommendation_locations"),
            "user": os.environ.get("DB_USER", "postgres"),
            "password": os.environ.get("DB_PASSWORD", ""),
            "port": os.environ.get("DB_PORT", "5432"),
        }
        generate_and_save_embeddings(db_connection_params, GEMINI_API_KEY)