tueniuu's picture
Upload 4 files
8c15eb9 verified
import pandas as pd
import psycopg2
import numpy as np
from sentence_transformers import SentenceTransformer
import sys
import os
import time
import json
import random
import google.generativeai as genai
from google.api_core import exceptions
# --- Gemini description generation (JSON output, retries with exponential backoff) ---
def generate_gemini_description_json(row, model):
# This function with the retry logic is still needed and is unchanged.
location_data = {
'name': row.get('name', ''),
'category': row.get('primary_category', ''),
'tags': row.get('tags', '')
}
location_data = {k: v for k, v in location_data.items() if v}
data_string = ", ".join([f"{key}: {value}" for key, value in location_data.items()])
prompt = f"""
You are a data enrichment specialist. Your task is to generate a JSON object containing a creative, appealing description for a location in Busan, South Korea.
**Instructions:**
- Use the provided data to create an engaging, fluent paragraph between 40 and 70 words.
- Your output MUST be a valid JSON object with a single key: "description".
**Input Data:**
{data_string}
**Output JSON:**
"""
max_retries = 5
base_delay = 60
for attempt in range(max_retries):
try:
generation_config = genai.types.GenerationConfig(response_mime_type="application/json")
response = model.generate_content(prompt, generation_config=generation_config)
response_json = json.loads(response.text)
description = response_json['description']
return description.strip()
except exceptions.ResourceExhausted as e:
print(f" - Rate limit hit for ID {row.get('id', 'N/A')}. Waiting... (Attempt {attempt + 1}/{max_retries})")
delay = base_delay * (2 ** attempt) + random.uniform(0, 5)
print(f" ...backing off for {delay:.2f} seconds.")
time.sleep(delay)
except (json.JSONDecodeError, KeyError) as e:
print(f" - JSON Parsing Error for ID {row.get('id', 'N/A')}: {e}. Retrying...")
time.sleep(5)
except Exception as e:
print(f" - Unexpected API Error for ID {row.get('id', 'N/A')}: {e}. Retrying with backoff...")
delay = base_delay * (2 ** attempt) + random.uniform(0, 5)
time.sleep(delay)
print(f" - All retries failed for ID {row.get('id', 'N/A')}. Using fallback.")
return f"{row.get('name', '')}. Tags include: {row.get('tags', '')}"
# --- THE MAIN ORCHESTRATION SCRIPT ---
def _write_progress_file(df, path):
    """Write *df* to *path* as CSV via a temp file so the swap is atomic."""
    tmp_path = f"{path}.tmp"
    df.to_csv(tmp_path, index=False)
    # os.replace is atomic on the same filesystem, so an interruption during
    # the write can never leave a truncated progress file behind.
    os.replace(tmp_path, path)


def _combine_progress(df_progress, new_rows):
    """Return previously saved rows followed by *new_rows* as one DataFrame."""
    df_new = pd.DataFrame(new_rows, columns=['id', 'description'])
    if df_progress.empty:
        return df_new
    return pd.concat([df_progress, df_new], ignore_index=True)


def generate_and_save_embeddings(db_params, api_key, save_every=25):
    """Generate Gemini descriptions for all locations, then embed them.

    Workflow:
      1. Load every row of the ``locations`` table.
      2. Skip ids already present in ``descriptions_progress.csv`` (resume).
      3. Generate a description for each remaining row, checkpointing the
         CSV every ``save_every`` rows and once more when the loop ends.
      4. Encode all descriptions with ``all-MiniLM-L6-v2`` and write
         ``location_embeddings.npy`` and ``location_ids.npy``.

    If the run is interrupted with Ctrl+C, progress is saved and the function
    returns without producing embeddings; rerun it to resume.

    Args:
        db_params: Keyword arguments passed to ``psycopg2.connect``.
        api_key: Gemini API key passed to ``genai.configure``.
        save_every: Number of newly generated descriptions between progress
            checkpoints. ``0`` or ``None`` disables periodic checkpoints
            (progress is then only written when the loop ends).
    """
    # Configure the Gemini API
    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel('gemini-2.5-pro')

    PROGRESS_FILE = 'descriptions_progress.csv'

    # 1. Fetch ALL locations from the database first
    conn = psycopg2.connect(**db_params)
    try:
        sql_query = "SELECT id, name, tags, primary_category, meal_type FROM locations ORDER BY id;"
        df_all_locations = pd.read_sql_query(sql_query, conn)
    finally:
        # Close the connection even when the query fails.
        conn.close()
    print(f"Total locations to process: {len(df_all_locations)}")

    # 2. Check for an existing progress file
    processed_ids = set()
    if os.path.exists(PROGRESS_FILE):
        print(f"Found existing progress file: '{PROGRESS_FILE}'. Resuming...")
        df_progress = pd.read_csv(PROGRESS_FILE)
        # A repeated id would duplicate rows in the final merge and therefore
        # in the embedding matrix; keep the most recent description.
        df_progress = df_progress.drop_duplicates(subset='id', keep='last')
        processed_ids = set(df_progress['id'])
        print(f"{len(processed_ids)} locations already have descriptions.")
    else:
        print("No progress file found. Starting a new session.")
        df_progress = pd.DataFrame(columns=['id', 'description'])

    # 3. Filter out the locations that are already processed
    df_to_process = df_all_locations[~df_all_locations['id'].isin(processed_ids)]
    print(f"{len(df_to_process)} locations remaining to be processed.")
    if df_to_process.empty:
        print("All locations have already been processed.")

    # --- GRACEFUL SHUTDOWN AND INCREMENTAL SAVING ---
    newly_processed_data = []
    interrupted = False
    try:
        if not df_to_process.empty:
            print("\nStarting Gemini description generation...")
        for _, row in df_to_process.iterrows():
            print(f"Processing ID: {row['id']}...")
            description = generate_gemini_description_json(row, gemini_model)
            newly_processed_data.append({'id': row['id'], 'description': description})
            # Periodic checkpoint: a hard kill (which skips ``finally``) then
            # loses at most ``save_every`` descriptions.
            if save_every and len(newly_processed_data) % save_every == 0:
                _write_progress_file(
                    _combine_progress(df_progress, newly_processed_data),
                    PROGRESS_FILE,
                )
    except KeyboardInterrupt:
        interrupted = True
        print("\n--- KeyboardInterrupt detected! Saving progress before exiting. ---")
    finally:
        if newly_processed_data:
            df_progress = _combine_progress(df_progress, newly_processed_data)
            _write_progress_file(df_progress, PROGRESS_FILE)
            print(f"\nSuccessfully saved {len(newly_processed_data)} new descriptions to '{PROGRESS_FILE}'.")
        else:
            print("\nNo new descriptions were generated in this session.")

    if interrupted:
        # Do not build embeddings from a partial run: that would overwrite the
        # .npy outputs with placeholder text for every unprocessed location.
        print("Run the script again to resume.")
        return

    # --- FINAL EMBEDDING GENERATION (runs after the loop is complete) ---
    print("\n--- All descriptions are now generated. Proceeding to create embeddings. ---")
    # Merge the final descriptions with the original data to ensure correct order
    df_final = df_all_locations.merge(df_progress, on='id', how='left')
    # Check for any locations that might have been missed
    if df_final['description'].isnull().any():
        print("WARNING: Some locations are missing descriptions. Using fallback.")
        # Assign the result back: ``fillna(inplace=True)`` on a column
        # selection does not update the frame under pandas Copy-on-Write.
        df_final['description'] = df_final['description'].fillna("No description available.")

    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = df_final['description'].tolist()
    print(f"Encoding {len(sentences)} final descriptions into vectors...")
    location_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
    location_ids = df_final['id'].to_numpy()
    np.save('location_embeddings.npy', location_embeddings)
    np.save('location_ids.npy', location_ids)
    print("\nEmbeddings from Gemini descriptions generated successfully!")
    print(f"Embeddings matrix shape: {location_embeddings.shape}")
if __name__ == '__main__':
    # Secrets are read from the environment so they are never committed with
    # the source. SECURITY NOTE: the API key and database password that were
    # previously hard-coded here must be treated as leaked and rotated.
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
    if not GEMINI_API_KEY or GEMINI_API_KEY == "YOUR_API_KEY_HERE":
        print(
            "ERROR: Please set the GEMINI_API_KEY environment variable to your actual Gemini API key.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Connection settings follow the standard libpq variable names; the
    # non-secret ones default to the values this script used before.
    db_connection_params = {
        "host": os.environ.get("PGHOST", "localhost"),
        "database": os.environ.get("PGDATABASE", "recommendation_locations"),
        "user": os.environ.get("PGUSER", "postgres"),
        "port": os.environ.get("PGPORT", "5432"),
    }
    # Only pass a password when one is provided, so that libpq can otherwise
    # fall back to its own mechanisms (e.g. a ~/.pgpass file).
    db_password = os.environ.get("PGPASSWORD")
    if db_password:
        db_connection_params["password"] = db_password

    generate_and_save_embeddings(db_connection_params, GEMINI_API_KEY)