Spaces:

tueniuu
/

itinerary-recommender-api

Sleeping

App Files Files Community

itinerary-recommender-api / data_processing /generate_embeddings.py

tueniuu

Upload 4 files

8c15eb9 verified 4 months ago

raw

history blame contribute delete

7 kB

	import pandas as pd
	import psycopg2
	import numpy as np
	from sentence_transformers import SentenceTransformer
	import sys
	import os
	import time
	import json
	import random
	import google.generativeai as genai
	from google.api_core import exceptions

	# --- GEMINI DESCRIPTION GENERATION (with retries and exponential backoff) ---
	def generate_gemini_description_json(row, model):
	    """Generate a short description for one location via Gemini.

	    The model is asked for a JSON object with a single "description" key.
	    Failed attempts are retried up to five times; rate-limit and unexpected
	    errors use exponential backoff with jitter, malformed responses use a
	    short fixed delay. If every attempt fails, a plain-text fallback built
	    from the row's own fields is returned instead of raising.

	    Args:
	        row: Mapping-like record exposing ``.get`` (e.g. a pandas Series or a
	            dict). The keys read are 'id', 'name', 'primary_category' and
	            'tags'.
	        model: Object exposing ``generate_content(prompt, generation_config=...)``
	            whose result has a ``.text`` attribute (the caller in this file
	            passes a ``genai.GenerativeModel``).

	    Returns:
	        The stripped description string, or the fallback text when all
	        retries are exhausted.
	    """
	    row_id = row.get('id', 'N/A')
	    location_data = {
	        'name': row.get('name', ''),
	        'category': row.get('primary_category', ''),
	        'tags': row.get('tags', ''),
	    }
	    # Drop empty values. NaN is truthy, so it is filtered explicitly
	    # (v != v is only true for NaN) to keep "nan" out of the prompt.
	    location_data = {
	        k: v for k, v in location_data.items()
	        if v and not (isinstance(v, float) and v != v)
	    }
	    data_string = ", ".join([f"{key}: {value}" for key, value in location_data.items()])
	    prompt = f"""
	You are a data enrichment specialist. Your task is to generate a JSON object containing a creative, appealing description for a location in Busan, South Korea.
	Instructions:
	- Use the provided data to create an engaging, fluent paragraph between 40 and 70 words.
	- Your output MUST be a valid JSON object with a single key: "description".
	Input Data:
	{data_string}
	Output JSON:
	"""
	    max_retries = 5
	    base_delay = 60
	    for attempt in range(max_retries):
	        try:
	            generation_config = genai.types.GenerationConfig(response_mime_type="application/json")
	            response = model.generate_content(prompt, generation_config=generation_config)
	            response_json = json.loads(response.text)
	            # Route structurally wrong payloads (e.g. a JSON list, or a
	            # non-string description) to the cheap parse-error retry rather
	            # than letting TypeError/AttributeError trigger the long backoff.
	            if not isinstance(response_json, dict):
	                raise KeyError('description')
	            description = response_json['description']
	            if not isinstance(description, str):
	                raise KeyError('description')
	            return description.strip()
	        except exceptions.ResourceExhausted:
	            print(f" - Rate limit hit for ID {row_id}. Waiting... (Attempt {attempt + 1}/{max_retries})")
	            delay = base_delay * (2 ** attempt) + random.uniform(0, 5)
	        except (json.JSONDecodeError, KeyError) as e:
	            print(f" - JSON Parsing Error for ID {row_id}: {e}. Retrying...")
	            delay = 5
	        except Exception as e:
	            # Deliberately broad: one bad row must not abort the whole batch;
	            # the fallback below covers persistent failures.
	            print(f" - Unexpected API Error for ID {row_id}: {e}. Retrying with backoff...")
	            delay = base_delay * (2 ** attempt) + random.uniform(0, 5)

	        # No point sleeping (up to ~16 minutes) after the final attempt.
	        if attempt == max_retries - 1:
	            break
	        print(f" ...backing off for {delay:.2f} seconds.")
	        time.sleep(delay)

	    print(f" - All retries failed for ID {row_id}. Using fallback.")
	    # Build the fallback from the cleaned values so missing fields do not
	    # end up as the literal text "None"/"nan" in the embedded description.
	    name = location_data.get('name', '')
	    tags = location_data.get('tags')
	    if tags:
	        return f"{name}. Tags include: {tags}"
	    return f"{name}."


	# --- THE MAIN ORCHESTRATION SCRIPT ---
	def generate_and_save_embeddings(db_params, api_key):
	    """Generate Gemini descriptions for all locations and save their embeddings.

	    Steps:
	      1. Load every row of the ``locations`` table.
	      2. Skip ids already present in 'descriptions_progress.csv' (resume).
	      3. Generate a description for each remaining row; whatever was
	         produced is written back to the progress file even if the loop is
	         interrupted or fails.
	      4. Encode all descriptions with the 'all-MiniLM-L6-v2' sentence
	         transformer and save 'location_embeddings.npy' and
	         'location_ids.npy' in the current working directory.

	    If the run is interrupted with Ctrl-C, progress is saved and the function
	    returns without writing embeddings, so a partial run never produces
	    embedding files padded with placeholder descriptions.

	    Args:
	        db_params: Keyword arguments forwarded to ``psycopg2.connect``.
	        api_key: Gemini API key passed to ``genai.configure``.

	    Returns:
	        None. Results are written to disk.
	    """
	    genai.configure(api_key=api_key)
	    gemini_model = genai.GenerativeModel('gemini-2.5-pro')

	    PROGRESS_FILE = 'descriptions_progress.csv'

	    # 1. Fetch ALL locations. Close the connection even if the query fails.
	    conn = psycopg2.connect(**db_params)
	    try:
	        sql_query = "SELECT id, name, tags, primary_category, meal_type FROM locations ORDER BY id;"
	        df_all_locations = pd.read_sql_query(sql_query, conn)
	    finally:
	        conn.close()
	    print(f"Total locations to process: {len(df_all_locations)}")

	    # 2. Load previous progress, if any.
	    processed_ids = set()
	    if os.path.exists(PROGRESS_FILE):
	        print(f"Found existing progress file: '{PROGRESS_FILE}'. Resuming...")
	        df_progress = pd.read_csv(PROGRESS_FILE)
	        processed_ids = set(df_progress['id'])
	        print(f"{len(processed_ids)} locations already have descriptions.")
	    else:
	        print("No progress file found. Starting a new session.")
	        df_progress = pd.DataFrame(columns=['id', 'description'])

	    # 3. Keep only the locations that still need a description.
	    df_to_process = df_all_locations[~df_all_locations['id'].isin(processed_ids)]
	    print(f"{len(df_to_process)} locations remaining to be processed.")

	    if df_to_process.empty:
	        print("All locations have already been processed.")

	    # 4. Generate descriptions; the finally block persists partial results.
	    newly_processed_data = []
	    interrupted = False
	    try:
	        if not df_to_process.empty:
	            print("\nStarting Gemini description generation...")
	            for _, row in df_to_process.iterrows():
	                print(f"Processing ID: {row['id']}...")
	                description = generate_gemini_description_json(row, gemini_model)
	                newly_processed_data.append({'id': row['id'], 'description': description})
	    except KeyboardInterrupt:
	        interrupted = True
	        print("\n--- KeyboardInterrupt detected! Saving progress before exiting. ---")
	    finally:
	        if newly_processed_data:
	            df_new_progress = pd.DataFrame(newly_processed_data)
	            # Concatenating with an empty placeholder frame degrades the id
	            # dtype to object (and is deprecated in recent pandas), so only
	            # concatenate when there is earlier progress to keep.
	            if df_progress.empty:
	                df_combined = df_new_progress
	            else:
	                df_combined = pd.concat([df_progress, df_new_progress], ignore_index=True)
	            df_combined.to_csv(PROGRESS_FILE, index=False)
	            print(f"\nSuccessfully saved {len(newly_processed_data)} new descriptions to '{PROGRESS_FILE}'.")
	            df_progress = df_combined
	        else:
	            print("\nNo new descriptions were generated in this session.")

	    if interrupted:
	        # Honour the "before exiting" message above: do not build embeddings
	        # from an incomplete set of descriptions.
	        print("Run interrupted; skipping embedding generation. Re-run to resume.")
	        return

	    # --- FINAL EMBEDDING GENERATION (runs after the loop is complete) ---
	    print("\n--- All descriptions are now generated. Proceeding to create embeddings. ---")

	    # One description per id, so the left merge keeps exactly one row per
	    # location and the saved ids stay aligned with the embedding rows.
	    df_descriptions = df_progress[['id', 'description']].drop_duplicates(subset='id', keep='last')
	    df_final = df_all_locations.merge(df_descriptions, on='id', how='left')

	    if df_final['description'].isnull().any():
	        print("WARNING: Some locations are missing descriptions. Using fallback.")
	        # Assign the result back: in-place fillna on a selected column is
	        # chained assignment and does not update the frame under
	        # pandas Copy-on-Write.
	        df_final['description'] = df_final['description'].fillna("No description available.")

	    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
	    sentences = df_final['description'].tolist()
	    print(f"Encoding {len(sentences)} final descriptions into vectors...")

	    location_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
	    location_ids = df_final['id'].to_numpy()

	    np.save('location_embeddings.npy', location_embeddings)
	    np.save('location_ids.npy', location_ids)

	    print("\nEmbeddings from Gemini descriptions generated successfully!")
	    print(f"Embeddings matrix shape: {location_embeddings.shape}")

	if __name__ == '__main__':
	    # Credentials are read from the environment so they are never committed.
	    # SECURITY NOTE: an API key and a database password were previously
	    # hard-coded in this file; treat both as compromised and rotate them.
	    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")

	    if not GEMINI_API_KEY or GEMINI_API_KEY == "YOUR_API_KEY_HERE":
	        print("ERROR: Please set the GEMINI_API_KEY environment variable to your actual Gemini API key.")
	    else:
	        # Non-secret settings keep their previous values as defaults.
	        db_connection_params = {
	            "host": os.environ.get("DB_HOST", "localhost"),
	            "database": os.environ.get("DB_NAME", "recommendation_locations"),
	            "user": os.environ.get("DB_USER", "postgres"),
	            "port": os.environ.get("DB_PORT", "5432"),
	        }
	        # Only pass a password when one is configured.
	        db_password = os.environ.get("DB_PASSWORD")
	        if db_password:
	            db_connection_params["password"] = db_password
	        generate_and_save_embeddings(db_connection_params, GEMINI_API_KEY)