Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import psycopg2 | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| import sys | |
| import os | |
| import time | |
| import json | |
| import random | |
| import google.generativeai as genai | |
| from google.api_core import exceptions | |
| # --- (The 'generate_gemini_description_json' function remains the same as the last version with exponential backoff) --- | |
| def generate_gemini_description_json(row, model): | |
| # This function with the retry logic is still needed and is unchanged. | |
| location_data = { | |
| 'name': row.get('name', ''), | |
| 'category': row.get('primary_category', ''), | |
| 'tags': row.get('tags', '') | |
| } | |
| location_data = {k: v for k, v in location_data.items() if v} | |
| data_string = ", ".join([f"{key}: {value}" for key, value in location_data.items()]) | |
| prompt = f""" | |
| You are a data enrichment specialist. Your task is to generate a JSON object containing a creative, appealing description for a location in Busan, South Korea. | |
| **Instructions:** | |
| - Use the provided data to create an engaging, fluent paragraph between 40 and 70 words. | |
| - Your output MUST be a valid JSON object with a single key: "description". | |
| **Input Data:** | |
| {data_string} | |
| **Output JSON:** | |
| """ | |
| max_retries = 5 | |
| base_delay = 60 | |
| for attempt in range(max_retries): | |
| try: | |
| generation_config = genai.types.GenerationConfig(response_mime_type="application/json") | |
| response = model.generate_content(prompt, generation_config=generation_config) | |
| response_json = json.loads(response.text) | |
| description = response_json['description'] | |
| return description.strip() | |
| except exceptions.ResourceExhausted as e: | |
| print(f" - Rate limit hit for ID {row.get('id', 'N/A')}. Waiting... (Attempt {attempt + 1}/{max_retries})") | |
| delay = base_delay * (2 ** attempt) + random.uniform(0, 5) | |
| print(f" ...backing off for {delay:.2f} seconds.") | |
| time.sleep(delay) | |
| except (json.JSONDecodeError, KeyError) as e: | |
| print(f" - JSON Parsing Error for ID {row.get('id', 'N/A')}: {e}. Retrying...") | |
| time.sleep(5) | |
| except Exception as e: | |
| print(f" - Unexpected API Error for ID {row.get('id', 'N/A')}: {e}. Retrying with backoff...") | |
| delay = base_delay * (2 ** attempt) + random.uniform(0, 5) | |
| time.sleep(delay) | |
| print(f" - All retries failed for ID {row.get('id', 'N/A')}. Using fallback.") | |
| return f"{row.get('name', '')}. Tags include: {row.get('tags', '')}" | |
| # --- THE MAIN ORCHESTRATION SCRIPT --- | |
| def generate_and_save_embeddings(db_params, api_key): | |
| # Configure the Gemini API | |
| genai.configure(api_key=api_key) | |
| gemini_model = genai.GenerativeModel('gemini-2.5-pro') | |
| # --- NEW: RESUME LOGIC --- | |
| PROGRESS_FILE = 'descriptions_progress.csv' | |
| # 1. Fetch ALL locations from the database first | |
| conn = psycopg2.connect(**db_params) | |
| sql_query = "SELECT id, name, tags, primary_category, meal_type FROM locations ORDER BY id;" | |
| df_all_locations = pd.read_sql_query(sql_query, conn) | |
| conn.close() | |
| print(f"Total locations to process: {len(df_all_locations)}") | |
| # 2. Check for an existing progress file | |
| processed_ids = set() | |
| if os.path.exists(PROGRESS_FILE): | |
| print(f"Found existing progress file: '{PROGRESS_FILE}'. Resuming...") | |
| df_progress = pd.read_csv(PROGRESS_FILE) | |
| processed_ids = set(df_progress['id']) | |
| print(f"{len(processed_ids)} locations already have descriptions.") | |
| else: | |
| print("No progress file found. Starting a new session.") | |
| df_progress = pd.DataFrame(columns=['id', 'description']) | |
| # 3. Filter out the locations that are already processed | |
| df_to_process = df_all_locations[~df_all_locations['id'].isin(processed_ids)] | |
| print(f"{len(df_to_process)} locations remaining to be processed.") | |
| if df_to_process.empty: | |
| print("All locations have already been processed.") | |
| # --- NEW: GRACEFUL SHUTDOWN AND INCREMENTAL SAVING --- | |
| newly_processed_data = [] | |
| try: | |
| if not df_to_process.empty: | |
| print("\nStarting Gemini description generation...") | |
| for index, row in df_to_process.iterrows(): | |
| print(f"Processing ID: {row['id']}...") | |
| description = generate_gemini_description_json(row, gemini_model) | |
| newly_processed_data.append({'id': row['id'], 'description': description}) | |
| except KeyboardInterrupt: | |
| print("\n--- KeyboardInterrupt detected! Saving progress before exiting. ---") | |
| finally: | |
| if newly_processed_data: | |
| df_new_progress = pd.DataFrame(newly_processed_data) | |
| df_combined = pd.concat([df_progress, df_new_progress], ignore_index=True) | |
| df_combined.to_csv(PROGRESS_FILE, index=False) | |
| print(f"\nSuccessfully saved {len(newly_processed_data)} new descriptions to '{PROGRESS_FILE}'.") | |
| df_progress = df_combined | |
| else: | |
| print("\nNo new descriptions were generated in this session.") | |
| # --- FINAL EMBEDDING GENERATION (runs after the loop is complete) --- | |
| print("\n--- All descriptions are now generated. Proceeding to create embeddings. ---") | |
| # Merge the final descriptions with the original data to ensure correct order | |
| df_final = df_all_locations.merge(df_progress, on='id', how='left') | |
| # Check for any locations that might have been missed | |
| if df_final['description'].isnull().any(): | |
| print("WARNING: Some locations are missing descriptions. Using fallback.") | |
| df_final['description'].fillna("No description available.", inplace=True) | |
| sbert_model = SentenceTransformer('all-MiniLM-L6-v2') | |
| sentences = df_final['description'].tolist() | |
| print(f"Encoding {len(sentences)} final descriptions into vectors...") | |
| location_embeddings = sbert_model.encode(sentences, show_progress_bar=True) | |
| location_ids = df_final['id'].to_numpy() | |
| np.save('location_embeddings.npy', location_embeddings) | |
| np.save('location_ids.npy', location_ids) | |
| print("\nEmbeddings from Gemini descriptions generated successfully!") | |
| print(f"Embeddings matrix shape: {location_embeddings.shape}") | |
| if __name__ == '__main__': | |
| GEMINI_API_KEY = "AIzaSyBwMSL341arzL_FxPzy_DvhDl4Jc46DlaY" | |
| if GEMINI_API_KEY == "YOUR_API_KEY_HERE": | |
| print("ERROR: Please replace 'YOUR_API_KEY_HERE' with your actual Gemini API key.") | |
| else: | |
| db_connection_params = { | |
| "host": "localhost", | |
| "database": "recommendation_locations", | |
| "user": "postgres", | |
| "password": "nafikova03", | |
| "port": "5432" | |
| } | |
| generate_and_save_embeddings(db_connection_params, GEMINI_API_KEY) |