Upload 4 files
- data_processing/build_index.py +45 -0
- data_processing/excel_to_db.py +101 -0
- data_processing/generate_embeddings.py +150 -0
- data_processing/loc_data.xlsx +0 -0
data_processing/build_index.py
ADDED
@@ -0,0 +1,45 @@
+import faiss
+import numpy as np
+
+def build_and_save_index(embedding_file, index_file):
+    """
+    Loads embeddings, NORMALIZES them, builds a FAISS IndexFlatIP index,
+    and saves the index to disk.
+    """
+    try:
+        # Load the embeddings from the .npy file
+        print(f"Loading embeddings from '{embedding_file}'...")
+        embeddings = np.load(embedding_file).astype('float32')
+        print(f"Embeddings loaded. Shape: {embeddings.shape}")
+
+        # --- FIX 1: NORMALIZE THE EMBEDDINGS ---
+        # This crucial step scales all vectors to a unit length of 1.
+        print("Normalizing embeddings to unit length...")
+        faiss.normalize_L2(embeddings)
+
+        embedding_dimension = embeddings.shape[1]
+
+        # --- FIX 2: USE IndexFlatIP FOR COSINE SIMILARITY ---
+        # IndexFlatIP (Inner Product) is the correct index for comparing normalized text vectors.
+        print(f"Building FAISS IndexFlatIP with dimension {embedding_dimension}...")
+        index = faiss.IndexFlatIP(embedding_dimension)
+
+        # Add the normalized embeddings to the index
+        index.add(embeddings)
+        print(f"Successfully added {index.ntotal} vectors to the index.")
+
+        # Save the Index
+        print(f"Saving index to '{index_file}'...")
+        faiss.write_index(index, index_file)
+        print("Index saved successfully!")
+
+    except FileNotFoundError:
+        print(f"ERROR: The file '{embedding_file}' was not found.")
+    except Exception as e:
+        print(f"AN UNEXPECTED ERROR OCCURRED: {e}")
+
+if __name__ == '__main__':
+    embedding_filename = 'location_embeddings.npy'
+    index_filename = 'location_index.faiss'
+
+    build_and_save_index(embedding_filename, index_filename)
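Because the index stores L2-normalized vectors in an IndexFlatIP, inner-product scores at query time are cosine similarities, but only if the query vector is normalized the same way. A minimal query-side sketch, assuming the artifacts this commit produces ('location_index.faiss' here, 'location_ids.npy' from generate_embeddings.py) and the same all-MiniLM-L6-v2 model; the query text and top-5 cutoff are illustrative:

# query_sketch.py -- illustrative only, not part of this commit.
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

index = faiss.read_index('location_index.faiss')  # index written by build_and_save_index()
ids = np.load('location_ids.npy')                 # row position -> database id (saved by generate_embeddings.py)

# The query must be encoded with the SAME model used for the stored vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')
query = model.encode(['a cozy cafe with an ocean view']).astype('float32')

# Normalize the query exactly as the stored vectors were normalized, so the
# inner-product scores returned by search() are cosine similarities.
faiss.normalize_L2(query)

scores, rows = index.search(query, 5)             # top-5 nearest locations
for db_id, score in zip(ids[rows[0]], scores[0]):
    print(f"location id={db_id}, cosine similarity={score:.3f}")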
data_processing/excel_to_db.py
ADDED
@@ -0,0 +1,101 @@
+import pandas as pd
+import psycopg2
+import json
+import numpy as np
+
+def format_operating_hours(time_str):
+    """
+    Transforms a simple time string (e.g., '11:00-22:00') into a
+    structured JSON object for all days of the week.
+    """
+    if not isinstance(time_str, str) or '-' not in time_str:
+        return None
+    schedule = {
+        "monday": time_str, "tuesday": time_str, "wednesday": time_str,
+        "thursday": time_str, "friday": time_str, "saturday": time_str,
+        "sunday": time_str,
+    }
+    return json.dumps(schedule)
+
+def parse_period_dates(period_str):
+    """
+    Parses a date range string like '2025.01.04 - 2025.12.27'
+    and returns a tuple of (start_date, end_date).
+    """
+    if not isinstance(period_str, str) or '-' not in period_str:
+        return None, None
+    try:
+        parts = period_str.split('-')
+        start_date = parts[0].strip().replace('.', '-')
+        end_date = parts[1].strip().replace('.', '-')
+        return start_date, end_date
+    except Exception:
+        return None, None
+
+def load_excel_to_postgres(excel_path, db_params):
+    """
+    Connects to PostgreSQL, reads an Excel file, and inserts the data.
+    """
+    conn = None
+    cur = None
+    try:
+        print(f"Reading data from '{excel_path}'...")
+        df = pd.read_excel(excel_path)
+        df = df.replace({np.nan: None})
+        print("Data read successfully. Preparing for database insertion...")
+
+        conn = psycopg2.connect(**db_params)
+        cur = conn.cursor()
+        print("Successfully connected to the PostgreSQL database.")
+
+        for index, row in df.iterrows():
+            operating_hours_json = format_operating_hours(row.get('time'))
+            start_date, end_date = parse_period_dates(row.get('period'))
+
+            sql = """
+                INSERT INTO locations (
+                    name, address, naver_url, region, primary_category, tags,
+                    price_level, indoor_outdoor, operating_hours,
+                    period_start_date, period_end_date, website, meal_type, geom
+                ) VALUES (
+                    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
+                    ST_SetSRID(ST_MakePoint(%s, %s), 4326)
+                ) ON CONFLICT (name) DO NOTHING;
+            """
+
+            data_tuple = (
+                row.get('name'), row.get('address'), row.get('naver_url'), row.get('region'),
+                row.get('primary_category'), row.get('tags'), row.get('price_level'),
+                row.get('indoor_outdoor'), operating_hours_json, start_date, end_date,
+                row.get('website'), row.get('type'),
+                row.get('longitude'), row.get('latitude')
+            )
+
+            cur.execute(sql, data_tuple)
+
+        conn.commit()
+        print(f"\nSuccessfully processed and inserted {len(df)} rows into the 'locations' table.")
+
+    except FileNotFoundError:
+        print(f"ERROR: The file '{excel_path}' was not found.")
+    except psycopg2.Error as e:
+        print(f"DATABASE ERROR: {e}")
+    except Exception as e:
+        print(f"AN UNEXPECTED ERROR OCCURRED: {e}")
+    finally:
+        if cur is not None:
+            cur.close()
+        if conn is not None:
+            conn.close()
+            print("Database connection closed.")
+
+if __name__ == '__main__':
+    db_connection_params = {
+        "host": "localhost",
+        "database": "recommendation_locations",
+        "user": "postgres",
+        "password": "nafikova03",
+        "port": "5432"
+    }
+    excel_file_path = "loc_data.xlsx"
+    load_excel_to_postgres(excel_file_path, db_connection_params)
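The INSERT above presupposes a locations table that this commit does not include. A plausible schema sketch reconstructed from the column list: the geometry column and SRID 4326 follow from ST_SetSRID(ST_MakePoint(...), 4326), the UNIQUE constraint on name is required by ON CONFLICT (name), and the id column is read later by generate_embeddings.py; the remaining column types are assumptions.

# schema_sketch.py -- a plausible DDL for the table the script writes to; not part of this commit.
import psycopg2

DDL = """
CREATE EXTENSION IF NOT EXISTS postgis;

CREATE TABLE IF NOT EXISTS locations (
    id                SERIAL PRIMARY KEY,      -- read by generate_embeddings.py
    name              TEXT UNIQUE NOT NULL,    -- UNIQUE is required by ON CONFLICT (name)
    address           TEXT,
    naver_url         TEXT,
    region            TEXT,
    primary_category  TEXT,
    tags              TEXT,
    price_level       TEXT,                    -- type assumed; could be an integer scale
    indoor_outdoor    TEXT,
    operating_hours   JSONB,                   -- format_operating_hours() supplies a JSON string
    period_start_date DATE,                    -- parse_period_dates() yields 'YYYY-MM-DD' strings
    period_end_date   DATE,
    website           TEXT,
    meal_type         TEXT,
    geom              GEOMETRY(Point, 4326)    -- matches ST_SetSRID(ST_MakePoint(lon, lat), 4326)
);
"""

def create_schema(db_params):
    # The connection context manager commits the transaction on success.
    with psycopg2.connect(**db_params) as conn:
        with conn.cursor() as cur:
            cur.execute(DDL)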
data_processing/generate_embeddings.py
ADDED
@@ -0,0 +1,150 @@
+import pandas as pd
+import psycopg2
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import sys
+import os
+import time
+import json
+import random
+import google.generativeai as genai
+from google.api_core import exceptions
+
+# --- (The 'generate_gemini_description_json' function remains the same as the last version with exponential backoff) ---
+def generate_gemini_description_json(row, model):
+    # This function with the retry logic is still needed and is unchanged.
+    location_data = {
+        'name': row.get('name', ''),
+        'category': row.get('primary_category', ''),
+        'tags': row.get('tags', '')
+    }
+    location_data = {k: v for k, v in location_data.items() if v}
+    data_string = ", ".join([f"{key}: {value}" for key, value in location_data.items()])
+    prompt = f"""
+    You are a data enrichment specialist. Your task is to generate a JSON object containing a creative, appealing description for a location in Busan, South Korea.
+    **Instructions:**
+    - Use the provided data to create an engaging, fluent paragraph between 40 and 70 words.
+    - Your output MUST be a valid JSON object with a single key: "description".
+    **Input Data:**
+    {data_string}
+    **Output JSON:**
+    """
+    max_retries = 5
+    base_delay = 60
+    for attempt in range(max_retries):
+        try:
+            generation_config = genai.types.GenerationConfig(response_mime_type="application/json")
+            response = model.generate_content(prompt, generation_config=generation_config)
+            response_json = json.loads(response.text)
+            description = response_json['description']
+            return description.strip()
+        except exceptions.ResourceExhausted as e:
+            print(f"  - Rate limit hit for ID {row.get('id', 'N/A')}. Waiting... (Attempt {attempt + 1}/{max_retries})")
+            delay = base_delay * (2 ** attempt) + random.uniform(0, 5)
+            print(f"    ...backing off for {delay:.2f} seconds.")
+            time.sleep(delay)
+        except (json.JSONDecodeError, KeyError) as e:
+            print(f"  - JSON Parsing Error for ID {row.get('id', 'N/A')}: {e}. Retrying...")
+            time.sleep(5)
+        except Exception as e:
+            print(f"  - Unexpected API Error for ID {row.get('id', 'N/A')}: {e}. Retrying with backoff...")
+            delay = base_delay * (2 ** attempt) + random.uniform(0, 5)
+            time.sleep(delay)
+    print(f"  - All retries failed for ID {row.get('id', 'N/A')}. Using fallback.")
+    return f"{row.get('name', '')}. Tags include: {row.get('tags', '')}"
+
+
+# --- THE MAIN ORCHESTRATION SCRIPT ---
+def generate_and_save_embeddings(db_params, api_key):
+    # Configure the Gemini API
+    genai.configure(api_key=api_key)
+    gemini_model = genai.GenerativeModel('gemini-2.5-pro')
+
+    # --- NEW: RESUME LOGIC ---
+    PROGRESS_FILE = 'descriptions_progress.csv'
+
+    # 1. Fetch ALL locations from the database first
+    conn = psycopg2.connect(**db_params)
+    sql_query = "SELECT id, name, tags, primary_category, meal_type FROM locations ORDER BY id;"
+    df_all_locations = pd.read_sql_query(sql_query, conn)
+    conn.close()
+    print(f"Total locations to process: {len(df_all_locations)}")
+
+    # 2. Check for an existing progress file
+    processed_ids = set()
+    if os.path.exists(PROGRESS_FILE):
+        print(f"Found existing progress file: '{PROGRESS_FILE}'. Resuming...")
+        df_progress = pd.read_csv(PROGRESS_FILE)
+        processed_ids = set(df_progress['id'])
+        print(f"{len(processed_ids)} locations already have descriptions.")
+    else:
+        print("No progress file found. Starting a new session.")
+        df_progress = pd.DataFrame(columns=['id', 'description'])
+
+    # 3. Filter out the locations that are already processed
+    df_to_process = df_all_locations[~df_all_locations['id'].isin(processed_ids)]
+    print(f"{len(df_to_process)} locations remaining to be processed.")
+
+    if df_to_process.empty:
+        print("All locations have already been processed.")
+
+    # --- NEW: GRACEFUL SHUTDOWN AND INCREMENTAL SAVING ---
+    newly_processed_data = []
+    try:
+        if not df_to_process.empty:
+            print("\nStarting Gemini description generation...")
+            for index, row in df_to_process.iterrows():
+                print(f"Processing ID: {row['id']}...")
+                description = generate_gemini_description_json(row, gemini_model)
+                newly_processed_data.append({'id': row['id'], 'description': description})
+
+    except KeyboardInterrupt:
+        print("\n--- KeyboardInterrupt detected! Saving progress before exiting. ---")
+    finally:
+        if newly_processed_data:
+            df_new_progress = pd.DataFrame(newly_processed_data)
+            df_combined = pd.concat([df_progress, df_new_progress], ignore_index=True)
+            df_combined.to_csv(PROGRESS_FILE, index=False)
+            print(f"\nSuccessfully saved {len(newly_processed_data)} new descriptions to '{PROGRESS_FILE}'.")
+            df_progress = df_combined
+        else:
+            print("\nNo new descriptions were generated in this session.")
+
+    # --- FINAL EMBEDDING GENERATION (runs after the loop is complete) ---
+    print("\n--- All descriptions are now generated. Proceeding to create embeddings. ---")
+
+    # Merge the final descriptions with the original data to ensure correct order
+    df_final = df_all_locations.merge(df_progress, on='id', how='left')
+
+    # Check for any locations that might have been missed
+    if df_final['description'].isnull().any():
+        print("WARNING: Some locations are missing descriptions. Using fallback.")
+        df_final['description'].fillna("No description available.", inplace=True)
+
+    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
+    sentences = df_final['description'].tolist()
+    print(f"Encoding {len(sentences)} final descriptions into vectors...")
+
+    location_embeddings = sbert_model.encode(sentences, show_progress_bar=True)
+    location_ids = df_final['id'].to_numpy()
+
+    np.save('location_embeddings.npy', location_embeddings)
+    np.save('location_ids.npy', location_ids)
+
+    print("\nEmbeddings from Gemini descriptions generated successfully!")
+    print(f"Embeddings matrix shape: {location_embeddings.shape}")
+
+if __name__ == '__main__':
+    GEMINI_API_KEY = "AIzaSyBwMSL341arzL_FxPzy_DvhDl4Jc46DlaY"
+
+    if GEMINI_API_KEY == "YOUR_API_KEY_HERE":
+        print("ERROR: Please replace 'YOUR_API_KEY_HERE' with your actual Gemini API key.")
+    else:
+        db_connection_params = {
+            "host": "localhost",
+            "database": "recommendation_locations",
+            "user": "postgres",
+            "password": "nafikova03",
+            "port": "5432"
+        }
+        generate_and_save_embeddings(db_connection_params, GEMINI_API_KEY)
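Taken together, the three scripts form a pipeline: excel_to_db.py populates the locations table, generate_embeddings.py turns each row into a Gemini-written description and an SBERT vector, and build_index.py packs the vectors into a searchable FAISS index. A minimal orchestration sketch of that order, assuming the scripts are run from data_processing/ with real credentials filled in; this wrapper itself is not part of the commit:

# run_pipeline_sketch.py -- illustrative run order for the three committed scripts.
from excel_to_db import load_excel_to_postgres
from generate_embeddings import generate_and_save_embeddings
from build_index import build_and_save_index

db_params = {
    "host": "localhost",
    "database": "recommendation_locations",
    "user": "postgres",
    "password": "...",  # fill in real credentials
    "port": "5432",
}

# 1. Excel rows -> PostgreSQL 'locations' table (with PostGIS points)
load_excel_to_postgres("loc_data.xlsx", db_params)

# 2. locations -> Gemini descriptions -> location_embeddings.npy / location_ids.npy
generate_and_save_embeddings(db_params, api_key="...")  # fill in a real Gemini key

# 3. location_embeddings.npy -> location_index.faiss
build_and_save_index("location_embeddings.npy", "location_index.faiss")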
data_processing/loc_data.xlsx
ADDED
Binary file (74.1 kB)