matching / index_all_properties.py
Calcifer0323's picture
Fix: Update to RoSBERTa model (1024 dims), remove half precision, increase timeout
93cd57d
"""
Скрипт для индексации всех объектов недвижимости через HuggingFace Spaces сервис
Usage:
python index_all_properties.py # Интерактивный режим
python index_all_properties.py --yes # Автоподтверждение
"""
import json
import os
import sys
import time
from typing import Any, Dict, List, Optional

import psycopg2
import requests
# Database connection settings. Every value can be overridden through an
# environment variable; the literals are only fallbacks kept so that existing
# invocations keep working unchanged.
# SECURITY NOTE(review): the fallback password is a real credential committed
# to source control. Rotate it, supply LEAD_EXCHANGE_DB_PASSWORD through the
# environment, and then drop the literal.
DB_CONFIG = {
    'host': os.environ.get(
        'LEAD_EXCHANGE_DB_HOST',
        'dpg-d5ht8vi4d50c739akh2g-a.virginia-postgres.render.com',
    ),
    'port': int(os.environ.get('LEAD_EXCHANGE_DB_PORT', '5432')),
    'database': os.environ.get('LEAD_EXCHANGE_DB_NAME', 'lead_exchange_bk'),
    'user': os.environ.get('LEAD_EXCHANGE_DB_USER', 'lead_exchange_bk_user'),
    'password': os.environ.get(
        'LEAD_EXCHANGE_DB_PASSWORD',
        '8m2gtTRBW0iAr7nY2Aadzz0VcZBEVKYM',
    ),
}

# Base URL of the service hosted on HuggingFace Spaces
# (overridable through the HF_SERVICE_URL environment variable).
HF_SERVICE_URL = os.environ.get(
    "HF_SERVICE_URL", "https://calcifer0323-matching.hf.space"
)
def get_properties_from_db() -> List[Dict[str, Any]]:
    """Fetch every row of the ``properties`` table, newest first.

    Returns:
        One dict per row keyed by column name (``property_id``, ``title``,
        ``description``, ``address``, ``property_type``, ``area``, ``price``,
        ``rooms``, ``status``), ordered by ``created_at`` descending.

    Raises:
        psycopg2.Error: If connecting or querying fails. The cursor and the
            connection are closed before the error propagates.
    """
    # Single source of truth for both the SELECT list and the dict keys, so
    # the two cannot drift apart.
    columns = ('property_id', 'title', 'description', 'address', 'property_type',
               'area', 'price', 'rooms', 'status')
    # The column names are the constants above, never user input, so
    # interpolating them into the statement is safe.
    query = f"SELECT {', '.join(columns)} FROM properties ORDER BY created_at DESC"

    print("📥 Fetching properties from database...")
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor; the connection is
        # closed explicitly in ``finally`` so it is released on errors too.
        with conn.cursor() as cursor:
            cursor.execute(query)
            properties = [dict(zip(columns, row)) for row in cursor.fetchall()]
    finally:
        conn.close()

    print(f"✅ Fetched {len(properties)} properties")
    return properties
def prepare_text_for_property(prop: Dict[str, Any]) -> str:
    """Build the text of a property that is used for embedding generation.

    Labelled title, description and address segments come first, followed by
    a single "Характеристики" segment that lists type, rooms, area and price.
    Fields that are missing or falsy are skipped, and the segments are joined
    with ". ".
    """
    headline_templates = (
        ('title', "Название: {}"),
        ('description', "Описание: {}"),
        ('address', "Адрес: {}"),
    )
    # Structured attributes; the price is rendered with a thousands separator.
    detail_templates = (
        ('property_type', "тип: {}"),
        ('rooms', "комнат: {}"),
        ('area', "площадь: {} м²"),
        ('price', "цена: {:,} ₽"),
    )

    def render(templates):
        # Keep only the fields that carry a truthy value, in template order.
        return [tpl.format(prop[key]) for key, tpl in templates if prop.get(key)]

    segments = render(headline_templates)
    specs = render(detail_templates)
    if specs:
        segments.append("Характеристики: " + ", ".join(specs))
    return ". ".join(segments)
def index_batch(
    properties: List[Dict[str, Any]],
    batch_size: int = 20,
    timeout: float = 120,
) -> Optional[Dict[str, Any]]:
    """Send one batch of properties to the service's ``/batch`` endpoint.

    Args:
        properties: Property dicts. ``property_id`` is required; ``title``,
            ``description``, ``price``, ``rooms``, ``area`` and ``address``
            are read when present.
        batch_size: Not used by this function; kept for backward
            compatibility. The whole ``properties`` list is sent in a single
            request.
        timeout: Seconds to wait for the HTTP response. The default of 120
            (2 minutes) replaced an earlier 5-minute value; the original note
            says the server-side timeout is 30 s.

    Returns:
        The decoded JSON body when the service answers with HTTP 200, or
        ``None`` on any failure (non-200 status, a body that is not valid
        JSON, timeout, connection error or another request error). Failures
        are printed rather than raised.
    """
    # Build the items expected by the /batch endpoint.
    items = [
        {
            "entity_id": str(prop['property_id']),
            # ``or ''`` rather than a ``get`` default: a key that is present
            # with a None value (e.g. a NULL column) would bypass the default
            # and be sent as null.
            "title": prop.get('title') or '',
            "description": prop.get('description') or '',
            "price": float(prop['price']) if prop.get('price') else None,
            "rooms": int(prop['rooms']) if prop.get('rooms') else None,
            "area": float(prop['area']) if prop.get('area') else None,
            "address": prop.get('address') or '',
            "district": "",  # could be extracted from the address if needed
        }
        for prop in properties
    ]
    payload = {"items": items}
    url = f"{HF_SERVICE_URL}/batch"

    try:
        print(f" 📤 Sending batch of {len(items)} items to {url}")
        # Measure the JSON-encoded payload; the length of the Python repr
        # is not a byte count.
        print(f" Payload size: {len(json.dumps(payload).encode('utf-8'))} bytes")
        response = requests.post(url, json=payload, timeout=timeout)
        print(f" Response status: {response.status_code}")

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                # Every JSON decode error raised by ``Response.json`` is a
                # ValueError, regardless of the installed requests version.
                print(f" ❌ Invalid JSON in response: {e}")
                return None

        print(f" ❌ Error: {response.status_code}")
        print(f" Response: {response.text[:500]}")
        # Try to show a more detailed, structured error description.
        try:
            print(f" Detail: {response.json()}")
        except ValueError:
            pass  # body is not JSON; the raw text was already printed above
        return None
    except requests.exceptions.Timeout:
        print(f" ❌ Request timeout ({timeout} seconds)")
        return None
    except requests.exceptions.ConnectionError as e:
        print(f" ❌ Connection error: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Request failed: {e}")
        return None
def save_embeddings_to_file(results: List[Dict], filename: str = "generated_embeddings.json"):
    """Dump the indexing results to a JSON file so they can be inspected.

    The file is written as UTF-8 with non-ASCII characters kept as-is and a
    two-space indent; the target path is printed afterwards.
    """
    import json

    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, **dump_options)
    print(f"💾 Saved embeddings to {filename}")
def main() -> None:
    """Index every property from the database through HuggingFace Spaces.

    Steps:
        1. Fetch all properties from the database and show a sample.
        2. Ask for confirmation (skipped when ``--yes`` or ``-y`` is given).
        3. Send the properties to the ``/batch`` endpoint in fixed-size
           batches, pausing between consecutive batches.
        4. Save the raw batch responses to ``indexing_results.json`` and
           print a summary.
    """
    batch_size = 20  # reduced from 50 to 20 (~30 s of server processing per batch)
    seconds_per_batch = 35  # within the ~30-40 s per batch quoted below
    pause_seconds = 10  # delay between consecutive batches
    results_file = "indexing_results.json"

    print("=" * 70)
    print("INDEXING PROPERTIES THROUGH HUGGINGFACE SPACES")
    print("=" * 70)

    # Check the command-line flags.
    auto_confirm = '--yes' in sys.argv or '-y' in sys.argv
    if auto_confirm:
        print("🤖 Auto-confirm mode enabled")

    # 1. Fetch the properties from the database.
    properties = get_properties_from_db()
    if not properties:
        print("⚠️ No properties found in database")
        return
    print(f"\n📊 Total properties to index: {len(properties)}")

    # Show a sample.
    print("\n📄 Sample property:")
    sample = properties[0]
    print(f" ID: {sample['property_id']}")
    print(f" Title: {sample.get('title', 'N/A')}")
    print(f" Text preview: {prepare_text_for_property(sample)[:150]}...")

    # Confirmation.
    if not auto_confirm:
        print(f"\n🚀 Ready to index {len(properties)} properties")
        print(f" Service: {HF_SERVICE_URL}")
        print(" Endpoint: /batch")
        try:
            answer = input("\nProceed? (yes/y/no/n): ")
            # Strip so that stray whitespace around "y"/"yes" is not
            # treated as a refusal.
            if answer.strip().lower() not in ('yes', 'y'):
                print("Cancelled by user")
                return
        except EOFError:
            print("\n❌ Error: EOF when reading input")
            print("Run with --yes flag to auto-confirm: python index_all_properties.py --yes")
            return
    else:
        print(f"\n✅ Auto-confirming indexing of {len(properties)} properties")
        print(f" Service: {HF_SERVICE_URL}")
        print(" Endpoint: /batch")

    # 2. Index in batches.
    total_batches = (len(properties) + batch_size - 1) // batch_size
    # The estimate includes the pauses between batches, not only the
    # processing time (there is one pause fewer than there are batches).
    estimated_seconds = (
        total_batches * seconds_per_batch + (total_batches - 1) * pause_seconds
    )
    print(f"\n📦 Processing {total_batches} batches (batch size: {batch_size})")
    print(" ⏱️ Each batch will take ~30-40 seconds to process")
    print(f" 📊 Total time estimate: ~{estimated_seconds // 60} minutes")

    all_results = []
    successful = 0
    failed = 0
    starts = range(0, len(properties), batch_size)
    for batch_num, start in enumerate(starts, start=1):
        batch = properties[start:start + batch_size]
        print(f"\n🔄 Batch {batch_num}/{total_batches} ({len(batch)} items)")
        result = index_batch(batch, batch_size)
        if result:
            all_results.append(result)
            batch_successful = result.get('successful', 0)
            batch_failed = result.get('failed', 0)
            successful += batch_successful
            failed += batch_failed
            print(f" ✅ Success: {batch_successful}/{len(batch)}")
            if batch_failed > 0:
                print(f" ⚠️ Failed: {batch_failed}")
        else:
            print(" ❌ Batch failed completely")
            failed += len(batch)

        # Pause between batches (not after the last one).
        if batch_num < total_batches:
            print(f" ⏳ Waiting {pause_seconds} seconds before next batch...")
            time.sleep(pause_seconds)

    # 3. Save the results.
    if all_results:
        save_embeddings_to_file(all_results, results_file)

    # 4. Summary.
    print("\n" + "=" * 70)
    print("INDEXING COMPLETE")
    print("=" * 70)
    print(f"✅ Successfully indexed: {successful}/{len(properties)}")
    print(f"❌ Failed: {failed}/{len(properties)}")
    if successful > 0:
        print("\n💡 Note: Embeddings were generated on HuggingFace Spaces")
        print(f" Results saved to: {results_file}")
        print(" Backend should fetch these embeddings and store in DB")
    print("\n" + "=" * 70)
# Run the indexing workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()