Spaces:

Calcifer0323
/

matching

Sleeping

App Files Files Community

matching / index_all_properties.py

Calcifer0323

Fix: Update to RoSBERTa model (1024 dims), remove half precision, increase timeout

93cd57d 3 months ago

raw

history blame contribute delete

9.04 kB

	"""
	Index every real-estate property from the database through the HuggingFace Spaces service.

	Usage:
	    python index_all_properties.py        # interactive mode (asks for confirmation)
	    python index_all_properties.py --yes  # auto-confirm (the short flag -y also works)
	"""
	import json
	import os
	import sys
	import time
	from typing import Any, Dict, List, Optional

	import psycopg2
	import requests

	# Database connection settings.
	#
	# Every value can be overridden through an environment variable. The password
	# is read ONLY from the environment (DB_PASSWORD): credentials must never be
	# committed to the repository. The password that used to be hard-coded here
	# has been published and has to be rotated on the database server.
	DB_CONFIG = {
	    'host': os.environ.get('DB_HOST', 'dpg-d5ht8vi4d50c739akh2g-a.virginia-postgres.render.com'),
	    'port': int(os.environ.get('DB_PORT', '5432')),
	    'database': os.environ.get('DB_NAME', 'lead_exchange_bk'),
	    'user': os.environ.get('DB_USER', 'lead_exchange_bk_user'),
	    # None when DB_PASSWORD is unset. NOTE(review): psycopg2 is expected to drop
	    # None-valued keyword arguments, leaving libpq to use its own password
	    # sources (PGPASSWORD, ~/.pgpass) - confirm against the installed version.
	    'password': os.environ.get('DB_PASSWORD'),
	}

	# Base URL of the service on HuggingFace Spaces. A trailing slash is removed
	# because endpoint paths such as "/batch" are appended to this value.
	HF_SERVICE_URL = os.environ.get(
	    'HF_SERVICE_URL', 'https://calcifer0323-matching.hf.space'
	).rstrip('/')

	def get_properties_from_db() -> List[Dict[str, Any]]:
	    """Fetch every row of the ``properties`` table as a list of dicts.

	    Connects with the module-level ``DB_CONFIG``, selects the columns listed
	    in ``columns`` ordered by ``created_at`` descending, and maps each row to
	    a dict keyed by column name.

	    Returns:
	        One dict per row; an empty list when the table has no rows.

	    Raises:
	        psycopg2.Error: if connecting or querying fails. The connection is
	            closed in that case as well.
	    """
	    print("📥 Fetching properties from database...")

	    columns = (
	        'property_id', 'title', 'description', 'address', 'property_type',
	        'area', 'price', 'rooms', 'status',
	    )
	    # The SELECT list is generated from `columns` so the dict keys can never
	    # drift out of sync with the queried columns. Only the constant names
	    # above are interpolated - no external input reaches the SQL text.
	    query = (
	        f"SELECT {', '.join(columns)} "
	        "FROM properties "
	        "ORDER BY created_at DESC"
	    )

	    conn = psycopg2.connect(**DB_CONFIG)
	    try:
	        # The cursor context manager closes the cursor on exit. The connection
	        # is closed explicitly in `finally`, because `with conn:` in psycopg2
	        # only ends the transaction and leaves the connection open.
	        with conn.cursor() as cursor:
	            cursor.execute(query)
	            properties = [dict(zip(columns, row)) for row in cursor.fetchall()]
	    finally:
	        conn.close()

	    print(f"✅ Fetched {len(properties)} properties")
	    return properties

	def prepare_text_for_property(prop: Dict[str, Any]) -> str:
	    """Compose the text from which a property's embedding is generated.

	    Title, description and address each become a labelled sentence. Property
	    type, rooms, area and price are gathered into one trailing
	    "Характеристики" sentence. A field that is missing or falsy (None, empty
	    string, 0) is left out. Sentences are joined with ". ", so the result is
	    an empty string when no field is usable.
	    """
	    def _specs():
	        # Structured attributes, yielded in their fixed output order.
	        kind = prop.get('property_type')
	        if kind:
	            yield f"тип: {kind}"
	        room_count = prop.get('rooms')
	        if room_count:
	            yield f"комнат: {room_count}"
	        square = prop.get('area')
	        if square:
	            yield f"площадь: {square} м²"
	        cost = prop.get('price')
	        if cost:
	            # "," inserts thousands separators into the number.
	            yield f"цена: {cost:,} ₽"

	    def _sentences():
	        heading = prop.get('title')
	        if heading:
	            yield f"Название: {heading}"
	        body = prop.get('description')
	        if body:
	            yield f"Описание: {body}"
	        location = prop.get('address')
	        if location:
	            yield f"Адрес: {location}"
	        specs = list(_specs())
	        if specs:
	            yield "Характеристики: " + ", ".join(specs)

	    return ". ".join(_sentences())

	def index_batch(
	    properties: List[Dict[str, Any]],
	    batch_size: int = 20,
	    timeout: float = 120,
	) -> Optional[Dict[str, Any]]:
	    """Send one batch of properties to the service's ``/batch`` endpoint.

	    Args:
	        properties: Property dicts. ``property_id`` is required; ``title``,
	            ``description``, ``address``, ``price``, ``rooms`` and ``area``
	            are optional and may be None.
	        batch_size: Not used by this function - the whole ``properties`` list
	            is sent in a single request. Kept so existing callers keep working.
	        timeout: Seconds to wait for the HTTP response. The original code
	            used 120 and noted that the server-side timeout is 30 seconds.

	    Returns:
	        The parsed JSON body when the service answers with HTTP 200, otherwise
	        None (non-200 status, non-JSON body, timeout, connection or other
	        request error). Diagnostics are printed in every failure case.
	    """
	    def _number(value: Any, cast):
	        # NULL columns arrive as None and are forwarded as JSON null. A genuine
	        # zero is preserved; a plain truthiness check would turn 0 into null.
	        if value is None or value == '':
	            return None
	        return cast(value)

	    items = [
	        {
	            "entity_id": str(prop['property_id']),
	            # `prop.get(key, '')` is not enough for NULL columns: the key
	            # exists with the value None, so fall back to '' explicitly.
	            "title": prop.get('title') or '',
	            "description": prop.get('description') or '',
	            "price": _number(prop.get('price'), float),
	            "rooms": _number(prop.get('rooms'), int),
	            "area": _number(prop.get('area'), float),
	            "address": prop.get('address') or '',
	            "district": "",  # could be extracted from the address if needed
	        }
	        for prop in properties
	    ]
	    payload = {"items": items}
	    url = f"{HF_SERVICE_URL}/batch"

	    try:
	        print(f" 📤 Sending batch of {len(items)} items to {url}")
	        # Size of the JSON-encoded payload, not of the dict's repr.
	        print(f" Payload size: {len(json.dumps(payload).encode('utf-8'))} bytes")

	        response = requests.post(url, json=payload, timeout=timeout)
	        print(f" Response status: {response.status_code}")

	        if response.status_code == 200:
	            try:
	                return response.json()
	            except ValueError:
	                # JSON decode errors are ValueError subclasses.
	                print(" ❌ Error: response body is not valid JSON")
	                print(f" Response: {response.text[:500]}")
	                return None

	        print(f" ❌ Error: {response.status_code}")
	        print(f" Response: {response.text[:500]}")

	        # Try to show a more detailed, structured error description.
	        try:
	            print(f" Detail: {response.json()}")
	        except ValueError:
	            pass  # body is not JSON; the raw text above is all there is

	        return None

	    except requests.exceptions.Timeout:
	        print(f" ❌ Request timeout ({timeout} seconds)")
	        return None
	    except requests.exceptions.ConnectionError as e:
	        print(f" ❌ Connection error: {e}")
	        return None
	    except requests.exceptions.RequestException as e:
	        print(f" ❌ Request failed: {e}")
	        return None

	def save_embeddings_to_file(results: List[Dict], filename: str = "generated_embeddings.json"):
	    """Write the indexing results to a JSON file so they can be inspected.

	    The file is UTF-8 encoded, keeps non-ASCII characters verbatim and uses a
	    two-space indent. An existing file with the same name is overwritten.
	    """
	    import json

	    with open(filename, mode='w', encoding='utf-8') as out:
	        out.write(json.dumps(results, ensure_ascii=False, indent=2))

	    print(f"💾 Saved embeddings to {filename}")

	def _confirm_indexing(total: int, auto_confirm: bool) -> bool:
	    """Announce the target service and return True if indexing should proceed."""
	    if auto_confirm:
	        print(f"\n✅ Auto-confirming indexing of {total} properties")
	    else:
	        print(f"\n🚀 Ready to index {total} properties")
	    print(f" Service: {HF_SERVICE_URL}")
	    print(" Endpoint: /batch")

	    if auto_confirm:
	        return True

	    try:
	        answer = input("\nProceed? (yes/y/no/n): ")
	    except EOFError:
	        # stdin is closed or not interactive, so nobody can answer the prompt.
	        print("\n❌ Error: EOF when reading input")
	        print("Run with --yes flag to auto-confirm: python index_all_properties.py --yes")
	        return False

	    if answer.strip().lower() not in ('yes', 'y'):
	        print("Cancelled by user")
	        return False
	    return True


	def main():
	    """Run the full indexing workflow.

	    Steps: load all properties from the database, show a sample, ask for
	    confirmation (skipped with ``--yes`` / ``-y``), send the properties to
	    the service in batches with a pause between requests, save the collected
	    responses to ``indexing_results.json`` and print a summary.

	    The responses collected so far are written to disk even when the batch
	    loop is aborted by an exception or Ctrl-C, so a long run is not lost.
	    """
	    batch_size = 20  # reduced from 50 to 20 (a batch takes ~30 s on the server)
	    pause_seconds = 10  # delay between consecutive batches
	    results_file = "indexing_results.json"

	    print("=" * 70)
	    print("INDEXING PROPERTIES THROUGH HUGGINGFACE SPACES")
	    print("=" * 70)

	    # Command-line flags
	    auto_confirm = '--yes' in sys.argv or '-y' in sys.argv
	    if auto_confirm:
	        print("🤖 Auto-confirm mode enabled")

	    # 1. Load the properties from the database
	    properties = get_properties_from_db()
	    if not properties:
	        print("⚠️ No properties found in database")
	        return

	    total = len(properties)
	    print(f"\n📊 Total properties to index: {total}")

	    # Show one example
	    print("\n📄 Sample property:")
	    sample = properties[0]
	    print(f" ID: {sample['property_id']}")
	    # A NULL title is stored as None under an existing key, so the default of
	    # `.get('title', 'N/A')` would never be used; fall back explicitly.
	    print(f" Title: {sample.get('title') or 'N/A'}")
	    print(f" Text preview: {prepare_text_for_property(sample)[:150]}...")

	    # Confirmation
	    if not _confirm_indexing(total, auto_confirm):
	        return

	    # 2. Index in batches
	    total_batches = (total + batch_size - 1) // batch_size  # ceiling division

	    print(f"\n📦 Processing {total_batches} batches (batch size: {batch_size})")
	    print(" ⏱️ Each batch will take ~30-40 seconds to process")
	    print(f" 📊 Total time estimate: ~{(total_batches * 35) // 60} minutes")

	    all_results = []
	    successful = 0
	    failed = 0

	    try:
	        for batch_num, start in enumerate(range(0, total, batch_size), start=1):
	            batch = properties[start:start + batch_size]

	            print(f"\n🔄 Batch {batch_num}/{total_batches} ({len(batch)} items)")

	            result = index_batch(batch, batch_size)

	            if result:
	                all_results.append(result)
	                batch_successful = result.get('successful', 0)
	                batch_failed = result.get('failed', 0)
	                successful += batch_successful
	                failed += batch_failed

	                print(f" ✅ Success: {batch_successful}/{len(batch)}")
	                if batch_failed > 0:
	                    print(f" ⚠️ Failed: {batch_failed}")
	            else:
	                print(" ❌ Batch failed completely")
	                failed += len(batch)

	            # Pause between batches, but not after the last one
	            if batch_num < total_batches:
	                print(f" ⏳ Waiting {pause_seconds} seconds before next batch...")
	                time.sleep(pause_seconds)
	    finally:
	        # 3. Save the results - also on an aborted run, so the responses
	        # collected so far are not lost.
	        if all_results:
	            save_embeddings_to_file(all_results, results_file)

	    # 4. Summary
	    print("\n" + "=" * 70)
	    print("INDEXING COMPLETE")
	    print("=" * 70)
	    print(f"✅ Successfully indexed: {successful}/{total}")
	    print(f"❌ Failed: {failed}/{total}")

	    if successful > 0:
	        print("\n💡 Note: Embeddings were generated on HuggingFace Spaces")
	        print(f" Results saved to: {results_file}")
	        print(" Backend should fetch these embeddings and store in DB")

	    print("\n" + "=" * 70)

	# Script entry point: run the indexing workflow only on direct execution,
	# not when this module is imported.
	if __name__ == "__main__":
	    main()