Spaces:
Sleeping
Sleeping
File size: 6,693 Bytes
93cd57d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
import requests
import json
import re
import sqlite3
import time
# Читаем SQL файл
with open('pars_samolet.sql', 'r', encoding='utf-8') as f:
sql_content = f.read()
# Извлекаем CREATE TABLE
create_match = re.search(r'CREATE TABLE[^;]+;', sql_content, re.DOTALL)
if not create_match:
raise ValueError("CREATE TABLE not found")
create_stmt = create_match.group(0)
# Извлекаем все INSERT
insert_pattern = r'INSERT INTO mytable\([^)]+\) VALUES\s*\([^;]+\);'
inserts = re.findall(insert_pattern, sql_content, re.DOTALL)
# Создаем временную базу для парсинга
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
cursor.execute(create_stmt)
# Функция для экранирования SQL-строк
def escape_sql_value(value):
if value is None:
return 'NULL'
# Заменяем одинарные кавычки на две одинарные кавычки
return str(value).replace("'", "''")
# Выполняем INSERT с экранированием
for insert in inserts:
try:
cursor.execute(insert)
except sqlite3.OperationalError as e:
# Если есть ошибка, пытаемся очистить строку от проблемных символов
# Находим VALUES часть
match = re.search(r'VALUES\s*\((.+)\);', insert, re.DOTALL)
if match:
values_str = match.group(1)
# Разбиваем значения по запятым, учитывая строки в кавычках
values = []
current = ''
in_string = False
for char in values_str:
if char == "'" and (len(current) == 0 or current[-1] != '\\'):
in_string = not in_string
current += char
if not in_string and char == ',':
values.append(current[:-1].strip())
current = ''
if current:
values.append(current.strip())
# Экранируем каждое строковое значение
cleaned_values = []
for val in values:
if val.upper() == 'NULL':
cleaned_values.append('NULL')
elif val.startswith("'") and val.endswith("'"):
# Это строковое значение
inner = val[1:-1]
# Удаляем лишние переводы строк и табуляции
inner = inner.replace('\n', ' ').replace('\r', ' ')
inner = ' '.join(inner.split()) # Удаляем лишние пробелы
inner = escape_sql_value(inner)
cleaned_values.append(f"'{inner}'")
else:
# Числовое или другое значение
cleaned_values.append(val)
# Собираем новый INSERT
insert_start = insert[:match.start(1)]
insert_end = insert[match.end(1):]
new_insert = insert_start + ', '.join(cleaned_values) + insert_end
try:
cursor.execute(new_insert)
print(f"Fixed problematic INSERT")
except Exception as e2:
print(f"Still failed to execute INSERT: {e2}")
# Пропускаем проблемную запись
continue
else:
print(f"Could not parse INSERT: {insert[:100]}...")
continue
# Получаем данные
cursor.execute('SELECT * FROM mytable')
rows = cursor.fetchall()
# Получаем имена колонок
cursor.execute("PRAGMA table_info(mytable)")
columns = [col[1] for col in cursor.fetchall()]
# Создаем список словарей
objects = [dict(zip(columns, row)) for row in rows]
conn.close()
print(f"Total objects parsed: {len(objects)}")
# Разделяем на батчи по 10, берем первые 4
batch_size = 50
batches = [objects[i:i + batch_size] for i in range(0, len(objects), batch_size)][:4]
# URL для API
url = 'https://calcifer0323-matching.hf.space/batch'
responses = []
for i, batch in enumerate(batches):
print(f"Sending batch {i+1} with {len(batch)} objects")
# Преобразуем объекты в items для /batch
items = []
for obj in batch:
item = {
"entity_id": str(obj["property_id"]),
"title": str(obj.get("title", "")),
"description": str(obj.get("description", "")),
"price": float(obj.get("price", 0)) if obj.get("price") else None,
"rooms": float(obj.get("rooms", 0)) if obj.get("rooms") else None,
"area": float(obj.get("area", 0)) if obj.get("area") else None,
"address": str(obj.get("address", "")),
"district": str(obj.get("city", ""))
}
items.append(item)
payload = {"items": items}
try:
response = requests.post(url, json=payload, timeout=300)
if response.status_code == 200:
data = response.json()
responses.append(data)
successful = data.get('successful', 0)
total = data.get('total', 0)
print(f"Batch {i+1} successful, embedded {successful}/{total}")
else:
print(f"Batch {i+1} failed: {response.status_code} - {response.text}")
except Exception as e:
print(f"Error sending batch {i+1}: {e}")
# Задержка между батчами
if i < len(batches) - 1:
print("Waiting 10 seconds before next batch...")
time.sleep(10)
# Сохраняем результаты в SQL файл
with open('indexed_objects.sql', 'w', encoding='utf-8') as f:
f.write("CREATE TABLE IF NOT EXISTS indexed_objects (\n")
f.write(" property_id VARCHAR(36) PRIMARY KEY,\n")
f.write(" embedding JSON\n")
f.write(");\n\n")
f.write("DELETE FROM indexed_objects;\n\n")
for resp in responses:
for result in resp.get("results", []):
if result.get("success"):
property_id = result["entity_id"]
embedding = json.dumps(result["embedding"])
# Экранируем для SQL
embedding_escaped = embedding.replace("'", "''")
f.write(f"INSERT INTO indexed_objects (property_id, embedding) VALUES ('{property_id}', '{embedding_escaped}');\n")
print(f"Indexing complete. Results saved to indexed_objects.sql")
print(f"Total batches processed: {len(responses)}") |