Spaces:
Sleeping
Sleeping
Nikolay Ponomarev
committed on
Commit
·
2cec12a
1
Parent(s):
410e024
Item Search
Browse files
app.py
CHANGED
|
@@ -259,14 +259,92 @@ def sort_by_priority(text: str) -> str:
|
|
| 259 |
out = (out + "\n\n" + tail).strip()
|
| 260 |
return out
|
| 261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
def ensure_doc_finance_subpoints(text: str) -> str:
|
| 264 |
"""
|
| 265 |
-
|
| 266 |
-
|
| 267 |
"""
|
| 268 |
lines = text.splitlines()
|
| 269 |
out = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
i = 0
|
| 271 |
while i < len(lines):
|
| 272 |
ln = lines[i]
|
|
@@ -275,27 +353,27 @@ def ensure_doc_finance_subpoints(text: str) -> str:
|
|
| 275 |
if ln.startswith("- [ ]"):
|
| 276 |
low = ln.lower()
|
| 277 |
|
| 278 |
-
# detect if next
|
| 279 |
j = i + 1
|
| 280 |
-
has_sub =
|
| 281 |
-
while j < len(lines) and lines[j].startswith(" -"):
|
| 282 |
-
has_sub = True
|
| 283 |
-
break
|
| 284 |
|
| 285 |
-
#
|
| 286 |
-
if not has_sub and any(k in low for k in ["документ", "виза", "страхов", "паспорт"]):
|
| 287 |
out.extend([
|
| 288 |
" - Проверь срок действия загранпаспорта и требования к остаточному сроку",
|
| 289 |
" - Составь список нужных документов и сделай копии (бумага + облако)",
|
| 290 |
" - Уточни требования по визе/ВНЖ и собери подтверждения (финансы, жильё, приглашение)",
|
| 291 |
])
|
|
|
|
| 292 |
|
| 293 |
-
|
|
|
|
| 294 |
out.extend([
|
| 295 |
" - Проверь лимиты, комиссии и возможность работы карт за границей",
|
| 296 |
-
" -
|
| 297 |
" - Включи уведомления и двухфакторную защиту в банковских приложениях",
|
| 298 |
])
|
|
|
|
| 299 |
|
| 300 |
i += 1
|
| 301 |
|
|
@@ -369,16 +447,23 @@ def generate_checklist(user_goal: str, category: str, style: str, constraints: s
|
|
| 369 |
# Generation (primary)
|
| 370 |
out = gen_pipe(
|
| 371 |
prompt,
|
| 372 |
-
max_new_tokens=
|
| 373 |
do_sample=True,
|
| 374 |
-
temperature=0.
|
| 375 |
-
top_p=0.
|
| 376 |
-
repetition_penalty=1.
|
|
|
|
| 377 |
return_full_text=False,
|
| 378 |
)
|
| 379 |
text = (out[0].get("generated_text") or "").strip()
|
| 380 |
text = clean_checklist(text)
|
| 381 |
text = sort_by_priority(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
# Retry if too short/poor format
|
| 384 |
if text.count("- [ ]") < 10:
|
|
|
|
| 259 |
out = (out + "\n\n" + tail).strip()
|
| 260 |
return out
|
| 261 |
|
| 262 |
+
def dedupe_checklist(text: str) -> str:
    """
    Remove duplicate checklist items (and duplicate subpoints) while keeping order.

    The input is split by ``split_tail_sections`` into a body and a tail. Only
    the body is processed; a non-empty tail is appended back after a blank line.

    The body is read as blocks: a line starting with ``- [ ]`` opens an item,
    and the indented ``-`` lines directly below it are its subpoints. Body
    lines that are neither an item nor a subpoint attached to an item are not
    carried over to the result.

    Items are compared by a normalized key: case-insensitive, ``(P0)``..``(P2)``
    tags and punctuation ignored, and "ё" treated as "е". The first occurrence
    of an item wins; later duplicates are dropped together with their
    subpoints. Subpoints are deduplicated only within their own item.

    Args:
        text: Checklist text.

    Returns:
        The deduplicated text with surrounding whitespace stripped.
    """
    body, tail = split_tail_sections(text)

    def is_subpoint(line: str) -> bool:
        # Accept any indent width (one space, several, a tab) instead of one
        # exact prefix, so differently indented subpoints are not silently lost.
        return line[:1].isspace() and line.lstrip().startswith("-")

    def norm(s: str) -> str:
        # "ё" lies outside the "а-я" range used below; fold it to "е" first so
        # that "счёт" and "счет" produce the same key instead of "сч т" / "счет".
        s = s.lower().replace("ё", "е")
        s = re.sub(r"\(p[0-2]\)", "", s)
        s = re.sub(r"[^a-zа-я0-9 ]+", " ", s)
        return re.sub(r"\s+", " ", s).strip()

    # Group the body into (item, subpoints) blocks.
    blocks = []
    open_subs = None  # subpoint list of the item currently being read, if any
    for line in body.splitlines():
        if line.startswith("- [ ]"):
            open_subs = []
            blocks.append((line, open_subs))
        elif open_subs is not None and is_subpoint(line):
            open_subs.append(line)
        else:
            # Any other line ends the current block; subpoints that follow it
            # are no longer attached to an item.
            open_subs = None

    seen_items = set()
    result = []
    for item, subs in blocks:
        item_key = norm(item)
        if item_key in seen_items:
            continue
        seen_items.add(item_key)
        result.append(item)

        seen_subs = set()
        for sub in subs:
            sub_key = norm(sub)
            if sub_key not in seen_subs:
                seen_subs.add(sub_key)
                result.append(sub)

    out_text = "\n".join(result).strip()
    if tail:
        out_text = (out_text + "\n\n" + tail).strip()
    return out_text
|
| 312 |
+
|
| 313 |
+
def trim_incomplete_last_line(text: str) -> str:
    """
    Drop the last line of a checklist if it looks cut off mid-generation.

    Only list lines are considered: a checkbox item (``- [ ] ...``) or an
    indented subpoint (``  - ...``). Such a line is treated as truncated when:

    * it ends with dangling punctuation (``,`` ``:`` ``;`` or a dash), or
    * it is a checkbox with no text after the marker, or
    * it is a subpoint with fewer than three whitespace-separated tokens
      (the ``-`` marker counts as one).

    Any other last line is left in place.

    Args:
        text: Checklist text.

    Returns:
        The text without the truncated last line, stripped of surrounding
        whitespace. An input with no lines is returned unchanged.
    """
    lines = text.splitlines()
    if not lines:
        return text

    raw_last = lines[-1]
    last = raw_last.strip()

    is_item = last.startswith("- [ ]")
    # Subpoints are recognized by their indent, so it must be checked on the
    # raw line: after strip() a line can never start with whitespace.
    is_subpoint = not is_item and raw_last[:1].isspace() and last.startswith("-")

    if is_item or is_subpoint:
        dangling = last.endswith((",", ":", ";", "—", "–", "-"))
        if is_item:
            # The "- [ ]" marker alone already splits into three tokens, so a
            # token count cannot detect an empty item; look at the text after it.
            too_short = not last[len("- [ ]"):].strip()
        else:
            too_short = len(last.split()) < 3
        if dangling or too_short:
            lines = lines[:-1]

    return "\n".join(lines).strip()
|
| 327 |
+
|
| 328 |
|
| 329 |
def ensure_doc_finance_subpoints(text: str) -> str:
|
| 330 |
"""
|
| 331 |
+
Adds docs/finance subpoints only once (to the first suitable item without subpoints),
|
| 332 |
+
to avoid duplication across repeated/near-duplicate items.
|
| 333 |
"""
|
| 334 |
lines = text.splitlines()
|
| 335 |
out = []
|
| 336 |
+
|
| 337 |
+
added_docs = False
|
| 338 |
+
added_fin = False
|
| 339 |
+
|
| 340 |
+
def has_any_subpoints_with_keywords(all_lines, keywords):
|
| 341 |
+
low_all = "\n".join(all_lines).lower()
|
| 342 |
+
return any(k in low_all for k in keywords)
|
| 343 |
+
|
| 344 |
+
# If checklist already contains relevant subpoints somewhere, don't inject more.
|
| 345 |
+
docs_kw_present = has_any_subpoints_with_keywords(lines, ["загранпаспорт", "виза", "внж", "копии", "облако", "приглашение"])
|
| 346 |
+
fin_kw_present = has_any_subpoints_with_keywords(lines, ["комисси", "лимит", "вторая карта", "2fa", "двухфактор", "уведомлен"])
|
| 347 |
+
|
| 348 |
i = 0
|
| 349 |
while i < len(lines):
|
| 350 |
ln = lines[i]
|
|
|
|
| 353 |
if ln.startswith("- [ ]"):
|
| 354 |
low = ln.lower()
|
| 355 |
|
| 356 |
+
# detect if next lines already have subpoints
|
| 357 |
j = i + 1
|
| 358 |
+
has_sub = (j < len(lines) and lines[j].startswith(" -"))
|
|
|
|
|
|
|
|
|
|
| 359 |
|
| 360 |
+
# DOCS
|
| 361 |
+
if (not added_docs) and (not docs_kw_present) and (not has_sub) and any(k in low for k in ["документ", "виза", "внж", "страхов", "паспорт"]):
|
| 362 |
out.extend([
|
| 363 |
" - Проверь срок действия загранпаспорта и требования к остаточному сроку",
|
| 364 |
" - Составь список нужных документов и сделай копии (бумага + облако)",
|
| 365 |
" - Уточни требования по визе/ВНЖ и собери подтверждения (финансы, жильё, приглашение)",
|
| 366 |
])
|
| 367 |
+
added_docs = True
|
| 368 |
|
| 369 |
+
# FINANCE
|
| 370 |
+
if (not added_fin) and (not fin_kw_present) and (not has_sub) and any(k in low for k in ["счёт", "счет", "банк", "карта", "финанс"]):
|
| 371 |
out.extend([
|
| 372 |
" - Проверь лимиты, комиссии и возможность работы карт за границей",
|
| 373 |
+
" - Подготовь резервный доступ к деньгам (вторая карта/наличные/перевод)",
|
| 374 |
" - Включи уведомления и двухфакторную защиту в банковских приложениях",
|
| 375 |
])
|
| 376 |
+
added_fin = True
|
| 377 |
|
| 378 |
i += 1
|
| 379 |
|
|
|
|
| 447 |
# Generation (primary)
|
| 448 |
out = gen_pipe(
|
| 449 |
prompt,
|
| 450 |
+
max_new_tokens=900, # было меньше — из-за этого обрывы
|
| 451 |
do_sample=True,
|
| 452 |
+
temperature=0.55, # ниже = меньше болтовни/повторов
|
| 453 |
+
top_p=0.9,
|
| 454 |
+
repetition_penalty=1.15, # выше = меньше дублей
|
| 455 |
+
no_repeat_ngram_size=4, # очень помогает от повторов
|
| 456 |
return_full_text=False,
|
| 457 |
)
|
| 458 |
text = (out[0].get("generated_text") or "").strip()
|
| 459 |
text = clean_checklist(text)
|
| 460 |
text = sort_by_priority(text)
|
| 461 |
+
text = ensure_doc_finance_subpoints(text)
|
| 462 |
+
text = dedupe_checklist(text) # убрали повторы
|
| 463 |
+
text = trim_incomplete_last_line(text) # убрали оборванный хвост
|
| 464 |
+
text = polish_russian_same_model(text) # шлифовка
|
| 465 |
+
text = dedupe_checklist(text) # на всякий после полировки
|
| 466 |
+
text = trim_incomplete_last_line(text)
|
| 467 |
|
| 468 |
# Retry if too short/poor format
|
| 469 |
if text.count("- [ ]") < 10:
|