Spaces:
Sleeping
Sleeping
Sync from GitHub: 005ce6c37b0a349323ba1382fbb26f2bc6a9abf7
Browse files
app.py
CHANGED
|
@@ -275,24 +275,39 @@ _EXTRACTOR_STOPSET = {
|
|
| 275 |
|
| 276 |
|
| 277 |
def _hardened_parse(raw_output: str) -> list:
|
| 278 |
-
"""Syl's hardened parser —
|
|
|
|
| 279 |
|
| 280 |
Rules (parsing, not quality judgment — Law 7 compliant):
|
| 281 |
-
- split on
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
- drop empty strings
|
| 283 |
-
- drop entries containing
|
|
|
|
| 284 |
- drop entries > 4 words (sentences, not concepts)
|
| 285 |
- drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
|
| 286 |
- drop pure punctuation / pure digits
|
| 287 |
- lowercase + dedupe (first occurrence wins)
|
| 288 |
"""
|
|
|
|
| 289 |
out = []
|
| 290 |
seen = set()
|
| 291 |
-
for piece in
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
c = piece.strip().strip(".-:;*`\"'\n ")
|
| 293 |
if not c:
|
| 294 |
continue
|
| 295 |
-
if "
|
|
|
|
|
|
|
| 296 |
continue
|
| 297 |
if len(c.split()) > 4:
|
| 298 |
continue
|
|
@@ -370,7 +385,16 @@ def _bitnet_extract_full(text: str) -> dict:
|
|
| 370 |
top_p=0.9,
|
| 371 |
repetition_penalty=1.25,
|
| 372 |
repeat_last_n=64,
|
| 373 |
-
stop=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
)
|
| 375 |
except Exception as exc:
|
| 376 |
organism.mark_generation_end()
|
|
|
|
| 275 |
|
| 276 |
|
| 277 |
def _hardened_parse(raw_output: str) -> list:
|
| 278 |
+
"""Syl's hardened parser — split on commas/semicolons, keep only
|
| 279 |
+
the pre-newline content of each piece, filter instruction leakage.
|
| 280 |
|
| 281 |
Rules (parsing, not quality judgment — Law 7 compliant):
|
| 282 |
+
- split on [,;] (Falcon3-10B-1.58bit sometimes uses semicolons)
|
| 283 |
+
- for each piece, if it contains a newline take only what's
|
| 284 |
+
BEFORE the first \\n — content after is usually chat-template
|
| 285 |
+
drift or hallucinated follow-up
|
| 286 |
+
- strip whitespace + common punctuation
|
| 287 |
- drop empty strings
|
| 288 |
+
- drop entries containing ":" (explanation drift like "Answer:")
|
| 289 |
+
- drop entries containing chat-template markers ("<|", "</s>") or any other closing-tag fragment ("</")
|
| 290 |
- drop entries > 4 words (sentences, not concepts)
|
| 291 |
- drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
|
| 292 |
- drop pure punctuation / pure digits
|
| 293 |
- lowercase + dedupe (first occurrence wins)
|
| 294 |
"""
|
| 295 |
+
import re
|
| 296 |
out = []
|
| 297 |
seen = set()
|
| 298 |
+
for piece in re.split(r'[,;]', raw_output):
|
| 299 |
+
# Take only the part before the first newline — everything after
|
| 300 |
+
# is almost always hallucinated follow-up (chat template drift,
|
| 301 |
+
# invented "next question", etc.) rather than legitimate
|
| 302 |
+
# continuation of the enumeration.
|
| 303 |
+
if "\n" in piece:
|
| 304 |
+
piece = piece.split("\n", 1)[0]
|
| 305 |
c = piece.strip().strip(".-:;*`\"'\n ")
|
| 306 |
if not c:
|
| 307 |
continue
|
| 308 |
+
if ":" in c:
|
| 309 |
+
continue
|
| 310 |
+
if "<|" in c or "</s>" in c or "</" in c:
|
| 311 |
continue
|
| 312 |
if len(c.split()) > 4:
|
| 313 |
continue
|
|
|
|
| 385 |
top_p=0.9,
|
| 386 |
repetition_penalty=1.25,
|
| 387 |
repeat_last_n=64,
|
| 388 |
+
stop=[
|
| 389 |
+
# Chat-template boundary markers — Falcon3 hallucinates
|
| 390 |
+
# these when the prompt isn't in chat format. Cutting
|
| 391 |
+
# generation at these kills the drift tail before it
|
| 392 |
+
# starts. Order matters: check these first.
|
| 393 |
+
"<|assistant|>", "<|user|>", "<|system|>",
|
| 394 |
+
# Fallback terminators + drift markers
|
| 395 |
+
"<|im_end|>", "<|end_of_text|>", "</s>",
|
| 396 |
+
"Answer:", "Question:", "Explanation:", "Text:",
|
| 397 |
+
],
|
| 398 |
)
|
| 399 |
except Exception as exc:
|
| 400 |
organism.mark_generation_end()
|