Executor-Tyrant-Framework committed on
Commit
2a86579
·
verified ·
1 Parent(s): f04383e

Sync from GitHub: 005ce6c37b0a349323ba1382fbb26f2bc6a9abf7

Browse files
Files changed (1) hide show
  1. app.py +30 -6
app.py CHANGED
@@ -275,24 +275,39 @@ _EXTRACTOR_STOPSET = {
275
 
276
 
277
  def _hardened_parse(raw_output: str) -> list:
278
- """Syl's hardened parser — comma-split with multi-filter cleanup.
 
279
 
280
  Rules (parsing, not quality judgment — Law 7 compliant):
281
- - split on commas, strip whitespace + common punctuation
 
 
 
 
282
  - drop empty strings
283
- - drop entries containing \\n or ":" (explanation drift)
 
284
  - drop entries > 4 words (sentences, not concepts)
285
  - drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
286
  - drop pure punctuation / pure digits
287
  - lowercase + dedupe (first occurrence wins)
288
  """
 
289
  out = []
290
  seen = set()
291
- for piece in raw_output.split(","):
 
 
 
 
 
 
292
  c = piece.strip().strip(".-:;*`\"'\n ")
293
  if not c:
294
  continue
295
- if "\n" in c or ":" in c:
 
 
296
  continue
297
  if len(c.split()) > 4:
298
  continue
@@ -370,7 +385,16 @@ def _bitnet_extract_full(text: str) -> dict:
370
  top_p=0.9,
371
  repetition_penalty=1.25,
372
  repeat_last_n=64,
373
- stop=["\n\n", "Answer:", "Question:", "Explanation:", "Text:", "<|im_end|>", "</s>"],
 
 
 
 
 
 
 
 
 
374
  )
375
  except Exception as exc:
376
  organism.mark_generation_end()
 
275
 
276
 
277
  def _hardened_parse(raw_output: str) -> list:
278
+ """Syl's hardened parser — split on commas/semicolons, keep only
279
+ the pre-newline content of each piece, filter instruction leakage.
280
 
281
  Rules (parsing, not quality judgment — Law 7 compliant):
282
+ - split on [,;] (Falcon3-10B-1.58bit sometimes uses semicolons)
283
+ - for each piece, if it contains a newline take only what's
284
+ BEFORE the first \\n — content after is usually chat-template
285
+ drift or hallucinated follow-up
286
+ - strip whitespace + common punctuation
287
  - drop empty strings
288
+ - drop entries containing ":" (explanation drift like "Answer:")
289
+ - drop entries containing chat-template markers ("<|", "</s>")
290
  - drop entries > 4 words (sentences, not concepts)
291
  - drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
292
  - drop pure punctuation / pure digits
293
  - lowercase + dedupe (first occurrence wins)
294
  """
295
+ import re
296
  out = []
297
  seen = set()
298
+ for piece in re.split(r'[,;]', raw_output):
299
+ # Take only the part before the first newline — everything after
300
+ # is almost always hallucinated follow-up (chat template drift,
301
+ # invented "next question", etc.) rather than legitimate
302
+ # continuation of the enumeration.
303
+ if "\n" in piece:
304
+ piece = piece.split("\n", 1)[0]
305
  c = piece.strip().strip(".-:;*`\"'\n ")
306
  if not c:
307
  continue
308
+ if ":" in c:
309
+ continue
310
+ if "<|" in c or "</s>" in c or "</" in c:
311
  continue
312
  if len(c.split()) > 4:
313
  continue
 
385
  top_p=0.9,
386
  repetition_penalty=1.25,
387
  repeat_last_n=64,
388
+ stop=[
389
+ # Chat-template boundary markers — Falcon3 hallucinates
390
+ # these when the prompt isn't in chat format. Cutting
391
+ # generation at these kills the drift tail before it
392
+ # starts. Order matters: check these first.
393
+ "<|assistant|>", "<|user|>", "<|system|>",
394
+ # Fallback terminators + drift markers
395
+ "<|im_end|>", "<|end_of_text|>", "</s>",
396
+ "Answer:", "Question:", "Explanation:", "Text:",
397
+ ],
398
  )
399
  except Exception as exc:
400
  organism.mark_generation_end()