Spaces:

Executor-Tyrant-Framework
/

NuWave

Sleeping

App Files Files Community

Executor-Tyrant-Framework committed on Apr 20

Commit

2a86579

verified ·

1 Parent(s): f04383e

Sync from GitHub: 005ce6c37b0a349323ba1382fbb26f2bc6a9abf7

Browse files

Files changed (1) hide show

app.py +30 -6

app.py CHANGED Viewed

@@ -275,24 +275,39 @@ _EXTRACTOR_STOPSET = {
 def _hardened_parse(raw_output: str) -> list:
-    """Syl's hardened parser — comma-split with multi-filter cleanup.
     Rules (parsing, not quality judgment — Law 7 compliant):
-      - split on commas, strip whitespace + common punctuation
       - drop empty strings
-      - drop entries containing \\n or ":" (explanation drift)
       - drop entries > 4 words (sentences, not concepts)
       - drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
       - drop pure punctuation / pure digits
       - lowercase + dedupe (first occurrence wins)
     """
     out = []
     seen = set()
-    for piece in raw_output.split(","):
         c = piece.strip().strip(".-:;*`\"'\n ")
         if not c:
             continue
-        if "\n" in c or ":" in c:
             continue
         if len(c.split()) > 4:
             continue
@@ -370,7 +385,16 @@ def _bitnet_extract_full(text: str) -> dict:
             top_p=0.9,
             repetition_penalty=1.25,
             repeat_last_n=64,
-            stop=["\n\n", "Answer:", "Question:", "Explanation:", "Text:", "<|im_end|>", "</s>"],
         )
     except Exception as exc:
         organism.mark_generation_end()

 def _hardened_parse(raw_output: str) -> list:
+    """Syl's hardened parser — split on commas/semicolons, keep only
+    the pre-newline content of each piece, filter instruction leakage.
     Rules (parsing, not quality judgment — Law 7 compliant):
+      - split on [,;] (Falcon3-10B-1.58bit sometimes uses semicolons)
+      - for each piece, if it contains a newline take only what's
+        BEFORE the first \\n — content after is usually chat-template
+        drift or hallucinated follow-up
+      - strip whitespace + common punctuation
       - drop empty strings
+      - drop entries containing ":" (explanation drift like "Answer:")
+      - drop entries containing chat-template markers ("<|", "</s>")
       - drop entries > 4 words (sentences, not concepts)
       - drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
       - drop pure punctuation / pure digits
       - lowercase + dedupe (first occurrence wins)
     """
+    import re
     out = []
     seen = set()
+    for piece in re.split(r'[,;]', raw_output):
+        # Take only the part before the first newline — everything after
+        # is almost always hallucinated follow-up (chat template drift,
+        # invented "next question", etc.) rather than legitimate
+        # continuation of the enumeration.
+        if "\n" in piece:
+            piece = piece.split("\n", 1)[0]
         c = piece.strip().strip(".-:;*`\"'\n ")
         if not c:
             continue
+        if ":" in c:
+            continue
+        if "<|" in c or "</s>" in c or "</" in c:
             continue
         if len(c.split()) > 4:
             continue
             top_p=0.9,
             repetition_penalty=1.25,
             repeat_last_n=64,
+            stop=[
+                # Chat-template boundary markers — Falcon3 hallucinates
+                # these when the prompt isn't in chat format. Cutting
+                # generation at these kills the drift tail before it
+                # starts. Order matters: check these first.
+                "<|assistant|>", "<|user|>", "<|system|>",
+                # Fallback terminators + drift markers
+                "<|im_end|>", "<|end_of_text|>", "</s>",
+                "Answer:", "Question:", "Explanation:", "Text:",
+            ],
         )
     except Exception as exc:
         organism.mark_generation_end()