Commit ·
d3a32e2
1
Parent(s): 2fe1569
FIX-23: Block grammar model tanween removal + HF log fetcher
Browse files1. Tanween blocker: grammar model strips tanween (جداً→جدا) from correct text.
Added filter to block diffs that only differ by tanween marks.
2. HF log fetcher script for automated tracing.
- src/app.py +15 -0
- tests/phase10/fetch_hf_logs.py +89 -0
- tests/phase10/reports/phase10_results.json +0 -0
src/app.py
CHANGED
|
@@ -1631,6 +1631,21 @@ def analyze_text():
|
|
| 1631 |
)
|
| 1632 |
continue
|
| 1633 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1634 |
# ── FIX-06: Directional block protection for grammar ──
|
| 1635 |
# Prevents meaning-changing substitutions (كان→كأن etc.)
|
| 1636 |
# especially critical when spelling is skipped (>1000 chars).
|
|
|
|
| 1631 |
)
|
| 1632 |
continue
|
| 1633 |
|
| 1634 |
+
# ── FIX-23: Tanween removal blocker ──
|
| 1635 |
+
# The grammar model often strips tanween (ً/ٌ/ٍ) from correct text.
|
| 1636 |
+
# Block diffs whose only difference is tanween marks (removal, addition, or substitution).
|
| 1637 |
+
if orig_text and corr_text:
|
| 1638 |
+
import re as _re_tnwn
|
| 1639 |
+
_TANWEEN = '\u064B\u064C\u064D' # ً ٌ ٍ
|
| 1640 |
+
_orig_no_tnwn = _re_tnwn.sub(f'[{_TANWEEN}]', '', orig_text)
|
| 1641 |
+
_corr_no_tnwn = _re_tnwn.sub(f'[{_TANWEEN}]', '', corr_text)
|
| 1642 |
+
if _orig_no_tnwn == _corr_no_tnwn and orig_text != corr_text:
|
| 1643 |
+
logger.info(
|
| 1644 |
+
f"[GRAMMAR] Blocked tanween removal: "
|
| 1645 |
+
f"'{orig_text}'→'{corr_text}'"
|
| 1646 |
+
)
|
| 1647 |
+
continue
|
| 1648 |
+
|
| 1649 |
# ── FIX-06: Directional block protection for grammar ──
|
| 1650 |
# Prevents meaning-changing substitutions (كان→كأن etc.)
|
| 1651 |
# especially critical when spelling is skipped (>1000 chars).
|
tests/phase10/fetch_hf_logs.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fetch HF Space runtime logs and extract key events."""
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Hugging Face Space identifier; interpolated into the logs API URL in fetch_logs().
SPACE_ID = "bayan10/bayan-api"
|
| 8 |
+
|
| 9 |
+
def _get_hf_token():
    """Return the Hugging Face access token, or "" when none is found.

    Lookup order:
      1. The ``HF_TOKEN`` environment variable.
      2. The stored token file (as written by ``huggingface-cli login``):
         ``HF_TOKEN_PATH`` if set, otherwise ``$HF_HOME/token`` with
         ``HF_HOME`` defaulting to ``~/.cache/huggingface``.

    Surrounding whitespace is stripped from whichever value is used.
    """
    # 1. Environment variable
    token = os.environ.get("HF_TOKEN", "").strip()
    if token:
        return token

    # 2. Stored token file; honour the overrides so a relocated cache
    #    directory is still found.
    token_path = os.environ.get("HF_TOKEN_PATH")
    if not token_path:
        hf_home = os.environ.get("HF_HOME") or os.path.join(
            os.path.expanduser("~"), ".cache", "huggingface"
        )
        token_path = os.path.join(os.path.expanduser(hf_home), "token")

    # EAFP: a missing or unreadable file simply means "no stored token".
    try:
        with open(token_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except OSError:
        return ""
|
| 21 |
+
|
| 22 |
+
# Resolved once when the module is loaded; fetch_logs() puts it in the Authorization header.
TOKEN = _get_hf_token()
|
| 23 |
+
|
| 24 |
+
def fetch_logs(max_lines=500):
    """Fetch runtime logs from the HF Space.

    The response is read as a line-oriented event stream: every line that
    starts with ``data: `` is parsed as JSON and the value of its ``data``
    key is collected as one log message.

    Args:
        max_lines: Maximum number of log messages to collect before the
            stream is closed.

    Returns:
        A list of at most ``max_lines`` log messages. Empty when the server
        answers with a non-200 status or the request fails before any
        message was read.
    """
    # Only send credentials when a token was found; "Bearer " followed by
    # nothing is not a meaningful Authorization header.
    headers = {"Authorization": f"Bearer {TOKEN}"} if TOKEN else {}
    url = f"https://huggingface.co/api/spaces/{SPACE_ID}/logs/run"

    lines = []
    if max_lines <= 0:
        return lines

    try:
        # The context manager guarantees the streamed connection is released.
        with requests.get(url, headers=headers, timeout=30, stream=True) as r:
            if r.status_code != 200:
                print(f"Error: {r.status_code}")
                return []

            # Without a declared charset, decode_unicode would yield bytes.
            if r.encoding is None:
                r.encoding = "utf-8"

            # iter_lines() reassembles lines that span network chunks, which
            # a manual split of fixed-size chunks would cut in half.
            for line in r.iter_lines(decode_unicode=True):
                if not line or not line.startswith("data: "):
                    continue
                try:
                    data = json.loads(line[6:])
                except ValueError:
                    # Not a JSON payload; ignore this event.
                    continue
                if isinstance(data, dict):
                    lines.append(data.get("data", ""))
                if len(lines) >= max_lines:
                    break
    except requests.exceptions.RequestException as exc:
        # NOTE(review): presumably the log stream stays open, so a read
        # timeout on an idle stream is expected; keep what was collected.
        print(f"Error: {exc}")
    return lines
|
| 46 |
+
|
| 47 |
+
def analyze_logs(lines):
    """Categorise log lines, print a summary report, and return the groups.

    Each string line goes into the first matching category, in this order:
    errors, grammar events, spelling events, startup events. Lines matching
    none of them, and entries that are not strings, are ignored.

    Args:
        lines: Iterable-with-length of log messages (e.g. the result of
            ``fetch_logs``).

    Returns:
        A dict with keys ``"errors"``, ``"grammar"``, ``"spelling"`` and
        ``"startup"``, each mapping to the list of matching lines in their
        original order.
    """
    errors = []
    grammar_events = []
    spelling_events = []
    startup = []

    for line in lines:
        # A JSON payload may carry a null or non-text "data" value; substring
        # tests on such an entry would raise TypeError.
        if not isinstance(line, str):
            continue
        lowered = line.lower()
        if 'ERROR' in line or 'NameError' in line or 'Traceback' in line:
            errors.append(line)
        elif '[GRAMMAR' in line or 'Grammar' in line:
            grammar_events.append(line)
        elif '[SPELLING' in line:
            spelling_events.append(line)
        elif 'Startup' in line or 'loaded' in lowered or 'ready' in lowered:
            startup.append(line)

    print(f"\n{'='*60}")
    print(f"HF SPACE LOG ANALYSIS ({len(lines)} lines)")
    print(f"{'='*60}")

    print(f"\n🚀 STARTUP ({len(startup)} events):")
    for e in startup[-5:]:
        print(f" {e}")

    print(f"\n❌ ERRORS ({len(errors)}):")
    if errors:
        for e in errors[-10:]:
            print(f" {e}")
    else:
        print(" None! ✅")

    print(f"\n📝 GRAMMAR ({len(grammar_events)} events, last 5):")
    for e in grammar_events[-5:]:
        print(f" {e}")

    print(f"\n✏️ SPELLING ({len(spelling_events)} events, last 5):")
    for e in spelling_events[-5:]:
        print(f" {e}")

    return {
        "errors": errors,
        "grammar": grammar_events,
        "spelling": spelling_events,
        "startup": startup,
    }
|
| 86 |
+
|
| 87 |
+
if __name__ == "__main__":
    # Script entry point: fetch the Space logs and print the categorised report.
    lines = fetch_logs()
    analyze_logs(lines)
    # Signal failure to automated callers when no log line could be fetched.
    if not lines:
        sys.exit(1)
|
tests/phase10/reports/phase10_results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|