youssefreda9 committed on
Commit
d3a32e2
·
1 Parent(s): 2fe1569

FIX-23: Block grammar model tanween removal + HF log fetcher

Browse files

1. Tanween blocker: the grammar model strips tanween (جداً→جدا) from correct text.
Added a filter that blocks diffs whose only difference is tanween marks.
2. HF log fetcher script for automated tracing.

src/app.py CHANGED
@@ -1631,6 +1631,21 @@ def analyze_text():
1631
  )
1632
  continue
1633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1634
  # ── FIX-06: Directional block protection for grammar ──
1635
  # Prevents meaning-changing substitutions (كان→كأن etc.)
1636
  # especially critical when spelling is skipped (>1000 chars).
 
1631
  )
1632
  continue
1633
 
1634
+ # ── FIX-23: Tanween removal blocker ──
1635
+ # The grammar model often strips tanween (ً/ٌ/ٍ) from correct text.
1636
+ # Block diffs whose only difference is tanween marks (the check is symmetric, so tanween additions or swaps are blocked as well as removals).
1637
+ if orig_text and corr_text:
1638
+ import re as _re_tnwn
1639
+ _TANWEEN = '\u064B\u064C\u064D' # ً ٌ ٍ
1640
+ _orig_no_tnwn = _re_tnwn.sub(f'[{_TANWEEN}]', '', orig_text)
1641
+ _corr_no_tnwn = _re_tnwn.sub(f'[{_TANWEEN}]', '', corr_text)
1642
+ if _orig_no_tnwn == _corr_no_tnwn and orig_text != corr_text:
1643
+ logger.info(
1644
+ f"[GRAMMAR] Blocked tanween removal: "
1645
+ f"'{orig_text}'→'{corr_text}'"
1646
+ )
1647
+ continue
1648
+
1649
  # ── FIX-06: Directional block protection for grammar ──
1650
  # Prevents meaning-changing substitutions (كان→كأن etc.)
1651
  # especially critical when spelling is skipped (>1000 chars).
tests/phase10/fetch_hf_logs.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fetch HF Space runtime logs and extract key events."""
2
+ import requests
3
+ import json
4
+ import sys
5
+ import os
6
+
7
+ SPACE_ID = "bayan10/bayan-api"
8
+
9
def _get_hf_token():
    """Return a Hugging Face access token, or "" when none can be found.

    Lookup order:
      1. The ``HF_TOKEN`` environment variable.
      2. The token file written by ``huggingface-cli login``:
         ``$HF_TOKEN_PATH`` if set, otherwise ``$HF_HOME/token``, where
         ``HF_HOME`` defaults to ``$XDG_CACHE_HOME/huggingface`` and finally
         to ``~/.cache/huggingface``.
      3. The legacy default ``~/.cache/huggingface/token`` (kept as a
         fallback so existing setups keep working).

    Returns:
        str: The token with surrounding whitespace removed, or an empty
        string when no source provides a non-empty token.
    """
    # 1. Environment variable takes precedence over any stored credential.
    token = os.environ.get("HF_TOKEN", "").strip()
    if token:
        return token

    # 2. Resolve the stored-token location, honouring the override variables.
    home_cache = os.path.join(os.path.expanduser("~"), ".cache")
    legacy_path = os.path.join(home_cache, "huggingface", "token")

    explicit_path = os.environ.get("HF_TOKEN_PATH", "").strip()
    if explicit_path:
        primary_path = os.path.expanduser(explicit_path)
    else:
        hf_home = os.environ.get("HF_HOME", "").strip()
        if not hf_home:
            cache_root = os.environ.get("XDG_CACHE_HOME", "").strip() or home_cache
            hf_home = os.path.join(cache_root, "huggingface")
        primary_path = os.path.join(os.path.expanduser(hf_home), "token")

    candidates = [primary_path]
    if legacy_path != primary_path:
        candidates.append(legacy_path)

    # Open directly instead of exists()+open(): avoids the check/use race and
    # treats an unreadable file the same as a missing one.
    for path in candidates:
        try:
            with open(path, "r", encoding="utf-8") as f:
                token = f.read().strip()
        except (OSError, UnicodeDecodeError):
            continue
        if token:
            return token
    return ""
21
+
22
+ TOKEN = _get_hf_token()
23
+
24
def _parse_sse_data_line(line):
    """Return the log message carried by one SSE ``data:`` line, else None.

    Lines that are not ``data:`` fields, are not valid JSON, are not JSON
    objects, or carry a null ``data`` value yield None so the caller skips
    them.
    """
    if not line.startswith("data:"):
        return None
    try:
        payload = json.loads(line[len("data:"):])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get("data", "")


def fetch_logs(max_lines=500):
    """Fetch runtime logs from the HF Space.

    Streams the Space's server-sent-event log endpoint and collects the
    ``data`` field of each JSON event.

    Args:
        max_lines: Maximum number of log messages to return. Values <= 0
            return an empty list without contacting the server.

    Returns:
        list: At most ``max_lines`` log messages. Empty when the server
        answers with a non-200 status (the status is printed). If the
        connection fails or the stream times out, the error is printed and
        whatever was collected so far is returned.
    """
    if max_lines <= 0:
        return []

    # An empty token would produce a malformed "Bearer " header; omit it.
    headers = {"Authorization": f"Bearer {TOKEN}"} if TOKEN else {}
    url = f"https://huggingface.co/api/spaces/{SPACE_ID}/logs/run"

    lines = []
    try:
        # Context manager guarantees the streamed connection is released.
        with requests.get(url, headers=headers, timeout=30, stream=True) as r:
            if r.status_code != 200:
                print(f"Error: {r.status_code}")
                return []

            # iter_lines() reassembles lines across network chunks, so an
            # event split over two chunks is no longer corrupted or dropped.
            # Decode explicitly: the response may not declare an encoding.
            for raw in r.iter_lines():
                message = _parse_sse_data_line(raw.decode("utf-8", errors="replace"))
                if message is None:
                    continue
                lines.append(message)
                if len(lines) >= max_lines:
                    break
    except requests.exceptions.RequestException as exc:
        # The log stream stays open; an idle read timeout is expected once
        # the backlog is drained. Keep what was already received.
        print(f"Error: {exc}")
    return lines
46
+
47
def _classify_log_line(text):
    """Return the category of one log line, or None if it matches nothing.

    Categories are checked in priority order: "error", "grammar",
    "spelling", "startup".
    """
    if 'ERROR' in text or 'NameError' in text or 'Traceback' in text:
        return 'error'
    if '[GRAMMAR' in text or 'Grammar' in text:
        return 'grammar'
    if '[SPELLING' in text:
        return 'spelling'
    lowered = text.lower()
    if 'Startup' in text or 'loaded' in lowered or 'ready' in lowered:
        return 'startup'
    return None


def analyze_logs(lines):
    """Extract key events from logs and print a summary report.

    Each entry is placed in at most one bucket (errors, grammar, spelling,
    startup). The report shows the last 5 startup, grammar and spelling
    events and the last 10 errors.

    Args:
        lines: Iterable-with-len of log entries. Non-string entries are
            converted with ``str()``; ``None`` entries are ignored for
            classification but still counted in the total.

    Returns:
        None. The report is written to stdout.
    """
    buckets = {'error': [], 'grammar': [], 'spelling': [], 'startup': []}

    for line in lines:
        if line is None:
            continue
        # Log payloads come from JSON and are not guaranteed to be strings.
        text = line if isinstance(line, str) else str(line)
        category = _classify_log_line(text)
        if category is not None:
            buckets[category].append(text)

    errors = buckets['error']
    grammar_events = buckets['grammar']
    spelling_events = buckets['spelling']
    startup = buckets['startup']

    print(f"\n{'='*60}")
    print(f"HF SPACE LOG ANALYSIS ({len(lines)} lines)")
    print(f"{'='*60}")

    print(f"\n🚀 STARTUP ({len(startup)} events):")
    for e in startup[-5:]:
        print(f" {e}")

    print(f"\n❌ ERRORS ({len(errors)}):")
    if errors:
        for e in errors[-10:]:
            print(f" {e}")
    else:
        print(" None! ✅")

    print(f"\n📝 GRAMMAR ({len(grammar_events)} events, last 5):")
    for e in grammar_events[-5:]:
        print(f" {e}")

    print(f"\n✏️ SPELLING ({len(spelling_events)} events, last 5):")
    for e in spelling_events[-5:]:
        print(f" {e}")
86
+
87
+ if __name__ == "__main__":
88
+ lines = fetch_logs()
89
+ analyze_logs(lines)
tests/phase10/reports/phase10_results.json CHANGED
The diff for this file is too large to render. See raw diff