tlogandesigns committed on
Commit
a2eff80
·
1 Parent(s): 5950325

categories patch

Browse files
Files changed (2) hide show
  1. checker.py +105 -17
  2. phrases.yaml +0 -4
checker.py CHANGED
@@ -102,24 +102,82 @@ except Exception:
102
  yaml = None
103
 
104
 
105
- PHRASE_PATTERNS = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  PHRASES_ERROR = None
107
 
 
 
 
 
108
  if yaml:
109
  try:
110
  text = PHRASES_PATH.read_text(encoding="utf-8")
111
  data = yaml.safe_load(text) or {}
112
- pats = data.get("patterns") or []
113
- if not isinstance(pats, list):
114
- raise ValueError("phrases.yaml must define a top-level 'patterns' list")
115
- for i, rx in enumerate(pats, 1):
116
- PHRASE_PATTERNS.append(re.compile(str(rx), re.IGNORECASE))
117
- except FileNotFoundError as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  PHRASES_ERROR = f"phrases.yaml not found at {PHRASES_PATH}"
119
  except Exception as e:
120
  PHRASES_ERROR = f"phrases.yaml load/parse error: {e}"
121
 
122
 
 
123
  # Optional HF pipeline (disabled by default to keep CPU/lightweight)
124
  hf_pipe = None
125
  if USE_TINY_ML:
@@ -132,26 +190,52 @@ if USE_TINY_ML:
132
  hf_pipe = None
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def fair_housing_flags(text: str) -> List[str]:
136
  flags: List[str] = []
137
  t = text or ""
138
 
139
- # Rule-based first
140
- for pat in PHRASE_PATTERNS:
141
- for m in pat.finditer(t):
142
  snippet = t[max(0, m.start() - 30) : m.end() + 30]
 
143
  flags.append(
144
- f"RuleFlag: pattern '{pat.pattern}' matched around: {snippet!r}"
145
  )
146
 
147
- # Optional tiny model
148
  if hf_pipe:
149
  try:
150
- pred = hf_pipe(t[:2000]) # keep it small
151
- # Expecting [{'label': 'LABEL_1'/'LABEL_0', 'score': 0.x}] or custom labels
152
  lbl = pred[0]["label"]
153
  score = float(pred[0]["score"])
154
- # Assume LABEL_1 = potential violation (adjust to your model labels)
155
  if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:
156
  flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
157
  except Exception as e:
@@ -159,6 +243,8 @@ def fair_housing_flags(text: str) -> List[str]:
159
 
160
  return flags
161
 
 
 
162
  hf_pipe = None
163
  if USE_TINY_ML:
164
  try:
@@ -311,11 +397,13 @@ def run_check(
311
  "USE_TINY_ML": USE_TINY_ML,
312
  "HF_REPO": HF_REPO,
313
  "HF_THRESH": HF_THRESH,
314
- "PhrasesLoaded": len(PHRASE_PATTERNS),
315
  "PhrasesPath": str(PHRASES_PATH),
316
  "PhrasesError": PHRASES_ERROR,
317
  "OCR": pytesseract is not None,
318
- }
 
 
319
 
320
  }
321
 
 
102
  yaml = None
103
 
104
 
105
+ # PHRASE_PATTERNS = []
106
+ # PHRASES_ERROR = None
107
+
108
+ # if yaml:
109
+ # try:
110
+ # text = PHRASES_PATH.read_text(encoding="utf-8")
111
+ # data = yaml.safe_load(text) or {}
112
+ # pats = data.get("patterns") or []
113
+ # if not isinstance(pats, list):
114
+ # raise ValueError("phrases.yaml must define a top-level 'patterns' list")
115
+ # for i, rx in enumerate(pats, 1):
116
+ # PHRASE_PATTERNS.append(re.compile(str(rx), re.IGNORECASE))
117
+ # except FileNotFoundError as e:
118
+ # PHRASES_ERROR = f"phrases.yaml not found at {PHRASES_PATH}"
119
+ # except Exception as e:
120
+ # PHRASES_ERROR = f"phrases.yaml load/parse error: {e}"
121
+
122
+ from dataclasses import dataclass
123
+ from pathlib import Path
124
+
125
+ @dataclass
126
+ class Rule:
127
+ regex: re.Pattern
128
+ category: str
129
+ suggests: list[str]
130
+
131
+ PHRASE_RULES: list[Rule] = []
132
  PHRASES_ERROR = None
133
 
134
+ # Resolve phrases path relative to this file, but allow override
135
+ BASE_DIR = Path(__file__).parent
136
+ PHRASES_PATH = Path(os.getenv("PHRASES_PATH", BASE_DIR / "phrases.yaml"))
137
+
138
  if yaml:
139
  try:
140
  text = PHRASES_PATH.read_text(encoding="utf-8")
141
  data = yaml.safe_load(text) or {}
142
+
143
+ # Accept both shapes:
144
+ # 1) { patterns: [...] }
145
+ # 2) { categories: { <Category>: { patterns: [...], suggest: [...] } } }
146
+ if isinstance(data, dict) and "categories" in data:
147
+ cats = data["categories"] or {}
148
+ for cat_name, cfg in cats.items():
149
+ if not isinstance(cfg, dict):
150
+ continue
151
+ pats = cfg.get("patterns") or []
152
+ suggests = cfg.get("suggest") or []
153
+ for rx in pats:
154
+ if isinstance(rx, str):
155
+ PHRASE_RULES.append(
156
+ Rule(
157
+ regex=re.compile(rx, re.IGNORECASE),
158
+ category=str(cat_name),
159
+ suggests=[str(s) for s in suggests if isinstance(s, str)],
160
+ )
161
+ )
162
+ else:
163
+ # Fallback to old shape
164
+ pats = data.get("patterns") or []
165
+ for rx in pats:
166
+ if isinstance(rx, str):
167
+ PHRASE_RULES.append(
168
+ Rule(
169
+ regex=re.compile(rx, re.IGNORECASE),
170
+ category="Uncategorized",
171
+ suggests=[],
172
+ )
173
+ )
174
+ except FileNotFoundError:
175
  PHRASES_ERROR = f"phrases.yaml not found at {PHRASES_PATH}"
176
  except Exception as e:
177
  PHRASES_ERROR = f"phrases.yaml load/parse error: {e}"
178
 
179
 
180
+
181
  # Optional HF pipeline (disabled by default to keep CPU/lightweight)
182
  hf_pipe = None
183
  if USE_TINY_ML:
 
190
  hf_pipe = None
191
 
192
 
193
+ # def fair_housing_flags(text: str) -> List[str]:
194
+ # flags: List[str] = []
195
+ # t = text or ""
196
+
197
+ # # Rule-based first
198
+ # for pat in PHRASE_PATTERNS:
199
+ # for m in pat.finditer(t):
200
+ # snippet = t[max(0, m.start() - 30) : m.end() + 30]
201
+ # flags.append(
202
+ # f"RuleFlag: pattern '{pat.pattern}' matched around: {snippet!r}"
203
+ # )
204
+
205
+ # # Optional tiny model
206
+ # if hf_pipe:
207
+ # try:
208
+ # pred = hf_pipe(t[:2000]) # keep it small
209
+ # # Expecting [{'label': 'LABEL_1'/'LABEL_0', 'score': 0.x}] or custom labels
210
+ # lbl = pred[0]["label"]
211
+ # score = float(pred[0]["score"])
212
+ # # Assume LABEL_1 = potential violation (adjust to your model labels)
213
+ # if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:
214
+ # flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
215
+ # except Exception as e:
216
+ # flags.append(f"MLFlag: inference error: {e}")
217
+
218
+ # return flags
219
+
220
  def fair_housing_flags(text: str) -> List[str]:
221
  flags: List[str] = []
222
  t = text or ""
223
 
224
+ # Rule-based first (category-aware)
225
+ for rule in PHRASE_RULES:
226
+ for m in rule.regex.finditer(t):
227
  snippet = t[max(0, m.start() - 30) : m.end() + 30]
228
+ sugg = f" | Suggest: {rule.suggests[0]}" if rule.suggests else ""
229
  flags.append(
230
+ f"RuleFlag[{rule.category}]: '{rule.regex.pattern}' around: {snippet!r}{sugg}"
231
  )
232
 
233
+ # Optional tiny model (unchanged)
234
  if hf_pipe:
235
  try:
236
+ pred = hf_pipe(t[:2000])
 
237
  lbl = pred[0]["label"]
238
  score = float(pred[0]["score"])
 
239
  if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:
240
  flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
241
  except Exception as e:
 
243
 
244
  return flags
245
 
246
+
247
+
248
  hf_pipe = None
249
  if USE_TINY_ML:
250
  try:
 
397
  "USE_TINY_ML": USE_TINY_ML,
398
  "HF_REPO": HF_REPO,
399
  "HF_THRESH": HF_THRESH,
400
+ "PhrasesLoaded": len(PHRASE_RULES),
401
  "PhrasesPath": str(PHRASES_PATH),
402
  "PhrasesError": PHRASES_ERROR,
403
  "OCR": pytesseract is not None,
404
+ # Optional extras:
405
+ "Categories": sorted({r.category for r in PHRASE_RULES}),
406
+ }
407
 
408
  }
409
 
phrases.yaml CHANGED
@@ -1,7 +1,3 @@
1
- # Expanded Fair Housing phrase patterns
2
- # Derived from the South Carolina Press Association 'Alphabetical List of Words/Phrases Connected with Advertisements for Housing'
3
- # Use as guidance only. Not legal advice.
4
-
5
  categories:
6
  Familial status:
7
  patterns:
 
 
 
 
 
1
  categories:
2
  Familial status:
3
  patterns: