Spaces:

tlogandesigns
/

image-text-compliance

Sleeping

App Files Files Community

tlogandesigns committed on Aug 16, 2025

Commit

a2eff80

1 Parent(s): 5950325

categories patch

Browse files

Files changed (2) hide show

checker.py +105 -17
phrases.yaml +0 -4

checker.py CHANGED Viewed

@@ -102,24 +102,82 @@ except Exception:
     yaml = None
-PHRASE_PATTERNS = []
 PHRASES_ERROR = None
 if yaml:
     try:
         text = PHRASES_PATH.read_text(encoding="utf-8")
         data = yaml.safe_load(text) or {}
-        pats = data.get("patterns") or []
-        if not isinstance(pats, list):
-            raise ValueError("phrases.yaml must define a top-level 'patterns' list")
-        for i, rx in enumerate(pats, 1):
-            PHRASE_PATTERNS.append(re.compile(str(rx), re.IGNORECASE))
-    except FileNotFoundError as e:
         PHRASES_ERROR = f"phrases.yaml not found at {PHRASES_PATH}"
     except Exception as e:
         PHRASES_ERROR = f"phrases.yaml load/parse error: {e}"
 # Optional HF pipeline (disabled by default to keep CPU/lightweight)
 hf_pipe = None
 if USE_TINY_ML:
@@ -132,26 +190,52 @@ if USE_TINY_ML:
         hf_pipe = None
 def fair_housing_flags(text: str) -> List[str]:
     flags: List[str] = []
     t = text or ""
-    # Rule-based first
-    for pat in PHRASE_PATTERNS:
-        for m in pat.finditer(t):
             snippet = t[max(0, m.start() - 30) : m.end() + 30]
             flags.append(
-                f"RuleFlag: pattern '{pat.pattern}' matched around: {snippet!r}"
             )
-    # Optional tiny model
     if hf_pipe:
         try:
-            pred = hf_pipe(t[:2000])  # keep it small
-            # Expecting [{'label': 'LABEL_1'/'LABEL_0', 'score': 0.x}] or custom labels
             lbl = pred[0]["label"]
             score = float(pred[0]["score"])
-            # Assume LABEL_1 = potential violation (adjust to your model labels)
             if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:
                 flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
         except Exception as e:
@@ -159,6 +243,8 @@ def fair_housing_flags(text: str) -> List[str]:
     return flags
 hf_pipe = None
 if USE_TINY_ML:
     try:
@@ -311,11 +397,13 @@ def run_check(
             "USE_TINY_ML": USE_TINY_ML,
             "HF_REPO": HF_REPO,
             "HF_THRESH": HF_THRESH,
-            "PhrasesLoaded": len(PHRASE_PATTERNS),
             "PhrasesPath": str(PHRASES_PATH),
             "PhrasesError": PHRASES_ERROR,
             "OCR": pytesseract is not None,
-            }
     }

     yaml = None
+# PHRASE_PATTERNS = []
+# PHRASES_ERROR = None
+# if yaml:
+#     try:
+#         text = PHRASES_PATH.read_text(encoding="utf-8")
+#         data = yaml.safe_load(text) or {}
+#         pats = data.get("patterns") or []
+#         if not isinstance(pats, list):
+#             raise ValueError("phrases.yaml must define a top-level 'patterns' list")
+#         for i, rx in enumerate(pats, 1):
+#             PHRASE_PATTERNS.append(re.compile(str(rx), re.IGNORECASE))
+#     except FileNotFoundError as e:
+#         PHRASES_ERROR = f"phrases.yaml not found at {PHRASES_PATH}"
+#     except Exception as e:
+#         PHRASES_ERROR = f"phrases.yaml load/parse error: {e}"
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass
+class Rule:  # One phrase rule: a compiled pattern plus the category and suggestions it was loaded with.
+    regex: re.Pattern  # compiled with re.IGNORECASE by the phrases.yaml loader in this file
+    category: str  # category name from phrases.yaml, or "Uncategorized" for the flat `patterns` shape
+    suggests: list[str]  # strings from the category's `suggest` list; empty for the flat shape
+PHRASE_RULES: list[Rule] = []  # filled below, in the order patterns appear in phrases.yaml
 PHRASES_ERROR = None  # replaced by a message string if loading fails; stays None otherwise
+# Resolve phrases path relative to this file, but allow override
+BASE_DIR = Path(__file__).parent
+PHRASES_PATH = Path(os.getenv("PHRASES_PATH", BASE_DIR / "phrases.yaml"))  # PHRASES_PATH env var wins over the default
 if yaml:  # nothing is loaded when yaml is None (set in the import fallback above)
     try:
         text = PHRASES_PATH.read_text(encoding="utf-8")
         data = yaml.safe_load(text) or {}  # a falsy parse result (e.g. empty document) becomes {}
+        # Accept both shapes:
+        # 1) { patterns: [...] }
+        # 2) { categories: { <Category>: { patterns: [...], suggest: [...] } } }
+        if isinstance(data, dict) and "categories" in data:
+            cats = data["categories"] or {}
+            for cat_name, cfg in cats.items():
+                if not isinstance(cfg, dict):  # category entries that are not mappings are ignored
+                    continue
+                pats = cfg.get("patterns") or []
+                suggests = cfg.get("suggest") or []
+                for rx in pats:
+                    if isinstance(rx, str):  # non-string pattern entries are skipped without an error
+                        PHRASE_RULES.append(
+                            Rule(
+                                regex=re.compile(rx, re.IGNORECASE),
+                                category=str(cat_name),
+                                suggests=[str(s) for s in suggests if isinstance(s, str)],  # keep only string suggestions
+                            )
+                        )
+        else:
+            # Fallback to old shape
+            pats = data.get("patterns") or []  # a non-dict top level has no .get; that error lands in the generic handler below
+            for rx in pats:
+                if isinstance(rx, str):  # non-string pattern entries are skipped without an error
+                    PHRASE_RULES.append(
+                        Rule(
+                            regex=re.compile(rx, re.IGNORECASE),
+                            category="Uncategorized",
+                            suggests=[],
+                        )
+                    )
+    except FileNotFoundError:
         PHRASES_ERROR = f"phrases.yaml not found at {PHRASES_PATH}"
     except Exception as e:  # any other failure, including an invalid regex; rules appended before the failure stay in PHRASE_RULES
         PHRASES_ERROR = f"phrases.yaml load/parse error: {e}"
 # Optional HF pipeline (disabled by default to keep CPU/lightweight)
 hf_pipe = None
 if USE_TINY_ML:
         hf_pipe = None
+# def fair_housing_flags(text: str) -> List[str]:
+#     flags: List[str] = []
+#     t = text or ""
+#     # Rule-based first
+#     for pat in PHRASE_PATTERNS:
+#         for m in pat.finditer(t):
+#             snippet = t[max(0, m.start() - 30) : m.end() + 30]
+#             flags.append(
+#                 f"RuleFlag: pattern '{pat.pattern}' matched around: {snippet!r}"
+#             )
+#     # Optional tiny model
+#     if hf_pipe:
+#         try:
+#             pred = hf_pipe(t[:2000])  # keep it small
+#             # Expecting [{'label': 'LABEL_1'/'LABEL_0', 'score': 0.x}] or custom labels
+#             lbl = pred[0]["label"]
+#             score = float(pred[0]["score"])
+#             # Assume LABEL_1 = potential violation (adjust to your model labels)
+#             if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:
+#                 flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
+#         except Exception as e:
+#             flags.append(f"MLFlag: inference error: {e}")
+#     return flags
 def fair_housing_flags(text: str) -> List[str]:  # Build a list of flag strings: one per rule match, plus an optional ML flag.
     flags: List[str] = []
     t = text or ""  # None or empty input is scanned as an empty string
+    # Rule-based first (category-aware)
+    for rule in PHRASE_RULES:
+        for m in rule.regex.finditer(t):  # every match produces its own flag, so one rule can flag several times
             snippet = t[max(0, m.start() - 30) : m.end() + 30]  # the match plus up to 30 characters of context on each side
+            sugg = f" | Suggest: {rule.suggests[0]}" if rule.suggests else ""  # only the first suggestion is reported
             flags.append(
+                f"RuleFlag[{rule.category}]: '{rule.regex.pattern}' around: {snippet!r}{sugg}"
             )
+    # Optional tiny model (unchanged)
     if hf_pipe:  # skipped while hf_pipe is None (its module-level initial value)
         try:
+            pred = hf_pipe(t[:2000])  # only the first 2000 characters are passed to the model
             lbl = pred[0]["label"]  # only the first prediction is inspected
             score = float(pred[0]["score"])
             if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:  # flag only listed labels at or above the threshold
                 flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
         except Exception as e:  # NOTE(review): handler body is elided in this diff view; the commented-out copy above appends an "MLFlag: inference error" flag - confirm
     return flags
 hf_pipe = None
 if USE_TINY_ML:
     try:
             "USE_TINY_ML": USE_TINY_ML,
             "HF_REPO": HF_REPO,
             "HF_THRESH": HF_THRESH,
+            "PhrasesLoaded": len(PHRASE_RULES),
             "PhrasesPath": str(PHRASES_PATH),
             "PhrasesError": PHRASES_ERROR,
             "OCR": pytesseract is not None,
+            # Optional extras:
+            "Categories": sorted({r.category for r in PHRASE_RULES}),
+        }
     }

phrases.yaml CHANGED Viewed

@@ -1,7 +1,3 @@
-# Expanded Fair Housing phrase patterns
-# Derived from the South Carolina Press Association 'Alphabetical List of Words/Phrases Connected with Advertisements for Housing'
-# Use as guidance only. Not legal advice.
 categories:
   Familial status:
     patterns:

 categories:
   Familial status:
     patterns: