tuklu
/

SASC

@@ -14,6 +14,16 @@ import sys
 import argparse
 import json
 # ── Argument parsing ────────────────────────────────────────────────────────
 parser = argparse.ArgumentParser(description="SASC Hate Speech Detector")
@@ -28,10 +38,14 @@ args = parser.parse_args()
 # ── Interactive prompts if args not provided ─────────────────────────────────
-def ask(prompt, default=None):
     suffix = f" [{default}]" if default else ""
-    val = input(f"{prompt}{suffix}: ").strip()
-    return val if val else default
 print("\n=== SASC Hate Speech Detector ===\n")
@@ -39,7 +53,7 @@ print("\n=== SASC Hate Speech Detector ===\n")
 # Model path
 model_path = args.model
 if not model_path:
-    model_path = ask("Model path (.h5)", "model.h5")
 if not os.path.exists(model_path):
     print(f"Model not found: {model_path}")
@@ -50,7 +64,7 @@ tokenizer_path = args.tokenizer
 if not tokenizer_path:
     # look next to model file first
     candidate = os.path.join(os.path.dirname(model_path), "tokenizer.json")
-    tokenizer_path = ask("Tokenizer path", candidate if os.path.exists(candidate) else "tokenizer.json")
 if not os.path.exists(tokenizer_path):
     print(f"Tokenizer not found: {tokenizer_path}")
@@ -65,16 +79,23 @@ if not args.threshold and not args.text and not args.input:
     except ValueError:
         threshold = 0.5
-print(f"\nLoading model from {model_path}...")
 import tensorflow as tf
-model = tf.keras.models.load_model(model_path)
-print(f"Loading tokenizer from {tokenizer_path}...")
 from tensorflow.keras.preprocessing.text import tokenizer_from_json
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 with open(tokenizer_path) as f:
     tokenizer = tokenizer_from_json(f.read())
 MAX_LEN = 100
 def predict(texts):
@@ -128,14 +149,14 @@ if not input_path:
         print(results.to_string(index=False))
         print("="*60)
-        out = args.output or ask("Save results to CSV? (leave blank to skip)", "")
         if out:
             results.to_csv(out, index=False)
             print(f"Saved to {out}")
         sys.exit(0)
     else:
-        input_path = ask("CSV file path")
 if not os.path.exists(input_path):
     print(f"File not found: {input_path}")
@@ -178,7 +199,7 @@ print(df[[text_col, "predicted_label", "confidence"]].head(10).to_string(index=F
 output_path = args.output
 if not output_path:
     default_out = input_path.replace(".csv", "_predictions.csv")
-    output_path = ask(f"\nSave full results to CSV", default_out)
 if output_path:
     df.to_csv(output_path, index=False)

 import argparse
 import json
+# Silence TensorFlow's C++ logging and turn off oneDNN custom ops (must be set before tensorflow is imported)
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+from prompt_toolkit import prompt
+from prompt_toolkit.completion import PathCompleter
+from prompt_toolkit.shortcuts import prompt as pt_prompt
+path_completer = PathCompleter(expanduser=True)
 # ── Argument parsing ────────────────────────────────────────────────────────
 parser = argparse.ArgumentParser(description="SASC Hate Speech Detector")
 # ── Interactive prompts if args not provided ─────────────────────────────────
+def ask(message, default=None, is_path=False):  # prompt for one value; blank input falls back to `default`
     suffix = f" [{default}]" if default else ""  # the default is shown in brackets only when it is truthy
+    if is_path:
+        val = pt_prompt(f"{message}{suffix}: ", completer=path_completer).strip()  # prompt_toolkit prompt with path completion
+    else:
+        val = input(f"{message}{suffix}: ").strip()  # plain built-in prompt, no completion
+    val = val if val else default  # empty answer -> default (which may itself be None or "")
+    return os.path.expanduser(val) if val else val  # NOTE(review): "~" is expanded for every answer, not only when is_path is set
 print("\n=== SASC Hate Speech Detector ===\n")
 # Model path
 model_path = args.model
 if not model_path:
+    model_path = ask("Model path (.h5)", "model.h5", is_path=True)
 if not os.path.exists(model_path):
     print(f"Model not found: {model_path}")
 if not tokenizer_path:
     # look next to model file first
     candidate = os.path.join(os.path.dirname(model_path), "tokenizer.json")
+    tokenizer_path = ask("Tokenizer path", candidate if os.path.exists(candidate) else "tokenizer.json", is_path=True)
 if not os.path.exists(tokenizer_path):
     print(f"Tokenizer not found: {tokenizer_path}")
     except ValueError:
         threshold = 0.5
+print(f"\nLoading model from   {model_path}")
+print(f"Loading tokenizer from {tokenizer_path}")
+import warnings
+warnings.filterwarnings("ignore")
 import tensorflow as tf
+import logging
+tf.get_logger().setLevel(logging.ERROR)
+model = tf.keras.models.load_model(model_path, compile=False)
 from tensorflow.keras.preprocessing.text import tokenizer_from_json
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 with open(tokenizer_path) as f:
     tokenizer = tokenizer_from_json(f.read())
+print(f"Model loaded — vocab size: {len(tokenizer.word_index)}")
 MAX_LEN = 100
 def predict(texts):
         print(results.to_string(index=False))
         print("="*60)
+        out = args.output or ask("Save results to CSV? (leave blank to skip)", "", is_path=True)
         if out:
             results.to_csv(out, index=False)
             print(f"Saved to {out}")
         sys.exit(0)
     else:
+        input_path = ask("CSV file path", is_path=True)
 if not os.path.exists(input_path):
     print(f"File not found: {input_path}")
 output_path = args.output
 if not output_path:
     default_out = input_path.replace(".csv", "_predictions.csv")
+    output_path = ask(f"\nSave full results to CSV", default_out, is_path=True)
 if output_path:
     df.to_csv(output_path, index=False)

tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff