Spaces:
Build error
Build error
talexm
committed on
Commit
·
595bead
1
Parent(s):
9a25cef
adding model for sec query
Browse files- .gitignore +4 -0
- rag_sec/rag_chagu_demo.py +14 -12
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rag_sec/__pycache*
|
| 2 |
+
|
| 3 |
+
rag_sec/__pycache__/rag_chagu_demo.*
|
| 4 |
+
|
rag_sec/rag_chagu_demo.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
from difflib import get_close_matches
|
|
|
|
| 4 |
|
| 5 |
class DocumentSearcher:
|
| 6 |
def __init__(self):
|
| 7 |
self.documents = []
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
def load_imdb_data(self):
|
| 11 |
-
|
| 12 |
-
home_dir = Path(os.getenv("HOME", "/")) # Fallback to root if HOME is not set
|
| 13 |
data_dir = home_dir / "data-sets/aclImdb/train"
|
| 14 |
-
|
| 15 |
pos_dir = data_dir / "pos"
|
| 16 |
neg_dir = data_dir / "neg"
|
| 17 |
|
|
@@ -23,12 +23,10 @@ class DocumentSearcher:
|
|
| 23 |
if not neg_dir.exists() or not any(neg_dir.iterdir()):
|
| 24 |
print("No negative reviews found.")
|
| 25 |
|
| 26 |
-
# Load positive reviews
|
| 27 |
for filename in pos_dir.iterdir():
|
| 28 |
with open(filename, "r", encoding="utf-8") as file:
|
| 29 |
self.documents.append(file.read())
|
| 30 |
|
| 31 |
-
# Load negative reviews
|
| 32 |
for filename in neg_dir.iterdir():
|
| 33 |
with open(filename, "r", encoding="utf-8") as file:
|
| 34 |
self.documents.append(file.read())
|
|
@@ -44,7 +42,6 @@ class DocumentSearcher:
|
|
| 44 |
print("No .txt files directory found.")
|
| 45 |
return
|
| 46 |
|
| 47 |
-
# Load all .txt files
|
| 48 |
for filename in txt_dir.glob("*.txt"):
|
| 49 |
with open(filename, "r", encoding="utf-8") as file:
|
| 50 |
self.documents.append(file.read())
|
|
@@ -52,15 +49,20 @@ class DocumentSearcher:
|
|
| 52 |
print(f"Loaded additional {len(self.documents)} documents from .txt files.")
|
| 53 |
|
| 54 |
def is_query_malicious(self, query):
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
return False
|
| 60 |
|
| 61 |
def search_documents(self, query):
|
| 62 |
if self.is_query_malicious(query):
|
| 63 |
-
return [{"document": "ANOMALY: Query blocked due to detected malicious
|
| 64 |
|
| 65 |
# Use fuzzy matching for normal queries
|
| 66 |
matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
|
|
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
from difflib import get_close_matches
|
| 4 |
+
from transformers import pipeline
|
| 5 |
|
| 6 |
class DocumentSearcher:
|
| 7 |
def __init__(self):
    """Initialize an empty corpus and the query-screening model."""
    # Corpus searched by search_documents(); filled by the load_* helpers.
    self.documents = []
    # Pre-trained SST-2 sentiment classifier, repurposed by
    # is_query_malicious() as a malicious-intent screen.
    # NOTE(review): sentiment is only a proxy for malice — confirm this
    # heuristic is intended.
    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    self.malicious_detector = pipeline("sentiment-analysis", model=model_name)
|
| 11 |
|
| 12 |
def load_imdb_data(self):
|
| 13 |
+
home_dir = Path(os.getenv("HOME", "/"))
|
|
|
|
| 14 |
data_dir = home_dir / "data-sets/aclImdb/train"
|
|
|
|
| 15 |
pos_dir = data_dir / "pos"
|
| 16 |
neg_dir = data_dir / "neg"
|
| 17 |
|
|
|
|
| 23 |
if not neg_dir.exists() or not any(neg_dir.iterdir()):
|
| 24 |
print("No negative reviews found.")
|
| 25 |
|
|
|
|
| 26 |
for filename in pos_dir.iterdir():
|
| 27 |
with open(filename, "r", encoding="utf-8") as file:
|
| 28 |
self.documents.append(file.read())
|
| 29 |
|
|
|
|
| 30 |
for filename in neg_dir.iterdir():
|
| 31 |
with open(filename, "r", encoding="utf-8") as file:
|
| 32 |
self.documents.append(file.read())
|
|
|
|
| 42 |
print("No .txt files directory found.")
|
| 43 |
return
|
| 44 |
|
|
|
|
| 45 |
for filename in txt_dir.glob("*.txt"):
|
| 46 |
with open(filename, "r", encoding="utf-8") as file:
|
| 47 |
self.documents.append(file.read())
|
|
|
|
| 49 |
print(f"Loaded additional {len(self.documents)} documents from .txt files.")
|
| 50 |
|
| 51 |
def is_query_malicious(self, query):
|
| 52 |
+
# Use the pre-trained model to check if the query has malicious intent
|
| 53 |
+
result = self.malicious_detector(query)[0]
|
| 54 |
+
label = result['label']
|
| 55 |
+
score = result['score']
|
| 56 |
+
|
| 57 |
+
# Consider the query malicious if the sentiment is negative with high confidence
|
| 58 |
+
if label == "NEGATIVE" and score > 0.8:
|
| 59 |
+
print(f"Warning: Malicious query detected - Confidence: {score:.4f}")
|
| 60 |
+
return True
|
| 61 |
return False
|
| 62 |
|
| 63 |
def search_documents(self, query):
|
| 64 |
if self.is_query_malicious(query):
|
| 65 |
+
return [{"document": "ANOMALY: Query blocked due to detected malicious intent.", "similarity": 0.0}]
|
| 66 |
|
| 67 |
# Use fuzzy matching for normal queries
|
| 68 |
matches = get_close_matches(query, self.documents, n=5, cutoff=0.3)
|