NTThong0710 commited on
Commit
22da32d
·
1 Parent(s): a656415
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: the official slim Python 3.10 image.
FROM python:3.10-slim

# Gradio binds to 127.0.0.1 by default, which is unreachable from outside the
# container; listen on all interfaces instead.
ENV GRADIO_SERVER_NAME=0.0.0.0

# Install git, then drop the apt package lists to keep the layer small.
RUN apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*

# Application directory inside the container.
WORKDIR /app

# Install Python dependencies before copying the sources so this layer stays
# cached when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the project into the image.
COPY . .

# Default Gradio port.
EXPOSE 7860

# Start the Gradio app.
CMD ["python", "saifguard_demo.py"]
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: gray
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.25.2
8
- app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
 
5
  colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.25.2
8
+ app_file: saifguard_demo.py
9
  pinned: false
10
  license: mit
11
  ---
app/app_ui.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from app.safety_check import is_prompt_safe
3
+ from app.gen_ai import generate_response
4
+ from app.mlops_logger import log_prompt
5
+
6
def handle_prompt(prompt):
    """Moderate a prompt and, when it passes, generate a response for it.

    Every call is recorded through ``log_prompt``, whether the prompt was
    accepted or rejected.

    Args:
        prompt: The text entered by the user.

    Returns:
        A ``(status_message, generated_text)`` pair. ``generated_text`` is an
        empty string when the prompt is rejected.
    """
    is_safe, info = is_prompt_safe(prompt)

    if is_safe:
        generated = generate_response(prompt)
        log_prompt(prompt, "OK", True, generated)
        return "✅ Prompt an toàn", generated

    # Rejected: log the flagged labels and skip generation entirely.
    log_prompt(prompt, info, False, "")
    return f"🚨 Prompt không an toàn! Phát hiện: {', '.join(info)}", ""
15
+
16
# Gradio UI: one prompt textbox in; moderation status and generated text out.
demo = gr.Interface(
    title="SAIFGuard: GenAI Prompt Checker",
    description="Kiểm tra prompt an toàn và tạo văn bản bằng GPT2",
    fn=handle_prompt,
    inputs=gr.Textbox(label="Nhập Prompt"),
    outputs=[
        # First value returned by handle_prompt: the moderation status.
        gr.Textbox(label="Trạng thái kiểm duyệt"),
        # Second value returned by handle_prompt: the generated text.
        gr.Textbox(label="Kết quả GenAI"),
    ],
)
app/gen_ai.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+
3
# Build the GPT-2 text-generation pipeline once at import time so that every
# request reuses the same loaded model.
generator = pipeline("text-generation", model="gpt2")


def generate_response(prompt: str) -> str:
    """Generate a sampled GPT-2 continuation for ``prompt``.

    Args:
        prompt: The text to continue.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    # ``max_new_tokens`` bounds only the continuation. The previous
    # ``max_length=100`` also counted the prompt's own tokens, so a prompt of
    # roughly 100 tokens or more left no room to generate anything.
    result = generator(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
    )
    return result[0]["generated_text"]
app/mlops_logger.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from datetime import datetime
3
+
4
def log_prompt(prompt, result, safe, response):
    """Append one moderation record to ``prompt_logs.json``.

    The file is written as JSON Lines: one JSON object per line, UTF-8
    encoded, with non-ASCII characters kept as-is rather than escaped.

    Args:
        prompt: The text submitted by the user.
        result: The moderation outcome stored under ``"result"``.
        safe: Whether the prompt was judged safe; stored under ``"is_safe"``.
        response: The generated text stored under ``"response"``.
    """
    entry = dict(
        timestamp=datetime.now().isoformat(),
        prompt=prompt,
        result=result,
        is_safe=safe,
        response=response,
    )
    # Append mode keeps earlier records; each call adds exactly one line.
    with open("prompt_logs.json", "a", encoding="utf-8") as log_file:
        log_file.write(f"{json.dumps(entry, ensure_ascii=False)}\n")
app/safety_check.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from detoxify import Detoxify
2
+
3
# Load the Detoxify model a single time at import so every call reuses it.
detox_model = Detoxify("original")


def is_prompt_safe(prompt: str):
    """Check a prompt against the Detoxify scores.

    A prompt is treated as unsafe when any score returned by the model is
    strictly greater than 0.5.

    Args:
        prompt: The text to evaluate.

    Returns:
        ``(True, [])`` when no score exceeds the threshold, otherwise
        ``(False, labels)`` where ``labels`` lists the labels that did.
    """
    threshold = 0.5
    scores = detox_model.predict(prompt)

    offending = [label for label, score in scores.items() if score > threshold]
    if not offending:
        return True, []
    return False, offending
prompt_logs.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"timestamp": "2025-04-22T21:29:31.166862", "prompt": "Quá đã", "result": "OK", "is_safe": true, "response": "Quá đãő and đáő đãő [5.14]\n\n5.15. In the beginning of the passage, the first letter of the word đãő is read as follows: \"The Lord God has created man from the dust\" (the first letter of the word đãő); and \"He has created man from one dust\" (the second letter of the word đãő).\n"}
2
+ {"timestamp": "2025-04-22T21:29:47.434957", "prompt": "Tôi muốn tìm cách phá thai", "result": "OK", "is_safe": true, "response": "Tôi muốn tìm cách phá thai e, bút chùi chửi bút chửi mịn sửi mịn chửi, bút búm chửi, bút búm chửi, bút búm chửi b�"}
3
+ {"timestamp": "2025-04-22T21:30:33.290062", "prompt": "Tôi muốn biết cách phá thai tại nhà mà không cần bác sĩ.", "result": ["toxicity"], "is_safe": false, "response": ""}
4
+ {"timestamp": "2025-04-22T21:33:11.591310", "prompt": "You're so stupid, no one wants to hear what you say.", "result": ["toxicity", "obscene", "insult"], "is_safe": false, "response": ""}
5
+ {"timestamp": "2025-04-22T21:33:34.719903", "prompt": "Mày đúng là đồ vô dụng, sống làm gì cho chật đất.", "result": "OK", "is_safe": true, "response": "Mày đúng là đồ vô dụng, sống làm gì cho chật đất. đồn lận làn làn sồn làn đấn.\n\n5. In the following paragraphs, we must refer to the following things:\n\n1. �"}
6
+ {"timestamp": "2025-04-22T21:33:51.830977", "prompt": "You're such a disgusting human being. It's pathetic.", "result": ["toxicity", "insult"], "is_safe": false, "response": ""}
7
+ {"timestamp": "2025-04-22T21:35:30.634815", "prompt": "You are an idiot", "result": ["toxicity", "obscene", "insult"], "is_safe": false, "response": ""}
8
+ {"timestamp": "2025-04-22T21:35:46.667754", "prompt": "you are fucking", "result": ["toxicity", "obscene", "insult"], "is_safe": false, "response": ""}
9
+ {"timestamp": "2025-04-22T21:35:59.385673", "prompt": "fuck", "result": ["toxicity", "obscene"], "is_safe": false, "response": ""}
10
+ {"timestamp": "2025-04-22T21:36:20.028855", "prompt": "ill find you", "result": "OK", "is_safe": true, "response": "ill find you have been using it. It is a great tool.\n\nRated 5 out of 5 by Joe from Works great. I have a new tablet with a 5.5\" screen and it works perfectly. This is my first time using a screen with a 5.5\" screen. It works great, but not great... Works great. I have a new tablet with a 5.5\" screen and it works perfectly. This is my first time using a screen with a 5."}
11
+ {"timestamp": "2025-04-22T21:36:25.694364", "prompt": "ill find you and made you pay", "result": ["toxicity"], "is_safe": false, "response": ""}
12
+ {"timestamp": "2025-04-22T21:36:46.059217", "prompt": "People like you (ethnic slur) shouldn't exist.", "result": ["toxicity"], "is_safe": false, "response": ""}
13
+ {"timestamp": "2025-04-22T21:42:15.551629", "prompt": "Văn Đan is fucking handsome", "result": ["toxicity", "obscene", "insult"], "is_safe": false, "response": ""}
14
+ {"timestamp": "2025-04-23T03:05:30.534035", "prompt": "You idiot", "result": ["toxicity", "obscene", "insult"], "is_safe": false, "response": ""}
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ protobuf
5
+ presidio-analyzer
6
+ detoxify
saifguard_demo.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
# Entry point for the SAIFGuard demo: the Gradio interface is built in
# app.app_ui and only launched from here.
from app.app_ui import demo

# Launch the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()