Spaces:
Runtime error
Runtime error
Jay Prajapati committed on
Commit ·
8f885c1
1
Parent(s): 6d9d1a1
v1.0.0
Browse files- .gitignore +174 -0
- app.py +39 -0
- requirements.txt +6 -0
- src/evaluator.py +10 -0
- src/extractor.py +24 -0
- src/model.py +80 -0
.gitignore
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py,cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
|
| 110 |
+
# pdm
|
| 111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 112 |
+
#pdm.lock
|
| 113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
| 114 |
+
# in version control.
|
| 115 |
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
| 116 |
+
.pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 121 |
+
__pypackages__/
|
| 122 |
+
|
| 123 |
+
# Celery stuff
|
| 124 |
+
celerybeat-schedule
|
| 125 |
+
celerybeat.pid
|
| 126 |
+
|
| 127 |
+
# SageMath parsed files
|
| 128 |
+
*.sage.py
|
| 129 |
+
|
| 130 |
+
# Environments
|
| 131 |
+
.env
|
| 132 |
+
.venv
|
| 133 |
+
env/
|
| 134 |
+
venv/
|
| 135 |
+
ENV/
|
| 136 |
+
env.bak/
|
| 137 |
+
venv.bak/
|
| 138 |
+
|
| 139 |
+
# Spyder project settings
|
| 140 |
+
.spyderproject
|
| 141 |
+
.spyproject
|
| 142 |
+
|
| 143 |
+
# Rope project settings
|
| 144 |
+
.ropeproject
|
| 145 |
+
|
| 146 |
+
# mkdocs documentation
|
| 147 |
+
/site
|
| 148 |
+
|
| 149 |
+
# mypy
|
| 150 |
+
.mypy_cache/
|
| 151 |
+
.dmypy.json
|
| 152 |
+
dmypy.json
|
| 153 |
+
|
| 154 |
+
# Pyre type checker
|
| 155 |
+
.pyre/
|
| 156 |
+
|
| 157 |
+
# pytype static type analyzer
|
| 158 |
+
.pytype/
|
| 159 |
+
|
| 160 |
+
# Cython debug symbols
|
| 161 |
+
cython_debug/
|
| 162 |
+
|
| 163 |
+
# PyCharm
|
| 164 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 165 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 166 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 167 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 168 |
+
#.idea/
|
| 169 |
+
|
| 170 |
+
# Ruff stuff:
|
| 171 |
+
.ruff_cache/
|
| 172 |
+
|
| 173 |
+
# PyPI configuration file
|
| 174 |
+
.pypirc
|
app.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from src.evaluator import website_analyzer, pdf_analyzer
|
| 3 |
+
|
| 4 |
+
# Gradio UI with two tabs: one scores a web page given its URL, the other
# scores an uploaded PDF. Both tabs display the result in an Image component.
# NOTE(review): the block nesting below was reconstructed from a flattened
# diff view — confirm the layout against the real app.py.
# NOTE(review): "style.css" is not among the files added in this commit —
# confirm it exists in the Space.
with gr.Blocks(theme=gr.themes.Soft(), css="style.css") as demo:
    with gr.Tab("Website Analyzer"):
        with gr.Row():
            with gr.Column():
                website_input = gr.Textbox(label="Website URL")
            with gr.Column():
                website_output = gr.Image(label="Safety Score")
        analyze_website_btn = gr.Button("Analyze Website")
        # Clicking the button runs website_analyzer on the URL textbox and
        # shows whatever it returns in the image output.
        analyze_website_btn.click(
            fn=website_analyzer,
            inputs=website_input,
            outputs=website_output,
        )

        # Sample URLs offered as examples for the URL textbox.
        examples = ["https://www.nytimes.com/", "https://www.bbc.com/news", "https://www.theguardian.com/international", "https://www.reuters.com", "https://www.aljazeera.com"]

        gr.Examples(
            examples=examples,
            inputs=website_input,
        )

    with gr.Tab("Document Analyzer"):
        with gr.Row():
            with gr.Column():
                # Upload widget restricted to .pdf files.
                document_input = gr.File(label="Upload PDF Document", file_types=[".pdf"])
            with gr.Column():
                document_output = gr.Image(label="Safety Score")
        analyze_document_btn = gr.Button("Analyze Document")
        # Clicking the button runs pdf_analyzer on the uploaded file and
        # shows whatever it returns in the image output.
        analyze_document_btn.click(
            fn=pdf_analyzer,
            inputs=document_input,
            outputs=document_output,
        )

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    # Enable the request queue (max_size=10) and launch with debug=True.
    demo.queue(max_size=10).launch(debug=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
gradio
|
| 3 |
+
pyPDF2
|
| 4 |
+
matplotlib
|
| 5 |
+
transformers
|
| 6 |
+
beautifulsoup4
|
src/evaluator.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.model import saftey_scorer
|
| 2 |
+
from src.extractor import get_text_from_pdf, get_text_from_website
|
| 3 |
+
|
| 4 |
+
def website_analyzer(website_url):
    """Extract the text of ``website_url`` and return its safety-score result.

    The text produced by ``get_text_from_website`` is passed unchanged to
    ``saftey_scorer``; whatever that function returns is returned here.
    """
    return saftey_scorer(get_text_from_website(website_url))
|
| 7 |
+
|
| 8 |
+
def pdf_analyzer(website_url):
    """Extract the text of a PDF and return its safety-score result.

    Despite its name, ``website_url`` is handed straight to
    ``get_text_from_pdf``, so it must be whatever that function accepts as a
    PDF location. The extracted text is passed unchanged to ``saftey_scorer``
    and that function's result is returned.
    """
    return saftey_scorer(get_text_from_pdf(website_url))
|
src/extractor.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import PyPDF2
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
def get_text_from_website(url, timeout=10):
    """Download ``url`` and return the text of its main content elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout an unresponsive host would block the caller
            indefinitely.

    Returns:
        The text of every ``p``, ``h1``, ``h2``, ``h3`` and ``li`` element,
        joined with single spaces, or a failure message containing the HTTP
        status code when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, malformed URL, ...).
    """
    response = requests.get(url, timeout=timeout)

    # Guard clause: any status other than 200 is reported as a message string.
    if response.status_code != 200:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
    return ' '.join(
        element.get_text(separator=' ', strip=True) for element in content_elements
    )
|
| 15 |
+
|
| 16 |
+
def get_text_from_pdf(file_path):
    """Return the extracted text of every page of a PDF, concatenated.

    ``file_path`` is opened in binary mode and parsed with
    ``PyPDF2.PdfReader``; the per-page texts are joined in page order with
    no separator between them.
    """
    with open(file_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        extracted = ''.join(page.extract_text() for page in reader.pages)

    return extracted
|
src/model.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import torch
|
| 3 |
+
from PIL import Image
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
from torch.nn.functional import softmax
|
| 6 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 7 |
+
|
| 8 |
+
# Tokenizer and model are created at module level, so importing this module
# triggers loading of the "google/shieldgemma-2b" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/shieldgemma-2b")

# device_map="auto" leaves device placement to the library rather than
# pinning the model to a specific device.
# NOTE(review): transformers needs the `accelerate` package for `device_map`,
# and it is not listed in requirements.txt — confirm it is installed.
model = AutoModelForCausalLM.from_pretrained(
    "google/shieldgemma-2b",
    device_map="auto",
)
|
| 14 |
+
|
| 15 |
+
def get_safety_score(content, safety_policy):
    """Return the probability that ``content`` violates ``safety_policy``.

    A prompt embedding the content and the policy is run through the
    module-level ``model``. The logits of the final position are read for
    the ``Yes`` and ``No`` vocabulary entries, soft-maxed against each other,
    and the ``Yes`` share is returned.

    Args:
        content: Text to evaluate. Only its first 1024 characters are used.
        safety_policy: Policy description inserted into the prompt.

    Returns:
        A float in [0, 1]: the softmax weight of ``Yes`` over ``No``.
    """
    max_length = 1024
    # Character-level cap (not a token count) to bound the prompt size.
    content = content[:max_length]

    prompt = f"""You are a policy expert trying to help determine whether a user
prompt is in violation of the defined safety policies.

<start_of_turn>
Content: {content.strip()}
<end_of_turn>

Our safety principle is defined in the below:

{safety_policy.strip()}

Does the content violate the above principle? Your answer must start
with 'Yes' or 'No'. And then walk through step by step to be sure we answer
correctly.
"""

    # Send the inputs to the device the model actually lives on. The model is
    # loaded with device_map="auto", so hard-coding "cuda" fails on hosts
    # without a GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits

    vocab = tokenizer.get_vocab()
    # First batch item, last position, restricted to the two answer tokens.
    selected_logits = logits[0, -1, [vocab['Yes'], vocab['No']]]

    probabilities = softmax(selected_logits, dim=0)

    # Index 0 corresponds to 'Yes', i.e. "the content violates the policy".
    return probabilities[0].item()
|
| 47 |
+
|
| 48 |
+
def saftey_scorer(text):
    """Score ``text`` against four safety policies and return a bar chart.

    Each policy (harassment, violence, misinformation, adult content) is
    evaluated with ``get_safety_score``; the scores are converted to
    percentages rounded to two decimals and drawn as a labelled bar chart.

    Args:
        text: Content to evaluate.

    Returns:
        A ``PIL.Image.Image`` opened from the PNG rendering of the chart.
    """
    safety_policies = [
        """ "No Harassment in Content": The content retrieved from the provided URL or document shall not contain or seek to promote malicious, intimidating, bullying, or abusive content targeting an individual or group (e.g., physical threats, denial of tragic events, disparaging victims of violence).""",
        """ "No Violence in Content": The content retrieved from the provided URL or document shall not contain or promote any form of violence, harm, or threats against individuals or groups. This includes explicit depictions of violence, threats of physical harm, or any content that encourages violent acts.""",
        """ "No Misinformation in Content": The content retrieved from the provided URL or document shall not contain or spread false or misleading information. This includes intentional misrepresentation of facts, events, or data. This policy also covers the prevention of fake news, conspiracy theories, and any content that intentionally manipulates or distorts the truth.""",
        """ "No Adult Content in Content": The content retrieved from the provided URL or document shall not contain sexually explicit, pornographic, or any material intended to arouse sexual interest. This includes explicit descriptions, images, videos, or discussions of nudity or sexual acts that violate acceptable content guidelines."""
    ]

    # Bar labels, in the same order as ``safety_policies``.
    keys = ["Harassment", "Violence", "Misinformation", "Adult Content"]

    safety_scores = [
        round(get_safety_score(text, safety_policy) * 100, 2)
        for safety_policy in safety_policies
    ]

    # Draw on an explicit figure instead of pyplot's implicit "current"
    # figure, so a failure cannot leave a half-drawn figure that later calls
    # would draw on, and concurrent requests do not share one canvas.
    fig, ax = plt.subplots()
    try:
        ax.bar(keys, safety_scores)
        ax.set_ylabel('Score')
        ax.set_title('Safety Scores')

        # Add the scores on top of the bars
        for i, score in enumerate(safety_scores):
            ax.text(i, score + 0.01, str(score) + '%', ha='center')

        # Render the plot into an in-memory PNG buffer
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)
|