Jay Prajapati committed on
Commit
8f885c1
·
1 Parent(s): 6d9d1a1
Files changed (6) hide show
  1. .gitignore +174 -0
  2. app.py +39 -0
  3. requirements.txt +6 -0
  4. src/evaluator.py +10 -0
  5. src/extractor.py +24 -0
  6. src/model.py +80 -0
.gitignore ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
app.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from src.evaluator import website_analyzer, pdf_analyzer

# Two-tab Gradio UI: one tab scores a live website's content for safety,
# the other scores an uploaded PDF. Both return a bar-chart image.
with gr.Blocks(theme=gr.themes.Soft(), css="style.css") as demo:
    with gr.Tab("Website Analyzer"):
        with gr.Row():
            with gr.Column():
                url_box = gr.Textbox(label="Website URL")
            with gr.Column():
                url_result = gr.Image(label="Safety Score")
        run_site_btn = gr.Button("Analyze Website")
        run_site_btn.click(
            fn=website_analyzer,
            inputs=url_box,
            outputs=url_result,
        )

        # Clickable sample URLs for quick demoing.
        gr.Examples(
            examples=[
                "https://www.nytimes.com/",
                "https://www.bbc.com/news",
                "https://www.theguardian.com/international",
                "https://www.reuters.com",
                "https://www.aljazeera.com",
            ],
            inputs=url_box,
        )

    with gr.Tab("Document Analyzer"):
        with gr.Row():
            with gr.Column():
                pdf_box = gr.File(label="Upload PDF Document", file_types=[".pdf"])
            with gr.Column():
                pdf_result = gr.Image(label="Safety Score")
        run_doc_btn = gr.Button("Analyze Document")
        run_doc_btn.click(
            fn=pdf_analyzer,
            inputs=pdf_box,
            outputs=pdf_result,
        )

if __name__ == "__main__":
    demo.queue(max_size=10).launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
torch
gradio
PyPDF2
requests
Pillow
matplotlib
transformers
beautifulsoup4
src/evaluator.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.model import saftey_scorer
2
+ from src.extractor import get_text_from_pdf, get_text_from_website
3
+
4
def website_analyzer(website_url):
    """Fetch the text of *website_url* and return its safety-score chart."""
    return saftey_scorer(get_text_from_website(website_url))
7
+
8
def pdf_analyzer(file_path):
    """Extract the text of the PDF at *file_path* and return its safety-score chart.

    The parameter was previously (misleadingly) named ``website_url`` even
    though it receives a file path; Gradio invokes this callback positionally,
    so the rename does not affect the UI wiring.
    """
    text = get_text_from_pdf(file_path)
    return saftey_scorer(text)
src/extractor.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+
5
def get_text_from_website(url):
    """Fetch *url* and return the visible text of its main content elements.

    On a non-200 response an explanatory error string is returned instead of
    raising — callers score whatever text they receive, so that contract is
    kept unchanged.
    """
    # A timeout keeps a slow or unresponsive host from hanging the worker
    # indefinitely (requests has no default timeout).
    response = requests.get(url, timeout=15)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Headings, paragraphs and list items carry the readable page content.
        content_elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
        text = ' '.join([element.get_text(separator=' ', strip=True) for element in content_elements])
        return text
    else:
        return f"Failed to retrieve the webpage. Status code: {response.status_code}"
15
+
16
def get_text_from_pdf(file_path):
    """Return the concatenated extracted text of every page in the PDF."""
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Iterate the pages directly instead of indexing by page number.
        return ''.join(page.extract_text() for page in reader.pages)
src/model.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import torch
from PIL import Image
import matplotlib.pyplot as plt
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForCausalLM

# ShieldGemma-2B is used as a safety classifier: prompted with content plus a
# policy, it answers starting with 'Yes' or 'No' (see get_safety_score below).
# Both objects are loaded once at import time and shared by all requests.
tokenizer = AutoTokenizer.from_pretrained("google/shieldgemma-2b")

# device_map="auto" lets transformers place the weights on the best available
# device (GPU when present, otherwise CPU).
model = AutoModelForCausalLM.from_pretrained(
    "google/shieldgemma-2b",
    device_map="auto",
)
14
+
15
def get_safety_score(content, safety_policy):
    """Return the probability (0..1) that *content* violates *safety_policy*.

    The score is the softmax probability of the 'Yes' token versus the 'No'
    token at the final position of the model's logits.
    """
    # Truncation is by characters, not tokens — a rough bound on prompt size.
    max_chars = 1024
    content = content[:max_chars]

    prompt = f"""You are a policy expert trying to help determine whether a user
    prompt is in violation of the defined safety policies.

    <start_of_turn>
    Content: {content.strip()}
    <end_of_turn>

    Our safety principle is defined in the below:

    {safety_policy.strip()}

    Does the content violate the above principle? Your answer must start
    with 'Yes' or 'No'. And then walk through step by step to be sure we answer
    correctly.
    """

    # Move the inputs to wherever device_map="auto" placed the model instead
    # of hard-coding "cuda" — the old code crashed on CPU-only hosts.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits

    vocab = tokenizer.get_vocab()
    # Logits for the 'Yes' and 'No' tokens at the last position.
    selected_logits = logits[0, -1, [vocab['Yes'], vocab['No']]]

    probabilities = softmax(selected_logits, dim=0)

    # Index 0 is 'Yes' (violation), so this is P(violation).
    score = probabilities[0].item()
    return score
47
+
48
def saftey_scorer(text):
    """Score *text* against four safety policies and return a bar-chart image.

    Returns a PIL Image showing per-policy violation percentages.
    (The 'saftey' typo in the name is kept because callers import it.)
    """
    safety_policies = [
        """ "No Harassment in Content": The content retrieved from the provided URL or document shall not contain or seek to promote malicious, intimidating, bullying, or abusive content targeting an individual or group (e.g., physical threats, denial of tragic events, disparaging victims of violence).""",
        """ "No Violence in Content": The content retrieved from the provided URL or document shall not contain or promote any form of violence, harm, or threats against individuals or groups. This includes explicit depictions of violence, threats of physical harm, or any content that encourages violent acts.""",
        """ "No Misinformation in Content": The content retrieved from the provided URL or document shall not contain or spread false or misleading information. This includes intentional misrepresentation of facts, events, or data. This policy also covers the prevention of fake news, conspiracy theories, and any content that intentionally manipulates or distorts the truth.""",
        """ "No Adult Content in Content": The content retrieved from the provided URL or document shall not contain sexually explicit, pornographic, or any material intended to arouse sexual interest. This includes explicit descriptions, images, videos, or discussions of nudity or sexual acts that violate acceptable content guidelines."""
    ]

    keys = ["Harassment", "Violence", "Misinformation", "Adult Content"]

    # One model call per policy; convert each probability to a percentage.
    safety_scores = [
        round(get_safety_score(text, safety_policy) * 100, 2)
        for safety_policy in safety_policies
    ]

    # Draw on an explicit Figure/Axes rather than pyplot's implicit global
    # figure — the old code accumulated bars across calls and is unsafe when
    # Gradio's queue serves multiple requests.
    fig, ax = plt.subplots()
    ax.bar(keys, safety_scores)
    ax.set_ylabel('Score')
    ax.set_title('Safety Scores')

    # Add the scores on top of the bars
    for i, score in enumerate(safety_scores):
        ax.text(i, score + 0.01, str(score) + '%', ha='center')

    # Render to an in-memory PNG and hand it back as a PIL image.
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    plt.close(fig)

    image = Image.open(buf)
    return image