Spaces:

KayO
/

WebsiteImageSafetyAnalyzer

Runtime error

App Files Files Community

KayO committed on Nov 4, 2022

Commit

28ff501

1 Parent(s): 983a157

Mark 1

Browse files

Files changed (5) hide show

README.md +2 -2
app.py +71 -0
nsfw_model.pkl +3 -0
requirements.txt +2 -0
test.ipynb +162 -0

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
-title: WebsiteImageSafetyAnalyzer
-emoji: 📈
 colorFrom: blue
 colorTo: yellow
 sdk: gradio

 ---
+title: Website Image Safety Analyzer
+emoji: 🧐
 colorFrom: blue
 colorTo: yellow
 sdk: gradio

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from fastai.vision.all import *
+import gradio as gr
+import requests
+import base64
+from bs4 import BeautifulSoup
+import os
+# Load the trained model
+# The learner is read once, at import time, from `nsfw_model.pkl` in the
+# working directory.
+# NOTE(review): fastai's `load_learner` unpickles the file, so it should only
+# ever be pointed at a trusted model file.
+learn = load_learner('nsfw_model.pkl')
+# Vocabulary of the model's DataLoaders (presumably the class labels it can
+# predict). Not referenced again in the visible part of this file.
+labels = learn.dls.vocab
+def analyze(url):
+    """Analyzer function that classifies the images found at the given URL"""
+    # Make sure URL starts with http or https
+    # TODO: confirm that the url points to a web page, and not some resource.
+    # Regex could be useful here
+    if not url.startswith(('http://','https://')):
+        url = 'http://'+url
+    safety = 'safe' # our return variable
+    # Extract html and all img tags
+    html = requests.get(url)
+    soup = BeautifulSoup(html.text, "html.parser")
+    img_elements = soup.find_all("img")
+    # Save all src urls that we can clearly tell are img urls.
+    # A better approach would be to use regex here
+    srcs = []
+    for img in img_elements:
+        for v in img.attrs.values():
+            if isinstance(v, str):
+                if v.lower().endswith(('jpg', 'png', 'gif', 'jpeg')):
+                    srcs.append(v)
+    # Get the images from the urls and classify
+    # If there is a single unsafe image, report it.
+    for src_url in srcs:
+        try:
+            img_data = requests.get(src_url).content
+            temp = 'temp.' + src_url.lower().split('.')[-1]
+            with open(temp, 'wb') as handler:
+                handler.write(img_data)
+            is_nsfw,_,probs = learn.predict(PILImage.create(temp))
+            os.remove(temp)
+            if is_nsfw == "unsafe_searches":
+                safety = 'NOT safe'
+                return safety
+        except Exception as e:
+            pass
+    return safety
+title = "Website Safety Analyzer"
+description = "**The internet is not safe for children**. Even if we know the 'bad' sites, social media is hard to regulate.  \n"+\
+                "This is step one in an attempt to solve that. An image classifier that audits every image at a URL.  \n"+\
+                "In this iteration, I classify sites with sexually explicit content as **'NOT safe'**.  \n\n"+\
+                "There is a long way to go with NLP for profanity, cyber-bullying, as well as CV for violence, substance abuse, etc.  \n"+\
+                "I welcome any help on this. 🙂"
+examples = ['porhub.com', 'cnn.com', 'xvideos.com', 'www.pinterest.com']
+enable_queue=True
+iface = gr.Interface(
+    fn=analyze,
+    inputs="text",
+    outputs="text",
+    title=title,
+    description=description,
+    examples=examples,
+)
+iface.launch(enable_queue=enable_queue)

nsfw_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:018578406ed833284ff69a8198f71c4c71ce537afb0861a602f2240bd3cb3110
+size 46972399

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ fastai
2	+ beautifulsoup4

test.ipynb ADDED Viewed

	@@ -0,0 +1,162 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from fastai.vision.all import *\n",
+    "import gradio as gr\n",
+    "import requests\n",
+    "import base64\n",
+    "from bs4 import BeautifulSoup\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the trained model\n",
+    "learn = load_learner('nsfw_model.pkl')\n",
+    "labels = learn.dls.vocab\n",
+    "\n",
+    "def analyze(url):\n",
+    "    \"\"\"Analyzer function that classifies the images found at the given URL\"\"\"\n",
+    "    \n",
+    "    # Make sure URL starts with http or https\n",
+    "    # TODO: confirm that the url points to a web page, and not some resource.\n",
+    "    # Regex could be useful here\n",
+    "    if not url.startswith(('http://','https://')):\n",
+    "        url = 'http://'+url\n",
+    "    \n",
+    "    safety = 'safe' # our return variable\n",
+    "\n",
+    "    # Extract html and all img tags\n",
+    "    html = requests.get(url)\n",
+    "    soup = BeautifulSoup(html.text, \"html.parser\")\n",
+    "    img_elements = soup.find_all(\"img\")\n",
+    "\n",
+    "    # Save all src urls that we can clearly tell are img urls.\n",
+    "    # A better approach would be to use regex here\n",
+    "    srcs = []\n",
+    "    for img in img_elements:\n",
+    "        for v in img.attrs.values():\n",
+    "            if isinstance(v, str):\n",
+    "                if v.lower().endswith(('jpg', 'png', 'gif', 'jpeg')):\n",
+    "                    srcs.append(v)\n",
+    "    \n",
+    "    # Get the images from the urls and classify\n",
+    "    # If there is a single unsafe image, report it.\n",
+    "    for src_url in srcs:\n",
+    "        try:\n",
+    "            img_data = requests.get(src_url).content\n",
+    "            temp = 'temp.' + src_url.lower().split('.')[-1]\n",
+    "            with open(temp, 'wb') as handler:\n",
+    "                handler.write(img_data)\n",
+    "            is_nsfw,_,probs = learn.predict(PILImage.create(temp))\n",
+    "            os.remove(temp) \n",
+    "            if is_nsfw == \"unsafe_searches\":\n",
+    "                safety = 'NOT safe'\n",
+    "                return safety\n",
+    "        except Exception as e:\n",
+    "            pass\n",
+    "    return safety"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7867\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7867/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(<gradio.routes.App at 0x7f0da61cb1f0>, 'http://127.0.0.1:7867/', None)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "title = \"Website Safety Analyzer\"\n",
+    "description = \"**The internet is not safe for children**. Even if we know the 'bad' sites, social media is hard to regulate.  \\n\"+\\\n",
+    "                \"This is step one in an attempt to solve that. An image classifier that audits every image at a URL.  \\n\"+\\\n",
+    "                \"In this iteration, I classify sites with sexually explicit content as **'NOT safe'**.  \\n\\n\"+\\\n",
+    "                \"There is a long way to go with NLP for profanity, cyber-bullying, as well as CV for violence, substance abuse, etc.  \\n\"+\\\n",
+    "                \"I welcome any help on this. 🙂\"\n",
+    "examples = ['porhub.com', 'cnn.com', 'xvideos.com', 'www.pinterest.com']\n",
+    "enable_queue=True\n",
+    "\n",
+    "iface = gr.Interface(\n",
+    "    fn=analyze, \n",
+    "    inputs=\"text\", \n",
+    "    outputs=\"text\",\n",
+    "    title=title,\n",
+    "    description=description,\n",
+    "    examples=examples,\n",
+    ")\n",
+    "iface.launch(enable_queue=enable_queue)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "ed0e91aaffcefde6eb9bcd4f55fe7652d77471dc031ce772257aa5eb4a54e8f2"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}