File size: 4,856 Bytes

a3abb69

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d5c4b5c2-8c0a-4cbe-8997-1a98c14be2e4",
   "metadata": {},
   "source": [
    "A text classification model built with NLTK and scikit-learn. It includes some PII within the code (e.g., hard-coded email addresses or phone numbers for testing purposes) and a few API tokens/secrets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f95fa380-34d0-455d-8002-ebe5f829542c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vulnerable libraries (pinned versions left unchanged)\n",
    "# %pip installs into the environment of the running kernel, whereas !pip\n",
    "# runs whichever pip happens to be first on the shell PATH.\n",
    "%pip install django==1.11.15\n",
    "%pip install flask==0.12.2\n",
    "%pip install numpy==1.16.0\n",
    "%pip install requests==2.19.1\n",
    "%pip install scikit-learn==0.19.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25315022-9da9-4c29-8326-6532d261dd56",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Non-permissible licensed libraries\n",
    "import gmpy2\n",
    "import oct2py\n",
    "import pygsl\n",
    "from PyQt5 import QtCore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "489ad824-285c-4219-afc6-073192d54f3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Required Libraries for our task\n",
    "import nltk\n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "109d2f98-4d6d-42d9-acb4-2f195af051d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PII Data\n",
    "email = \"john.doe@example.com\"\n",
    "phone = \"123-456-7890\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d637e295-0953-4980-bf99-c7e7e509e876",
   "metadata": {},
   "outputs": [],
   "source": [
    "# API Keys and secrets\n",
    "fb_app_secret = \"3e4a22bb7e6b2c38b7809234b3ee782b\"\n",
    "db_credentials = \"username:password@localhost:5432/mydatabase\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6493567-ad7f-4b87-95e4-5068a09fca92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download nltk data\n",
    "NLTK_DATA_DIR = '/nltk_data/'\n",
    "nltk.download('punkt', download_dir=NLTK_DATA_DIR)\n",
    "\n",
    "# NLTK resolves resources by searching the directories in nltk.data.path.\n",
    "# Register the custom download directory so 'punkt' can be found later.\n",
    "if NLTK_DATA_DIR not in nltk.data.path:\n",
    "    nltk.data.path.append(NLTK_DATA_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f94e191-bfe7-4e54-9dbf-4d2484b0dbe9",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Text Classification\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.naive_bayes import MultinomialNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8552e84a-e164-4519-8ce8-959c7dd277ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data: fetch the 'train' subset, restricted to four newsgroups.\n",
    "categories = [\n",
    "    'alt.atheism',\n",
    "    'soc.religion.christian',\n",
    "    'comp.graphics',\n",
    "    'sci.med',\n",
    "]\n",
    "# Fixed random_state keeps the shuffled order reproducible across runs.\n",
    "twenty_train = fetch_20newsgroups(\n",
    "    subset='train',\n",
    "    categories=categories,\n",
    "    shuffle=True,\n",
    "    random_state=42,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abc38386-e63f-4d22-81dc-1785ac8f043b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature extraction: token counts first, then TF-IDF weighting.\n",
    "count_vect = CountVectorizer()\n",
    "tfidf_transformer = TfidfTransformer()\n",
    "\n",
    "# Both transformers are fitted on the training documents; the fitted\n",
    "# objects are reused later to transform new documents.\n",
    "X_train_counts = count_vect.fit_transform(twenty_train.data)\n",
    "X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ea57698-12ff-48b3-a8b6-bb8dffabbc5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train model: fit a multinomial Naive Bayes classifier on the TF-IDF\n",
    "# features, using the dataset's target array as labels.\n",
    "clf = MultinomialNB().fit(\n",
    "    X=X_train_tfidf,\n",
    "    y=twenty_train.target,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5fa6532-594c-4790-a630-83388c556591",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict: run unseen documents through the already-fitted vectorizer and\n",
    "# TF-IDF transformer (transform only, no refit), then classify them.\n",
    "docs_new = ['God is love', 'OpenGL on the GPU is fast']\n",
    "X_new_counts = count_vect.transform(docs_new)\n",
    "X_new_tfidf = tfidf_transformer.transform(X_new_counts)\n",
    "predicted = clf.predict(X_new_tfidf)\n",
    "\n",
    "# Map each predicted class index back to its newsgroup name.\n",
    "for doc, category in zip(docs_new, predicted):\n",
    "    print(\n",
    "        '%r => %s'\n",
    "        % (doc, twenty_train.target_names[category])\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}