Commit ·
d2cc651
1
Parent(s): 9061c83
backup
Browse files- README.md +11 -0
- api_secrets.py +2 -0
- get_text.ipynb +298 -0
- get_txt.py +18 -0
- lamitization.py +37 -0
- main.py +45 -0
- new.py +182 -0
- nlp_api.py +196 -0
- updated_api.py +181 -0
README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# BAT NLP Campaign Audio Data
|
| 2 |
+
Configure the ngrok auth token: ngrok config add-authtoken 2Qm8hS1zPhVXiLjEdlI4738tLzF_2QJwGJMK5oTbQD33QSVXS
|
| 3 |
+
|
| 4 |
+
ngrok http --domain=batnlp.ngrok.app 1111
|
| 5 |
+
|
| 6 |
+
--------------------------------------------------------------------------------------------------------------------------------
|
| 7 |
+
|
| 8 |
+
# Old App
|
| 9 |
+
Configure the ngrok auth token: ngrok config add-authtoken 2Qm8hS1zPhVXiLjEdlI4738tLzF_2QJwGJMK5oTbQD33QSVXS
|
| 10 |
+
|
| 11 |
+
ngrok http --domain=hawkeyes.ngrok.app 8020
|
api_secrets.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# AssemblyAI credentials.
# SECURITY: these keys were previously committed to the repository in plain
# text — they should be rotated. Prefer supplying the key via the
# ASSEMBLYAI_API_KEY environment variable; the literal fallback is kept only
# so existing deployments keep working unchanged.
API_KEY_ASSEMBLYAI = os.environ.get('ASSEMBLYAI_API_KEY', '5bd662961e754f148a581e0070f09c88')
# Alias kept for callers that import YOUR_API_TOKEN; always equals the key above.
YOUR_API_TOKEN = API_KEY_ASSEMBLYAI
|
get_text.ipynb
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"ename": "ModuleNotFoundError",
|
| 10 |
+
"evalue": "No module named 'certifi'",
|
| 11 |
+
"output_type": "error",
|
| 12 |
+
"traceback": [
|
| 13 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 14 |
+
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
| 15 |
+
"\u001b[1;32md:\\Projects\\BAT\\BAT_NLP_Campaign\\get_text.ipynb Cell 1\u001b[0m line \u001b[0;36m2\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mre\u001b[39;00m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mupdated_api\u001b[39;00m \u001b[39mimport\u001b[39;00m \u001b[39m*\u001b[39m\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtyping_extensions\u001b[39;00m \u001b[39mimport\u001b[39;00m Annotated\n\u001b[0;32m <a href='vscode-notebook-cell:/d%3A/Projects/BAT/BAT_NLP_Campaign/get_text.ipynb#W0sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnltk\u001b[39;00m\n",
|
| 16 |
+
"File \u001b[1;32md:\\Projects\\BAT\\BAT_NLP_Campaign\\updated_api.py:9\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39muvicorn\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnltk\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mhttpx\u001b[39;00m\n\u001b[0;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mfastapi\u001b[39;00m \u001b[39mimport\u001b[39;00m FastAPI\n\u001b[0;32m 11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpydantic\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseModel\n",
|
| 17 |
+
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\__init__.py:2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m__version__\u001b[39;00m \u001b[39mimport\u001b[39;00m __description__, __title__, __version__\n\u001b[1;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_api\u001b[39;00m \u001b[39mimport\u001b[39;00m delete, get, head, options, patch, post, put, request, stream\n\u001b[0;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_auth\u001b[39;00m \u001b[39mimport\u001b[39;00m Auth, BasicAuth, DigestAuth, NetRCAuth\n\u001b[0;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_client\u001b[39;00m \u001b[39mimport\u001b[39;00m USE_CLIENT_DEFAULT, AsyncClient, Client\n",
|
| 18 |
+
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\_api.py:4\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtyping\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mcontextlib\u001b[39;00m \u001b[39mimport\u001b[39;00m contextmanager\n\u001b[1;32m----> 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_client\u001b[39;00m \u001b[39mimport\u001b[39;00m Client\n\u001b[0;32m 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_config\u001b[39;00m \u001b[39mimport\u001b[39;00m DEFAULT_TIMEOUT_CONFIG\n\u001b[0;32m 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_models\u001b[39;00m \u001b[39mimport\u001b[39;00m Response\n",
|
| 19 |
+
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\_client.py:11\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m__version__\u001b[39;00m \u001b[39mimport\u001b[39;00m __version__\n\u001b[0;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_auth\u001b[39;00m \u001b[39mimport\u001b[39;00m Auth, BasicAuth, FunctionAuth\n\u001b[1;32m---> 11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_config\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[0;32m 12\u001b[0m DEFAULT_LIMITS,\n\u001b[0;32m 13\u001b[0m DEFAULT_MAX_REDIRECTS,\n\u001b[0;32m 14\u001b[0m DEFAULT_TIMEOUT_CONFIG,\n\u001b[0;32m 15\u001b[0m Limits,\n\u001b[0;32m 16\u001b[0m Proxy,\n\u001b[0;32m 17\u001b[0m Timeout,\n\u001b[0;32m 18\u001b[0m )\n\u001b[0;32m 19\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_decoders\u001b[39;00m \u001b[39mimport\u001b[39;00m SUPPORTED_DECODERS\n\u001b[0;32m 20\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_exceptions\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[0;32m 21\u001b[0m InvalidURL,\n\u001b[0;32m 22\u001b[0m RemoteProtocolError,\n\u001b[0;32m 23\u001b[0m TooManyRedirects,\n\u001b[0;32m 24\u001b[0m request_context,\n\u001b[0;32m 25\u001b[0m )\n",
|
| 20 |
+
"File \u001b[1;32mc:\\Users\\naymm\\miniconda3\\envs\\nlpBat\\lib\\site-packages\\httpx\\_config.py:7\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtyping\u001b[39;00m\n\u001b[0;32m 5\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpathlib\u001b[39;00m \u001b[39mimport\u001b[39;00m Path\n\u001b[1;32m----> 7\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mcertifi\u001b[39;00m\n\u001b[0;32m 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_compat\u001b[39;00m \u001b[39mimport\u001b[39;00m set_minimum_tls_version_1_2\n\u001b[0;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_models\u001b[39;00m \u001b[39mimport\u001b[39;00m Headers\n",
|
| 21 |
+
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'certifi'"
|
| 22 |
+
]
|
| 23 |
+
}
|
| 24 |
+
],
|
| 25 |
+
"source": [
|
| 26 |
+
"import re\n",
|
| 27 |
+
"from updated_api import *\n",
|
| 28 |
+
"from typing_extensions import Annotated\n",
|
| 29 |
+
"import nltk\n",
|
| 30 |
+
"from nltk.corpus import stopwords\n",
|
| 31 |
+
"from nltk.stem import WordNetLemmatizer\n",
|
| 32 |
+
"import string"
|
| 33 |
+
]
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"cell_type": "code",
|
| 37 |
+
"execution_count": null,
|
| 38 |
+
"metadata": {},
|
| 39 |
+
"outputs": [],
|
| 40 |
+
"source": [
|
| 41 |
+
"nltk.download('punkt')\n",
|
| 42 |
+
"nltk.download('stopwords')\n",
|
| 43 |
+
"nltk.download('wordnet')"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"cell_type": "code",
|
| 48 |
+
"execution_count": 15,
|
| 49 |
+
"metadata": {},
|
| 50 |
+
"outputs": [],
|
| 51 |
+
"source": [
|
| 52 |
+
"patterns = {\n",
|
| 53 |
+
" 'Unique Capsule': r\"\\b(((u(?:nit|niq).*?)\\s+(?:capsul))|(?:.*?uni.*?capsul))\",\n",
|
| 54 |
+
" 'Refreshing Taste and Smell': r\"\\b((((ref|rif|rip|rep|ep|pre).*?)\\s+t(?:a|e|i|y)s(.*?)\\s+(sm|(?:.*?(sm|m)))(?:el|il|al|ol|.*?))|((?:in.*?)\\s+t(?:a|e|i|y)s.*?\\s+(.*?)(sm|m)(?:el|il|al|ol|ail|eal)))\",\n",
|
| 55 |
+
" 'Benson & Hadges Breeze':r\"\\b((b|p|v|f)(?:(an|en|a|e)(?:s|ch|t)(?:on|an|en).*?)\\s+h(?:.*?)\\s+(b|p|v|f)(?:re|ee|e))\",\n",
|
| 56 |
+
"}\n"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": 16,
|
| 62 |
+
"metadata": {},
|
| 63 |
+
"outputs": [],
|
| 64 |
+
"source": [
|
| 65 |
+
"def nlp_bat(text):\n",
|
| 66 |
+
" results = {}\n",
|
| 67 |
+
" all_match = {}\n",
|
| 68 |
+
" for name, pattern in patterns.items():\n",
|
| 69 |
+
" matches = re.findall(pattern, text, re.IGNORECASE)\n",
|
| 70 |
+
" m = {name:matches}\n",
|
| 71 |
+
" all_match.update(m)\n",
|
| 72 |
+
" count = len(matches)\n",
|
| 73 |
+
" results[name] = count\n",
|
| 74 |
+
" \n",
|
| 75 |
+
" \n",
|
| 76 |
+
" print(all_match) \n",
|
| 77 |
+
"\n",
|
| 78 |
+
" return results"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": 17,
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [
|
| 86 |
+
{
|
| 87 |
+
"data": {
|
| 88 |
+
"text/plain": [
|
| 89 |
+
"<coroutine object detect_audio at 0x00000255D1384900>"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
"execution_count": 17,
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"output_type": "execute_result"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"source": [
|
| 98 |
+
"filename = input(\"Give Audio Name: \")\n",
|
| 99 |
+
"audio_url = upload(filename)\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"detect_audio(audio_url, 'file_title')"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "code",
|
| 106 |
+
"execution_count": 6,
|
| 107 |
+
"metadata": {},
|
| 108 |
+
"outputs": [],
|
| 109 |
+
"source": [
|
| 110 |
+
"patterns = {\n",
|
| 111 |
+
" 'Unique Capsule': r\"unique capsul|unit capsul|uniq...capsul|uni..capsul\\b\",\n",
|
| 112 |
+
" 'Refreshing Taste and Smell': r\"refreshing taste smell|refreshing taste milk|refreshing test smell|ripe singh taste|repressing taste smell\\b\",\n",
|
| 113 |
+
" 'Benson & Hadges Breeze': r\"benson.hage.bree|benson.hage..bree|banson.hage.bree|banson.hage..bree|benson he.es breez|benson hess breez|benson he..e breez|benson haze breez|benson hezes bee|banson breez|banson hedge breathe|banson hedge bridge|benson hedge bre|benson hedge bridge| benson haze brie|banson haze breeze|banson hedge breez\\b\",\n",
|
| 114 |
+
"}"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"cell_type": "code",
|
| 119 |
+
"execution_count": 7,
|
| 120 |
+
"metadata": {},
|
| 121 |
+
"outputs": [],
|
| 122 |
+
"source": [
|
| 123 |
+
"patterns = {\n",
|
| 124 |
+
" 'Unique Capsule': r\"\\b(?:uni(?:que)?|unit|uniq\\.+|uni\\.+)\\s*capsul\",\n",
|
| 125 |
+
" 'Refreshing Taste and Smell': r\"\\b(?:refreshing|ripe|repressing)\\s+(?:taste\\s+(?:smell|milk)|test\\s+smell)\",\n",
|
| 126 |
+
" 'Benson & Hadges Breeze':r\"\\b(?:benson\\s+h(?:ess|aze|ezes|edge)\\s+breez|banson\\s+(?:haze\\s+breez|hedge\\s+(?:breez|bre))|benson\\s+h(?:aze\\s+brie|edge\\s+bridge))\",\n",
|
| 127 |
+
"}"
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"cell_type": "code",
|
| 132 |
+
"execution_count": 8,
|
| 133 |
+
"metadata": {},
|
| 134 |
+
"outputs": [],
|
| 135 |
+
"source": [
|
| 136 |
+
"patterns = {\n",
|
| 137 |
+
" 'Unique Capsule': r\"\\b(?:uni(?:que)?|unit|uniq\\.+|uni\\.+)\\s*capsul\",\n",
|
| 138 |
+
" 'Refreshing Taste and Smell': r\"\\b(?:refreshing|ripe|repressing)\\s+(?:taste\\s+(?:smell|milk)|test\\s+smell)\",\n",
|
| 139 |
+
" 'Benson & Hadges Breeze':r\"\\b(?:((b|p|v|f)(a|e).*?son)\\s+(h(?:.*?))\\s+(br))\",\n",
|
| 140 |
+
"}"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"cell_type": "code",
|
| 145 |
+
"execution_count": null,
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"outputs": [],
|
| 148 |
+
"source": []
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"cell_type": "code",
|
| 152 |
+
"execution_count": 9,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"outputs": [],
|
| 155 |
+
"source": [
|
| 156 |
+
"def nlp_bat(text):\n",
|
| 157 |
+
" results = {}\n",
|
| 158 |
+
" all_match = {}\n",
|
| 159 |
+
" for name, pattern in patterns.items():\n",
|
| 160 |
+
" matches = re.findall(pattern, text, re.IGNORECASE)\n",
|
| 161 |
+
" m = {name:matches}\n",
|
| 162 |
+
" all_match.update(m)\n",
|
| 163 |
+
" count = len(matches)\n",
|
| 164 |
+
" results[name] = count\n",
|
| 165 |
+
" \n",
|
| 166 |
+
" \n",
|
| 167 |
+
" print(all_match) \n",
|
| 168 |
+
"\n",
|
| 169 |
+
" return results"
|
| 170 |
+
]
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"cell_type": "code",
|
| 174 |
+
"execution_count": 10,
|
| 175 |
+
"metadata": {},
|
| 176 |
+
"outputs": [
|
| 177 |
+
{
|
| 178 |
+
"name": "stdout",
|
| 179 |
+
"output_type": "stream",
|
| 180 |
+
"text": [
|
| 181 |
+
"{'Unique Capsule': ['unique capsul'], 'Refreshing Taste and Smell': ['refreshing taste smell'], 'Benson & Hadges Breeze': [('banson', 'b', 'a', 'hages niyashe ekti unique capsule offer panson hages', 'br'), ('panson', 'p', 'a', 'hages', 'br')]}\n"
|
| 182 |
+
]
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"data": {
|
| 186 |
+
"text/plain": [
|
| 187 |
+
"{'Unique Capsule': 1,\n",
|
| 188 |
+
" 'Refreshing Taste and Smell': 1,\n",
|
| 189 |
+
" 'Benson & Hadges Breeze': 2}"
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
"execution_count": 10,
|
| 193 |
+
"metadata": {},
|
| 194 |
+
"output_type": "execute_result"
|
| 195 |
+
}
|
| 196 |
+
],
|
| 197 |
+
"source": [
|
| 198 |
+
"text = \"Clean text : apnea janet kushihaban banson hages niyashe ekti unique capsule offer panson hages bridge panson hages breeze air capsule atom agnoton tharna refreshing taste smell darn offer tea trial cora jonu apnea ekti trial kit nitaparin thunobat\"\n",
|
| 199 |
+
"\n",
|
| 200 |
+
"nlp_bat(text)\n"
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"cell_type": "code",
|
| 205 |
+
"execution_count": 11,
|
| 206 |
+
"metadata": {},
|
| 207 |
+
"outputs": [],
|
| 208 |
+
"source": [
|
| 209 |
+
"old_patterns = {\n",
|
| 210 |
+
" 'Unique Capsule': r\"\\b(?:uni(?:que)?|unit|uniq\\.+|uni\\.+)\\s*capsul\",\n",
|
| 211 |
+
" 'Refreshing Taste and Smell': r\"\\b(?:refreshing|ripe|repressing)\\s+(?:taste\\s+(?:smell|milk)|test\\s+smell)\",\n",
|
| 212 |
+
" 'Benson & Hadges Breeze': r\"\\b(?:((b|p|v|f)(a|e).*?son)\\s+(h(?:.*?))\\s+(br))\",\n",
|
| 213 |
+
"}"
|
| 214 |
+
]
|
| 215 |
+
},
|
| 216 |
+
{
|
| 217 |
+
"cell_type": "code",
|
| 218 |
+
"execution_count": 14,
|
| 219 |
+
"metadata": {},
|
| 220 |
+
"outputs": [],
|
| 221 |
+
"source": [
|
| 222 |
+
"newPattern = {\n",
|
| 223 |
+
" 'Unique Capsule': r\"\\b(((u(?:nit|niq).*?)\\s+(?:capsul))|(?:.*?uni.*?capsul))\",\n",
|
| 224 |
+
" 'Refreshing Taste and Smell': r\"\\b((((ref|rif|rip|rep|ep|pre).*?)\\s+t(?:a|e|i|y)s(.*?)\\s+(sm|(?:.*?(sm|m)))(?:el|il|al|ol|.*?))|((?:in.*?)\\s+t(?:a|e|i|y)s.*?\\s+(.*?)(sm|m)(?:el|il|al|ol|ail|eal)))\",\n",
|
| 225 |
+
" 'Benson & Hadges Breeze':r\"\\b((b|p|v|f)(?:(an|en|a|e)(?:s|ch|t)(?:on|an|en).*?)\\s+h(?:.*?)\\s+(b|p|v|f)(?:re|ee|e|ri))\",\n",
|
| 226 |
+
"}"
|
| 227 |
+
]
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"cell_type": "code",
|
| 231 |
+
"execution_count": null,
|
| 232 |
+
"metadata": {},
|
| 233 |
+
"outputs": [],
|
| 234 |
+
"source": [
|
| 235 |
+
"!pip install assemblyai"
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"cell_type": "code",
|
| 240 |
+
"execution_count": 9,
|
| 241 |
+
"metadata": {},
|
| 242 |
+
"outputs": [
|
| 243 |
+
{
|
| 244 |
+
"name": "stdout",
|
| 245 |
+
"output_type": "stream",
|
| 246 |
+
"text": [
|
| 247 |
+
"<assemblyai.transcriber.Transcript object at 0x0000029377EFD480>\n"
|
| 248 |
+
]
|
| 249 |
+
}
|
| 250 |
+
],
|
| 251 |
+
"source": [
|
| 252 |
+
"import assemblyai as aai\n",
|
| 253 |
+
"from updated_api import *\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"# Replace with your API key\n",
|
| 256 |
+
"aai.settings.api_key = \"5bd662961e754f148a581e0070f09c88\"\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"# URL of the file to transcribe\n",
|
| 259 |
+
"FILE_URL = \"https://form.hedigital.online/file-1702199439520-529630625.mp4\"\n",
|
| 260 |
+
"\n",
|
| 261 |
+
"# You can also transcribe a local file by passing in a file path\n",
|
| 262 |
+
"# FILE_URL = './path/to/file.mp3'\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"transcriber = aai.Transcriber()\n",
|
| 265 |
+
"transcript = transcriber.transcribe(FILE_URL)\n",
|
| 266 |
+
"print(transcript)\n"
|
| 267 |
+
]
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"cell_type": "code",
|
| 271 |
+
"execution_count": null,
|
| 272 |
+
"metadata": {},
|
| 273 |
+
"outputs": [],
|
| 274 |
+
"source": []
|
| 275 |
+
}
|
| 276 |
+
],
|
| 277 |
+
"metadata": {
|
| 278 |
+
"kernelspec": {
|
| 279 |
+
"display_name": "nlpBat",
|
| 280 |
+
"language": "python",
|
| 281 |
+
"name": "python3"
|
| 282 |
+
},
|
| 283 |
+
"language_info": {
|
| 284 |
+
"codemirror_mode": {
|
| 285 |
+
"name": "ipython",
|
| 286 |
+
"version": 3
|
| 287 |
+
},
|
| 288 |
+
"file_extension": ".py",
|
| 289 |
+
"mimetype": "text/x-python",
|
| 290 |
+
"name": "python",
|
| 291 |
+
"nbconvert_exporter": "python",
|
| 292 |
+
"pygments_lexer": "ipython3",
|
| 293 |
+
"version": "3.10.13"
|
| 294 |
+
}
|
| 295 |
+
},
|
| 296 |
+
"nbformat": 4,
|
| 297 |
+
"nbformat_minor": 2
|
| 298 |
+
}
|
get_txt.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Transcribe one remote audio file with AssemblyAI and print the transcript,
# then the per-speaker utterances (speaker diarization).
import assemblyai as aai

# NOTE(review): API key committed in plain text — rotate it and load from
# config/environment instead.
aai.settings.api_key = "5bd662961e754f148a581e0070f09c88"

audio_url = "https://bat.hedigital.online/file-1703669708657-351786808.mpeg"

# speaker_labels=True enables diarization so each utterance carries a speaker tag.
diarized_config = aai.TranscriptionConfig(speaker_labels=True)
transcript = aai.Transcriber().transcribe(audio_url, diarized_config)

print(transcript.text)

for utterance in transcript.utterances:
    print(f"Speaker {utterance.speaker}: {utterance.text}")
|
lamitization.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
from nltk.corpus import stopwords
|
| 3 |
+
from nltk.stem import WordNetLemmatizer
|
| 4 |
+
import string
|
| 5 |
+
|
| 6 |
+
nltk.download('punkt')
|
| 7 |
+
nltk.download('stopwords')
|
| 8 |
+
nltk.download('wordnet')
|
| 9 |
+
|
| 10 |
+
def lemmatize_and_clean(text):
    """Tokenise *text*, drop punctuation and stopwords, and lemmatise.

    Returns the surviving lowercase lemmas joined into a single
    space-separated string.
    """
    # Keep only purely alphabetic tokens, lowercased.
    tokens = [tok.lower() for tok in nltk.word_tokenize(text) if tok.isalpha()]

    # Filter English stopwords (set built once for O(1) membership tests).
    stop_words = set(stopwords.words('english'))
    kept = [tok for tok in tokens if tok not in stop_words]

    # Reduce each word to its WordNet lemma.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in kept]

    return ' '.join(lemmas)
|
| 29 |
+
|
| 30 |
+
# --- Demo: clean one raw (heavily garbled) transcript ---------------------
input_text = "kushir cover. kushir cover benson and hezes nih unique capsule of our janum benson and hesses breeze aprajanara kushiha benjay a capsule roche egg thorne refreshing taste and smell arapnajudiya trial kotachan tahal ajinita parnakti trial kit donnabat."
cleaned_text = lemmatize_and_clean(input_text)

print("Original Text:")
print(input_text)
print("\nCleaned Text:")
print(cleaned_text)
|
main.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from nlp_api import *
|
| 3 |
+
from typing_extensions import Annotated
|
| 4 |
+
import string
|
| 5 |
+
|
| 6 |
+
# Campaign keyphrase patterns: each entry maps a campaign message to a regex
# of transcription variants/misspellings observed in the audio data.
patterns = {
    'Unique Capsule': r"unique capsul|unit capsul|uniq...capsul|uni..capsul\b",
    'Refreshing Taste and Smell': r"refreshing taste smell|refreshing taste milk|refreshing test smell|ripe singh taste|repressing taste smell\b",
    'Benson & Hadges Breeze': r"benson he.es breez|benson hess breez|benson he..e breez|benson haze breez|benson hezes bee|banson breez|banson hedge breathe|banson hedge bridge|benson hedge bre|benson hedge bridge| benson haze brie|banson haze breeze|banson hedge breez\b"
}


def nlp_bat(text):
    """Count case-insensitive occurrences of each campaign pattern in *text*.

    Prints the raw matches per pattern (for inspection) and returns a
    ``{pattern_name: match_count}`` dict.
    """
    all_match = {}
    results = {}
    for label, regex in patterns.items():
        hits = re.findall(regex, text, re.IGNORECASE)
        all_match[label] = hits
        results[label] = len(hits)

    print(all_match)

    return results
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Prompt for an audio file, upload it, then run transcription + NLP matching.
# `upload` and `detect_audio` come from nlp_api (star-imported above).
filename = input("Give Audio Name: ")
audio_url = upload(filename)

detect_audio(audio_url, 'file_title')
|
new.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from typing import List, Union
|
| 5 |
+
|
| 6 |
+
import aiofiles
|
| 7 |
+
import uvicorn
|
| 8 |
+
import nltk
|
| 9 |
+
import httpx
|
| 10 |
+
from fastapi import FastAPI
|
| 11 |
+
from pydantic import BaseModel
|
| 12 |
+
from nltk.corpus import stopwords
|
| 13 |
+
from nltk.stem import WordNetLemmatizer
|
| 14 |
+
import logging
|
| 15 |
+
import pytz
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
from api_secrets import API_KEY_ASSEMBLYAI
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# logging.basicConfig(filename0="BAT_NLP_Campaign.log",
|
| 21 |
+
# filemode='w')
|
| 22 |
+
# logger = logging.getLogger("BAT")
|
| 23 |
+
# logger.setLevel(logging.DEBUG)
|
| 24 |
+
# file_handler = logging.FileHandler("BAT_NLP_Campaign.log")
|
| 25 |
+
# logger.addHandler(file_handler)
|
| 26 |
+
# total_done = 0
|
| 27 |
+
# total_error = 0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_bd_time():
    """Return the current Bangladesh (Asia/Dhaka) wall-clock time.

    Formatted as ``HH:MM:SS AM/PM`` (12-hour clock). Uses the stdlib
    ``zoneinfo`` module (Python 3.9+) instead of the third-party ``pytz``,
    removing one external dependency with identical output.
    """
    from zoneinfo import ZoneInfo  # stdlib replacement for pytz

    time_now = datetime.now(ZoneInfo("Asia/Dhaka"))
    return time_now.strftime("%I:%M:%S %p")
|
| 36 |
+
|
| 37 |
+
app = FastAPI()
|
| 38 |
+
|
| 39 |
+
CHUNK_SIZE = 5_242_880 # 5MB
|
| 40 |
+
|
| 41 |
+
upload_endpoint = 'https://api.assemblyai.com/v2/upload'
|
| 42 |
+
transcript_endpoint = 'https://api.assemblyai.com/v2/transcript'
|
| 43 |
+
|
| 44 |
+
headers_auth_only = {'authorization': API_KEY_ASSEMBLYAI}
|
| 45 |
+
|
| 46 |
+
headers = {
|
| 47 |
+
"authorization": API_KEY_ASSEMBLYAI,
|
| 48 |
+
"content-type": "application/json"
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class Item(BaseModel):
    """Request payload: the URL of one audio file to transcribe."""
    url: str
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
async def lemmatize_and_clean(text):
    """Lowercase, tokenise, strip stopwords/punctuation, and lemmatise *text*.

    Returns the cleaned tokens joined by single spaces. Lemmatisation is
    offloaded to a worker thread so the event loop is not blocked.
    """
    words = nltk.word_tokenize(text.lower())
    # Build the stopword set ONCE. The previous version called
    # set(stopwords.words('english')) inside the comprehension condition,
    # rebuilding the whole set for every token (O(tokens * stopwords) waste).
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalpha() and word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [await asyncio.to_thread(lemmatizer.lemmatize, word) for word in words]
    return ' '.join(words)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# Regex variants for each campaign phrase, tolerant of the transcriber's
# phonetic misspellings (e.g. "banson hedge breez" for "Benson & Hedges Breeze").
patterns = {
    'Unique Capsule': r"\b(((u(?:nit|niq).*?)\s+(?:capsul))|(?:.*?uni.*?capsul))",
    'Refreshing Taste and Smell': r"\b((((ref|rif|rip|rep|ep|pre).*?)\s+t(?:a|e|i|y)s(.*?)\s+(sm|(?:.*?(sm|m)))(?:el|il|al|ol|.*?))|((?:in.*?)\s+t(?:a|e|i|y)s.*?\s+(.*?)(sm|m)(?:el|il|al|ol|ail|eal)))",
    'Benson & Hadges Breeze': r"\b((b|p|v|f)(?:(an|en|a|e)(?:s|ch|t)(?:on|an|en).*?)\s+h(?:.*?)\s+(b|p|v|f)(?:re|ee|e|ri))",
}


async def nlp_bat(text):
    """Count matches of every campaign pattern in *text* (case-insensitive).

    Prints the raw match groups for inspection and returns a
    ``{pattern_name: match_count}`` dict.
    """
    all_match = {}
    for label, regex in patterns.items():
        all_match[label] = re.findall(regex, text, re.IGNORECASE)
    results = {label: len(hits) for label, hits in all_match.items()}

    print(all_match)
    return results
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
async def read_file(filename):
    """Asynchronously yield the file at *filename* in CHUNK_SIZE byte chunks."""
    async with aiofiles.open(filename, 'rb') as fh:
        while chunk := await fh.read(CHUNK_SIZE):
            yield chunk
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def upload(filename):
    """Upload *filename* to AssemblyAI and return the hosted upload URL.

    Streams the whole file as ONE chunked request body. The previous version
    POSTed every CHUNK_SIZE block as a *separate* upload request, so any file
    larger than one chunk was split into independent partial uploads and only
    the last chunk's URL was returned; an empty file raised
    UnboundLocalError. httpx accepts an async byte iterator as the request
    body and streams it.
    """
    async with httpx.AsyncClient() as client:
        upload_response = await client.post(
            upload_endpoint,
            headers=headers_auth_only,
            data=read_file(filename),
        )
        return upload_response.json()['upload_url']
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
async def transcribe(audio_url):
    """Submit *audio_url* for transcription; return the new transcript id."""
    async with httpx.AsyncClient() as client:
        response = await client.post(
            transcript_endpoint,
            json={'audio_url': audio_url},
            headers=headers,
        )
        return response.json()['id']
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
async def poll(transcript_id):
    """Fetch the current status payload for *transcript_id* from AssemblyAI."""
    status_url = f'{transcript_endpoint}/{transcript_id}'
    async with httpx.AsyncClient() as client:
        response = await client.get(status_url, headers=headers)
        return response.json()
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
async def get_transcription_result_url(url):
    """Submit *url* and poll until the transcription finishes.

    Returns ``(data, None)`` on completion, ``(data, error_message)`` on
    failure; otherwise keeps polling every 2 seconds.
    """
    transcript_id = await transcribe(url)
    while True:
        data = await poll(transcript_id)
        status = data['status']
        if status == 'completed':
            return data, None
        if status == 'error':
            return data, data['error']
        print("Processing Audio")
        await asyncio.sleep(2)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
async def detect_audio(url, title):
    """Transcribe *url*, clean the transcript, and return campaign match counts.

    *title* is currently unused but kept for interface compatibility.
    """
    data, error = await get_transcription_result_url(url)
    raw_text = data['text']
    print("main text : ", raw_text)
    cleaned = await lemmatize_and_clean(raw_text)
    print("Clean text : ", cleaned)
    return await nlp_bat(cleaned.lower())
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
async def process_item(item: Item):
    """Run audio detection for one request item and return its result dict.

    The previous version round-tripped the result through
    ``json.dumps``/``json.loads`` (an identity transform for this
    str -> int dict) inside an empty ``try/finally`` — both removed as
    dead code; behaviour is unchanged.
    """
    print(item.url)
    return await detect_audio(item.url, title="file")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
async def process_items(items: Union[Item, List[Item]]):
    """Process a single item, or a batch concurrently.

    For a list, the per-item result dicts are merged into one dict.
    """
    if not isinstance(items, list):
        return await process_item(items)
    per_item = await asyncio.gather(*(process_item(item) for item in items))
    merged = {}
    for result in per_item:
        merged.update(result)
    return merged
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
@app.post("/nlp")
async def create_items(items: Union[Item, List[Item]]):
    """POST /nlp — run the transcription + NLP pipeline over one or many URLs."""
    try:
        results = await process_items(items)
        print("Result Sent to User:", results)
        return results
    except Exception as e:
        # Surface the failure to the caller as a JSON body instead of a bare 500.
        return {"AI": f"Error: {str(e)}"}
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
if __name__ == "__main__":
    # Serve the FastAPI app locally; ngrok exposes this port externally.
    uvicorn.run(app, host="127.0.0.1", port=8020)
|
nlp_api.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# files after part 2
|
| 2 |
+
import requests
|
| 3 |
+
import time
|
| 4 |
+
from api_secrets import API_KEY_ASSEMBLYAI
|
| 5 |
+
import re
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
import asyncio
|
| 9 |
+
from typing import List, Union
|
| 10 |
+
import uvicorn
|
| 11 |
+
import json
|
| 12 |
+
import nltk
|
| 13 |
+
from nltk.corpus import stopwords
|
| 14 |
+
from nltk.stem import WordNetLemmatizer
|
| 15 |
+
import string
|
| 16 |
+
|
| 17 |
+
# nltk.download('punkt')
|
| 18 |
+
# nltk.download('stopwords')
|
| 19 |
+
# nltk.download('wordnet')
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
app = FastAPI()
|
| 24 |
+
|
| 25 |
+
class Item(BaseModel):
    """Request body for the /nlp endpoint: one audio file to analyze."""

    # Publicly reachable audio URL, forwarded to AssemblyAI for transcription.
    url: str
|
| 27 |
+
|
| 28 |
+
# AssemblyAI REST endpoints used for upload and transcription.
upload_endpoint = 'https://api.assemblyai.com/v2/upload'
transcript_endpoint = 'https://api.assemblyai.com/v2/transcript'

# Auth-only header for the raw binary upload (no content-type).
headers_auth_only = {'authorization': API_KEY_ASSEMBLYAI}

# Headers for the JSON transcript-submit and polling requests.
headers = {
    "authorization": API_KEY_ASSEMBLYAI,
    "content-type": "application/json"
}

CHUNK_SIZE = 5_242_880  # 5MB
|
| 39 |
+
|
| 40 |
+
def lemmatize_and_clean(text):
    """Lowercase, drop punctuation/stopwords, lemmatize, and re-join *text*."""
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Keep only purely alphabetic tokens, lowercased.
    lowered = (tok.lower() for tok in nltk.word_tokenize(text) if tok.isalpha())
    lemmas = [lemmatizer.lemmatize(tok) for tok in lowered if tok not in stop_words]
    return ' '.join(lemmas)
|
| 59 |
+
|
| 60 |
+
# Regex patterns for the campaign keyphrases. The alternations deliberately
# tolerate common ASR misrecognitions of the brand phrases. (Three superseded
# commented-out pattern sets were dead code and have been removed.)
patterns = {
    'Unique Capsule': r'\b(?:uni(?:que)?|unit|uniq\.+|uni\.+)\s*capsul',
    'Refreshing Taste and Smell': r'\b(?:refreshing|ripe|repressing)\s+(?:taste\s+(?:smell|milk)|test\s+smell)\b',
    'Benson & Hadges Breeze': r'\b(?:benson\s+h(?:ess|aze|ezes|edge)\s+breez|banson\s+(?:haze\s+breez|hedge\s+(?:breez|bre))|benson\s+h(?:aze\s+brie|edge\s+bridge))\b',
}
|
| 88 |
+
# Find and count matches for each pattern
|
| 89 |
+
def nlp_bat(text):
    """Count case-insensitive occurrences of each campaign pattern in *text*."""
    hits = {label: re.findall(rx, text, re.IGNORECASE) for label, rx in patterns.items()}
    print(hits)  # debug: show the raw matches, not just the counts
    return {label: len(found) for label, found in hits.items()}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def upload(filename):
    """Stream *filename* to AssemblyAI in CHUNK_SIZE pieces; return the upload URL."""

    def _chunks(path):
        # requests streams a generator body as a chunked-transfer request.
        with open(path, 'rb') as fh:
            while True:
                block = fh.read(CHUNK_SIZE)
                if not block:
                    return
                yield block

    response = requests.post(upload_endpoint, headers=headers_auth_only, data=_chunks(filename))
    return response.json()['upload_url']
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def transcribe(audio_url):
    """Submit *audio_url* for transcription; return the new transcript id."""
    payload = {'audio_url': audio_url}
    resp = requests.post(transcript_endpoint, json=payload, headers=headers)
    return resp.json()['id']
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def poll(transcript_id):
    """Fetch the current transcription status/result JSON for *transcript_id*."""
    resp = requests.get(f'{transcript_endpoint}/{transcript_id}', headers=headers)
    return resp.json()
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def get_transcription_result_url(url):
    """Poll until the transcript for *url* finishes; return ``(data, error)``."""
    transcript_id = transcribe(url)
    while True:
        data = poll(transcript_id)
        status = data['status']
        if status == 'completed':
            return data, None
        if status == 'error':
            return data, data['error']
        print("Processing Audio")
        time.sleep(2)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def detect_audio(url, title):
    """Transcribe *url*, clean the text, and return per-pattern match counts.

    *title* is unused but kept for interface compatibility with callers.

    Raises:
        RuntimeError: if AssemblyAI reports a transcription error. The old
        code ignored *error*, so on failure ``data['text']`` (None) crashed
        inside the cleaning step with an opaque traceback.
    """
    data, error = get_transcription_result_url(url)
    if error:
        raise RuntimeError(f"Transcription failed: {error}")
    text_det = data['text']
    lmtz = lemmatize_and_clean(text_det)
    print(lmtz)
    return nlp_bat(lmtz.lower())
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
async def process_item(item: Item):
    """Run the full pipeline for one Item and return its match-count dict.

    Fix: ``detect_audio`` returns a plain dict of JSON-safe counts, so the
    ``json.dumps``/``json.loads`` round-trip was a no-op and has been
    removed, along with the empty ``try/finally``.
    """
    print(item.url)
    return detect_audio(item.url, title="file")
|
| 171 |
+
|
| 172 |
+
async def process_items(items: Union[Item, List[Item]]):
    """Process one Item, or a list of Items concurrently.

    NOTE(review): every item produces the same pattern-name keys, so merging
    the per-item dicts keeps only the last item's counts — confirm intended.
    """
    if not isinstance(items, list):
        return await process_item(items)
    per_item = await asyncio.gather(*(process_item(it) for it in items))
    merged = {}
    for counts in per_item:
        merged.update(counts)
    return merged
|
| 182 |
+
|
| 183 |
+
@app.post("/nlp")
async def create_items(items: Union[Item, List[Item]]):
    """POST /nlp endpoint: process one Item or a list and return the counts.

    The previous ``try/finally: pass`` added nothing and has been removed;
    exceptions still propagate to FastAPI's default error handling.
    """
    results = await process_items(items)
    print("Result Sent to User:", results)
    return results
|
| 191 |
+
|
| 192 |
+
if __name__ == "__main__":
    # The try/finally wrapper around this call was a no-op; removed.
    uvicorn.run(app, host="127.0.0.1", port=8020)
|
updated_api.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from typing import List, Union
|
| 5 |
+
|
| 6 |
+
import aiofiles
|
| 7 |
+
import uvicorn
|
| 8 |
+
import nltk
|
| 9 |
+
import httpx
|
| 10 |
+
from fastapi import FastAPI
|
| 11 |
+
from pydantic import BaseModel
|
| 12 |
+
from nltk.corpus import stopwords
|
| 13 |
+
from nltk.stem import WordNetLemmatizer
|
| 14 |
+
import logging
|
| 15 |
+
import pytz
|
| 16 |
+
from datetime import datetime
|
| 17 |
+
from api_secrets import API_KEY_ASSEMBLYAI
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# logging.basicConfig(filename0="BAT_NLP_Campaign.log",
|
| 21 |
+
# filemode='w')
|
| 22 |
+
# logger = logging.getLogger("BAT")
|
| 23 |
+
# logger.setLevel(logging.DEBUG)
|
| 24 |
+
# file_handler = logging.FileHandler("BAT_NLP_Campaign.log")
|
| 25 |
+
# logger.addHandler(file_handler)
|
| 26 |
+
# total_done = 0
|
| 27 |
+
# total_error = 0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_bd_time():
    """Return the current Bangladesh (Asia/Dhaka) time as ``HH:MM:SS AM/PM``."""
    # stdlib zoneinfo (3.9+, which the file already requires via
    # asyncio.to_thread) replaces the third-party pytz dependency here.
    from zoneinfo import ZoneInfo
    time_now = datetime.now(ZoneInfo("Asia/Dhaka"))
    return time_now.strftime("%I:%M:%S %p")
|
| 36 |
+
|
| 37 |
+
app = FastAPI()

CHUNK_SIZE = 5_242_880  # 5MB

# AssemblyAI REST endpoints used for upload and transcription.
upload_endpoint = 'https://api.assemblyai.com/v2/upload'
transcript_endpoint = 'https://api.assemblyai.com/v2/transcript'

# Auth-only header for the raw binary upload (no content-type).
headers_auth_only = {'authorization': API_KEY_ASSEMBLYAI}

# Headers for the JSON transcript-submit and polling requests.
headers = {
    "authorization": API_KEY_ASSEMBLYAI,
    "content-type": "application/json"
}
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class Item(BaseModel):
    """Request body for the /nlp endpoint: one audio file to analyze."""

    # Publicly reachable audio URL, forwarded to AssemblyAI for transcription.
    url: str
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
async def lemmatize_and_clean(text):
    """Lowercase, tokenize, drop non-alpha tokens and stopwords, lemmatize.

    Fixes two performance defects: the stopword set used to be rebuilt via
    ``set(stopwords.words('english'))`` for *every* token, and each token was
    lemmatized in its own ``asyncio.to_thread`` hop. The set is now built
    once and the whole list is lemmatized in a single worker-thread call.
    """
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmas = await asyncio.to_thread(
        lambda: [lemmatizer.lemmatize(tok) for tok in kept]
    )
    return ' '.join(lemmas)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# Regex patterns for the campaign keyphrases. The heavy use of wildcards and
# letter alternations deliberately tolerates ASR misrecognitions of the brand
# phrases (e.g. "banson hedge bridge" for "Benson & Hedges Breeze").
patterns = {
    'Unique Capsule': r"\b(((u(?:nit|niq).*?)\s+(?:capsul))|(?:.*?uni.*?capsul))",
    'Refreshing Taste and Smell': r"\b((((ref|rif|rip|rep|ep|pre).*?)\s+t(?:a|e|i|y)s(.*?)\s+(sm|(?:.*?(sm|m)))(?:el|il|al|ol|.*?))|((?:in.*?)\s+t(?:a|e|i|y)s.*?\s+(.*?)(sm|m)(?:el|il|al|ol|ail|eal)))",
    'Benson & Hadges Breeze':r"\b((b|p|v|f)(?:(an|en|a|e)(?:s|ch|t)(?:on|an|en).*?)\s+h(?:.*?)\s+(b|p|v|f)(?:re|ee|e|ri))",
}
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
async def nlp_bat(text):
    """Count case-insensitive occurrences of each campaign pattern in *text*."""
    hits = {label: re.findall(rx, text, re.IGNORECASE) for label, rx in patterns.items()}
    print(hits)  # debug: show the raw matches, not just the counts
    return {label: len(found) for label, found in hits.items()}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
async def read_file(filename):
    """Async-generate *filename* contents in CHUNK_SIZE pieces."""
    async with aiofiles.open(filename, 'rb') as fh:
        # Walrus loop: stop as soon as a read returns empty bytes (EOF).
        while chunk := await fh.read(CHUNK_SIZE):
            yield chunk
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def upload(filename):
    """Upload *filename* to AssemblyAI and return the hosted upload URL.

    Bug fix: the old loop issued one POST *per chunk*, so each chunk was
    uploaded as a separate file and only the last chunk's ``upload_url``
    was returned — any file larger than CHUNK_SIZE was silently truncated.
    httpx accepts an async byte iterator as ``content``, so the chunk
    generator is now streamed as the body of a single request.
    """
    async with httpx.AsyncClient() as client:
        upload_response = await client.post(
            upload_endpoint,
            headers=headers_auth_only,
            content=read_file(filename),
        )
        return upload_response.json()['upload_url']
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
async def transcribe(audio_url):
    """Submit *audio_url* for transcription; return the new transcript id."""
    payload = {'audio_url': audio_url}
    async with httpx.AsyncClient() as client:
        resp = await client.post(transcript_endpoint, json=payload, headers=headers)
        return resp.json()['id']
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
async def poll(transcript_id):
    """Fetch the current transcription status/result JSON for *transcript_id*."""
    async with httpx.AsyncClient() as client:
        resp = await client.get(f'{transcript_endpoint}/{transcript_id}', headers=headers)
        return resp.json()
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
async def get_transcription_result_url(url):
    """Poll until the transcript for *url* finishes; return ``(data, error)``."""
    transcript_id = await transcribe(url)
    while True:
        data = await poll(transcript_id)
        status = data['status']
        if status == 'completed':
            return data, None
        if status == 'error':
            return data, data['error']
        print("Processing Audio")
        await asyncio.sleep(2)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
async def detect_audio(url, title):
    """Transcribe *url*, clean the text, and return per-pattern match counts.

    *title* is unused but kept for interface compatibility with callers.

    Raises:
        RuntimeError: if AssemblyAI reports a transcription error. The old
        code ignored *error*, so on failure ``data['text']`` (None) crashed
        inside the cleaning step with an opaque traceback.
    """
    data, error = await get_transcription_result_url(url)
    if error:
        raise RuntimeError(f"Transcription failed: {error}")
    text_det = data['text']
    print("main text : ", text_det)
    lmtz = await lemmatize_and_clean(text_det)
    print("Clean text : ", lmtz)
    return await nlp_bat(lmtz.lower())
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
async def process_item(item: Item):
    """Run the full pipeline for one Item and return its match-count dict.

    Fix: ``detect_audio`` returns a plain dict of JSON-safe counts, so the
    ``json.dumps``/``json.loads`` round-trip was a no-op and has been
    removed, along with the empty ``try/finally``.
    """
    print(item.url)
    return await detect_audio(item.url, title="file")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
async def process_items(items: Union[Item, List[Item]]):
    """Process one Item, or a list of Items concurrently.

    NOTE(review): every item produces the same pattern-name keys, so merging
    the per-item dicts keeps only the last item's counts — confirm intended.
    """
    if not isinstance(items, list):
        return await process_item(items)
    per_item = await asyncio.gather(*(process_item(it) for it in items))
    merged = {}
    for counts in per_item:
        merged.update(counts)
    return merged
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
@app.post("/nlp")
async def create_items(items: Union[Item, List[Item]]):
    """POST /nlp endpoint: accept one Item or a list, return match counts.

    Errors are reported in-band as ``{"AI": "Error: ..."}`` with HTTP 200,
    matching the existing client contract. Dead commented-out logging code
    and the no-op ``finally: pass`` have been removed.
    """
    try:
        results = await process_items(items)
        print("Result Sent to User:", results)
        return results
    except Exception as e:
        # Intentionally broad: this is the top-level service boundary.
        return {"AI": f"Error: {str(e)}"}
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
if __name__ == "__main__":
    # The try/finally wrapper around this call was a no-op; removed.
    uvicorn.run(app, host="127.0.0.1", port=1111)
|