Upload 2 files
Browse files- .gitattributes +1 -0
- RFC_tuned.sav +3 -0
- app.py +74 -0
.gitattributes
CHANGED
|
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
RFC_tuned.sav filter=lfs diff=lfs merge=lfs -text
|
RFC_tuned.sav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00bf54387151c113da5dfd87a75c8ffb109b35069dddc63b92afd2100782cb97
|
| 3 |
+
size 34042947
|
app.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import pickle
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
#@st.cache()
|
| 9 |
+
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the trained classifier and the sentence encoder.

    Unpickles the model stored in ``RFC_tuned.sav`` (resolved relative to the
    current working directory) and instantiates the ``all-MiniLM-L6-v2``
    sentence-transformer.

    Returns:
        tuple: ``(loaded_model, vectorizer)`` -- the unpickled model and the
        ``SentenceTransformer`` instance.

    Raises:
        OSError: If ``RFC_tuned.sav`` cannot be opened.
    """
    filename = 'RFC_tuned.sav'
    # NOTE: unpickling can execute arbitrary code; the model file must come
    # from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    vectorizer = SentenceTransformer('all-MiniLM-L6-v2')
    return loaded_model, vectorizer
|
| 16 |
+
# reference : https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
|
| 17 |
+
def cleanhtml(raw_html, CLEANR):
    """Return ``raw_html`` with every match of the pattern ``CLEANR`` removed.

    Args:
        raw_html: Text to clean.
        CLEANR: Regular expression (string or compiled pattern) whose matches
            are deleted.

    Returns:
        The text with all matches replaced by the empty string.
    """
    return re.sub(CLEANR, '', raw_html)
|
| 20 |
+
|
| 21 |
+
# reference : https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
|
| 22 |
+
def decontracted(phrase):
    """Expand English contractions and abbreviate runs of zeros in ``phrase``.

    The substitutions are applied one after another in a fixed order, so
    earlier rules shape what later rules see.

    Args:
        phrase: Text to rewrite.

    Returns:
        The rewritten text.
    """
    # Order matters: "000,000" must be handled before "000", and the specific
    # "won't"/"can't" forms before the generic "n't" rule.
    rewrite_rules = (
        # mathematical symbols
        (r'000,000', 'm'),
        (r'000', 'k'),
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rewrite_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
|
| 41 |
+
|
| 42 |
+
def preprocessText(text):
    """Normalise a question string before it is embedded.

    Steps, in order: lower-case the text, strip HTML tags and character
    entities (e.g. ``&nbsp;``, ``&#160;``, ``&#xa0;``), then expand
    contractions via ``decontracted``.

    Args:
        text: Raw question text.

    Returns:
        The cleaned text.
    """
    # Matches either an HTML tag or a named/decimal/hex character entity.
    tag_or_entity = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    without_markup = cleanhtml(text.lower(), tag_or_entity)
    return decontracted(without_markup)
|
| 51 |
+
|
| 52 |
+
def getSimilarity(q1,q2, model, vectorizer):
    """Classify whether two questions are similar and describe the result.

    Both questions are preprocessed and encoded with ``vectorizer``; the two
    encodings are concatenated into a single feature row that is passed to
    ``model.predict`` and ``model.predict_proba``.

    Args:
        q1: First question text.
        q2: Second question text.
        model: Object exposing ``predict`` and ``predict_proba``.
        vectorizer: Object exposing ``encode``.

    Returns:
        A message containing the verdict and the probability of column 1 of
        ``predict_proba`` as a percentage rounded to two decimals, or ``None``
        when the predicted label is neither 0 nor 1.
    """
    cleaned_first = preprocessText(q1)
    cleaned_second = preprocessText(q2)
    embedding_first = vectorizer.encode(cleaned_first)
    embedding_second = vectorizer.encode(cleaned_second)

    # One row holding both embeddings back to back.
    features = np.concatenate((embedding_first, embedding_second)).reshape(1, -1)

    label = model.predict(features)[0]
    score = round(model.predict_proba(features)[0][1] * 100, 2)
    score_text = str(score) + '%'

    if label == 0:
        return 'Given questions are not similar with similarity score of ' + score_text
    if label == 1:
        return 'Given questions are similar with similarity score of ' + score_text
    return None
|
| 63 |
+
|
| 64 |
+
def main():
    """Render the Streamlit page and show the similarity prediction.

    Loads the model and vectorizer, draws the title and two text inputs, and
    writes the result of ``getSimilarity`` when the submit button is pressed.
    If either question is blank, a warning is shown instead of a prediction.
    """
    model, vectorizer = load_model()
    st.title("Welcome to Questions similarity prediction app")
    q1 = st.text_input('Enter question one')
    q2 = st.text_input('Enter question two')
    if st.button('submit'):
        # Blank input would still be encoded and scored, yielding a
        # meaningless similarity message, so ask for both questions first.
        if not q1.strip() or not q2.strip():
            st.warning('Please enter both questions before submitting.')
            return
        output = getSimilarity(q1, q2, model, vectorizer)
        st.write(output)
|
| 72 |
+
|
| 73 |
+
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|