rishikesh committed on
Commit
d5adbb0
·
1 Parent(s): 8bd6e9b

Upload 2 files

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. RFC_tuned.sav +3 -0
  3. app.py +74 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ RFC_tuned.sav filter=lfs diff=lfs merge=lfs -text
RFC_tuned.sav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00bf54387151c113da5dfd87a75c8ffb109b35069dddc63b92afd2100782cb97
3
+ size 34042947
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import pickle
5
+ from sentence_transformers import SentenceTransformer
6
+ import re
7
+
8
+ #@st.cache()
9
@st.cache(allow_output_mutation=True)
def load_model():
    """Load the pickled classifier and the sentence-embedding model.

    The result is wrapped in Streamlit's cache so the (large) artifacts are
    loaded once rather than on every script rerun.

    Returns:
        A ``(loaded_model, vectorizer)`` tuple: the object unpickled from
        ``RFC_tuned.sav`` and a ``SentenceTransformer`` built from
        ``'all-MiniLM-L6-v2'``.

    Raises:
        FileNotFoundError: if ``RFC_tuned.sav`` is not in the working
            directory.
    """
    # NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases in
    # favour of ``st.cache_resource`` -- confirm the pinned Streamlit version
    # before switching.
    filename = 'RFC_tuned.sav'
    # Open through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    # NOTE: unpickling executes arbitrary code; only ship a trusted model file.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    vectorizer = SentenceTransformer('all-MiniLM-L6-v2')
    return loaded_model, vectorizer
16
+ # reference : https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
17
def cleanhtml(raw_html, CLEANR):
    """Return ``raw_html`` with every match of ``CLEANR`` removed.

    Args:
        raw_html: Text to clean.
        CLEANR: Regular expression (pattern string or compiled pattern)
            describing the fragments to delete.

    Returns:
        The input text with all matches replaced by the empty string.
    """
    return re.sub(pattern=CLEANR, repl='', string=raw_html)
20
+
21
+ # reference : https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
22
# Ordered (pattern, replacement) rules applied by ``decontracted``.
# Order matters: the million rule must run before the thousand rule, and the
# specific contractions must run before the generic suffix rules.
_DECONTRACTION_RULES = (
    # mathematical symbols
    (r'000,000', 'm'),
    (r'000', 'k'),
    # specific
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    # general
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontracted(phrase):
    """Expand English contractions and abbreviate runs of zeros.

    Every rule in ``_DECONTRACTION_RULES`` is applied to the whole string in
    turn, each one operating on the output of the previous rule.

    Args:
        phrase: Text to normalise.

    Returns:
        The text with ``000,000`` replaced by ``m``, ``000`` replaced by
        ``k``, and contractions such as ``won't`` / ``'re`` / ``'ve`` spelled
        out.
    """
    expanded = phrase
    for pattern, replacement in _DECONTRACTION_RULES:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded
41
+
42
def preprocessText(text):
    """Normalise a question string before it is embedded.

    Steps, in order: lower-case the text, strip HTML tags and character
    entities (e.g. ``&nbsp;``, ``&#39;``, ``&#x1f;``), then expand
    contractions via ``decontracted``.

    Args:
        text: Raw question text.

    Returns:
        The cleaned, lower-cased text.
    """
    # Matches either an HTML tag or a named / decimal / hex character entity.
    markup_pattern = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    lowered = text.lower()
    without_markup = cleanhtml(lowered, markup_pattern)
    return decontracted(without_markup)
51
+
52
def getSimilarity(q1,q2, model, vectorizer):
    """Classify a pair of questions and describe the result as a sentence.

    Both questions are cleaned with ``preprocessText``, encoded with
    ``vectorizer.encode``, and the two vectors are concatenated into a single
    one-row feature matrix that is passed to ``model``.

    Args:
        q1: First question text.
        q2: Second question text.
        model: Object exposing ``predict`` and ``predict_proba``.
        vectorizer: Object exposing ``encode``.

    Returns:
        A message stating whether the questions are similar, including the
        score as a percentage rounded to two decimals; ``None`` if the
        predicted label is neither 0 nor 1.
    """
    cleaned_first = preprocessText(q1)
    cleaned_second = preprocessText(q2)
    embeddings = (vectorizer.encode(cleaned_first), vectorizer.encode(cleaned_second))
    # The model expects a single row, so flatten both embeddings into one.
    features = np.concatenate(embeddings).reshape(1, -1)

    label = model.predict(features)[0]
    # Column 1 of predict_proba is presumably the "similar" class -- this
    # relies on the model's class ordering; verify against the trained model.
    score = round(model.predict_proba(features)[0][1] * 100, 2)

    if label == 0:
        return 'Given questions are not similar with similarity score of ' + str(score) + '%'
    if label == 1:
        return 'Given questions are similar with similarity score of ' + str(score) + '%'
    return None
63
+
64
def main():
    """Render the Streamlit page and show the prediction on submit.

    Loads the cached model and vectorizer, draws the title and two text
    inputs, and writes the ``getSimilarity`` message when the submit button
    is pressed. If either question is blank, a warning is shown instead of
    running the model on empty text.
    """
    model, vectorizer = load_model()
    st.title("Welcome to Questions similarity prediction app")
    q1 = st.text_input('Enter question one')
    q2 = st.text_input('Enter question two')
    if st.button('submit'):
        # A score computed from an empty question is meaningless, so ask the
        # user to fill in both fields rather than invoking the classifier.
        if not q1.strip() or not q2.strip():
            st.warning('Please enter both questions before submitting.')
        else:
            output = getSimilarity(q1, q2, model, vectorizer)
            st.write(output)


# Run the app only when this file is executed as a script.
if __name__ == '__main__':
    main()