Spaces:

rishikesh
/

QuestionPairSimilarityPredictor

Sleeping

App Files Files Community

rishikesh committed on Dec 4, 2022

Commit

d5adbb0

1 Parent(s): 8bd6e9b

Upload 2 files

Browse files

Files changed (3) hide show

.gitattributes +1 -0
RFC_tuned.sav +3 -0
app.py +74 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+RFC_tuned.sav filter=lfs diff=lfs merge=lfs -text

RFC_tuned.sav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00bf54387151c113da5dfd87a75c8ffb109b35069dddc63b92afd2100782cb97
+size 34042947

app.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import streamlit as st
+import numpy as np
+import pandas as pd
+import pickle
+from sentence_transformers import SentenceTransformer
+import re
+#@st.cache()
+@st.cache(allow_output_mutation=True)
+def load_model():
+	"""Retrieves the trained model"""
+	filename = 'RFC_tuned.sav'
+	loaded_model = pickle.load(open(filename, 'rb'))
+	vectorizer = SentenceTransformer('all-MiniLM-L6-v2')
+	return loaded_model, vectorizer
+# reference : https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
+def cleanhtml(raw_html, CLEANR):
+	cleantext = re.sub(CLEANR, '', raw_html)
+	return cleantext
+# reference : https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
+def decontracted(phrase):
+	# mathematical symbols
+	phrase = re.sub(r'000,000', 'm',phrase)
+	phrase = re.sub(r'000','k',phrase)
+	# specific
+	phrase = re.sub(r"won\'t", "will not", phrase)
+	phrase = re.sub(r"can\'t", "can not", phrase)
+	# general
+	phrase = re.sub(r"n\'t", " not", phrase)
+	phrase = re.sub(r"\'re", " are", phrase)
+	phrase = re.sub(r"\'s", " is", phrase)
+	phrase = re.sub(r"\'d", " would", phrase)
+	phrase = re.sub(r"\'ll", " will", phrase)
+	phrase = re.sub(r"\'t", " not", phrase)
+	phrase = re.sub(r"\'ve", " have", phrase)
+	phrase = re.sub(r"\'m", " am", phrase)
+	return phrase
+def preprocessText(text):
+	# convert text to lower case
+	text = text.lower()
+	# remove html tags and unknown unicode characters like &nbsm etc
+	CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
+	text = cleanhtml(text, CLEANR)
+	# remove contractions
+	text = decontracted(text)
+	return text
+def getSimilarity(q1,q2, model, vectorizer):
+	q1, q2 = preprocessText(q1), preprocessText(q2)
+	vect1 = vectorizer.encode(q1)
+	vect2 = vectorizer.encode(q2)
+	vect = np.concatenate((vect1,vect2))
+	prediction = model.predict(vect.reshape(1,-1))[0]
+	prediction_proba = round(model.predict_proba(vect.reshape(1,-1))[0][1]*100,2)
+	if prediction == 0 :
+		return 'Given questions are not similar with similarity score of ' + str(prediction_proba) + '%'
+	elif prediction == 1 :
+		return 'Given questions are similar with similarity score of ' + str(prediction_proba) + '%'
+def main():
+	model, vectorizer = load_model()
+	st.title("Welcome to Questions similarity prediction app")
+	q1 = st.text_input('Enter question one')
+	q2 = st.text_input('Enter question two')
+	if st.button('submit'):
+		output = getSimilarity(q1,q2,  model, vectorizer)
+		st.write(output)
+# Start the app only when this file is run as a script, not when it is imported.
+if __name__ == '__main__' :
+	main()