Spaces:
Runtime error
Runtime error
Commit ·
7eea2bf
1
Parent(s): d7743fd
Create streamlit app
Browse files
app.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from InstructorEmbedding import INSTRUCTOR
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
|
| 6 |
+
# if 'model' is not in st.session_state:
|
| 7 |
+
# st.session_state['model'] = INSTRUCTOR('hkunlp/instructor-large')
|
| 8 |
+
|
| 9 |
+
@st.cache_resource
def load_model():
    """Build the INSTRUCTOR embedding model.

    Decorated with ``st.cache_resource`` so Streamlit keeps one shared
    instance instead of constructing the model again on every script rerun.
    """
    model_name = 'hkunlp/instructor-large'
    return INSTRUCTOR(model_name)
|
| 12 |
+
|
| 13 |
+
# Module-level handle to the cached embedding model; the encode calls
# further down in this script go through this object.
model = load_model()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def process_data(df, desc, message, embed=False):
    """Render the question box and show the best-matching row of ``df``.

    Each row of ``df`` is turned into an ``[instruction, text]`` pair, where
    the instruction is built from the ``desc`` column and the text comes from
    the ``message`` column. When the user submits a question, it is encoded
    with the global ``model`` and the text of the row with the highest cosine
    similarity is displayed.

    Args:
        df: DataFrame holding the documents.
        desc: Name of the column used inside the retrieval instruction.
        message: Name of the column whose text is embedded and displayed.
        embed: If true, the corpus embeddings are obtained up front through
            ``embed_data``; otherwise they are computed with ``model.encode``
            only once a question is submitted.

    Returns:
        None. All output goes through Streamlit widgets.
    """
    # Column-wise zip instead of iterrows: no per-row Series construction.
    data = [
        [f'Represent the document for retrieval of {topic} information : ', text]
        for topic, text in zip(df[desc], df[message])
    ]

    if not data:
        # Nothing to embed or to rank against.
        st.warning("The selected data contains no rows.")
        return

    corpus_embeddings = embed_data(data) if embed else None

    question = st.text_input("Question : ")
    # Explicit key: other buttons in this app share the "Submit" label, and
    # Streamlit rejects identical widgets that have no distinguishing key.
    btn_q = st.button("Submit", key="submit_question")

    if not btn_q:
        return

    if not question.strip():
        st.warning("Please enter a question.")
        return

    if corpus_embeddings is None:
        # embed=False path: previously this name was never bound.
        corpus_embeddings = model.encode(data)

    query = [['Represent the question for retrieving supporting documents: ', question]]
    query_embeddings = model.encode(query)

    similarities = cosine_similarity(query_embeddings, corpus_embeddings)
    # ndarray.argmax avoids depending on a numpy import this file does not have.
    retrieved_doc_id = int(similarities.argmax())
    st.text(f"{data[retrieved_doc_id][-1]}")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Defined before any UI code runs so that process_data can call it.
@st.cache_data(persist='disk')
def embed_data(data):
    """Encode ``data`` with the global model and return the embeddings.

    Args:
        data: List of ``[instruction, text]`` pairs.

    Returns:
        Whatever ``model.encode`` returns for ``data``. The result is cached
        by ``st.cache_data`` with on-disk persistence.
    """
    return model.encode(data)


opt = st.radio("Choose Data : ", ["intent.csv", "upload file CSV"], captions=["LMD CSV intent data", "Custom upload CSV data"])

# df stays None until a table has actually been loaded; source_id identifies
# which table the confirmed column selection belongs to.
df = None
source_id = None

if opt == "intent.csv":
    df = pd.read_csv("intent.csv", delimiter=";")
    source_id = "intent.csv"
else:
    # Only CSV is accepted: the file is parsed with pd.read_csv below, which
    # cannot read xlsx workbooks.
    f = st.file_uploader("Upload CSV File with at least 2 columns", ['csv'])
    delim = st.text_input('CSV File Delimiter')

    # A button is only True during the rerun triggered by its click, so the
    # confirmation is remembered in session state for later reruns.
    if st.button("Submit", key="submit_upload"):
        st.session_state["upload_confirmed"] = True

    if f is None:
        if st.session_state.get("upload_confirmed"):
            st.warning("Please upload a CSV file first.")
        st.session_state["upload_confirmed"] = False
    elif st.session_state.get("upload_confirmed"):
        # The same upload object is handed back on every rerun; rewind it so
        # repeated reads start at the beginning of the file.
        f.seek(0)
        try:
            # Fall back to a comma when the delimiter box is left empty.
            df = pd.read_csv(f, delimiter=delim or ",")
        except ValueError as err:
            # pandas parser / empty-data errors and decode errors are ValueErrors.
            st.error(f"Could not read the uploaded file: {err}")
        else:
            source_id = f.name

if df is not None:
    cols = list(df.columns)

    if len(cols) < 2:
        st.write("FAILED! At least 2 columns needed. Please check your dataset")
    else:
        desc = st.radio("Description Column", cols)
        # Default to the second column so the two selectors start out different.
        message = st.radio("Template Column", cols, index=1)

        selection = (source_id, desc, message)
        if st.button("Submit", key="submit_columns"):
            st.session_state["confirmed_selection"] = selection

        # Keep showing the question UI on later reruns as long as the
        # confirmed source/columns are still the ones currently selected.
        if st.session_state.get("confirmed_selection") == selection:
            process_data(df, desc, message, embed=True)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|