Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,3 +11,27 @@ df = pd.read_csv(input_datapath, index_col=0)
|
|
| 11 |
|
| 12 |
st.title("Semanti Search")
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
st.title("Semanti Search")
|
| 13 |
|
| 14 |
+
|
| 15 |
+
# Add a "combined" column that uses the summary as the title and the review text as the content
|
| 16 |
+
df["combined"] = (
|
| 17 |
+
"Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# embedding model parameters
|
| 22 |
+
embedding_model = "text-embedding-ada-002"
|
| 23 |
+
embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
|
| 24 |
+
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
encoding = tiktoken.get_encoding(embedding_encoding)
|
| 28 |
+
top_n = 500
|
| 29 |
+
# omit reviews that are too long to embed
|
| 30 |
+
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
|
| 31 |
+
df = df[df.n_tokens <= max_tokens].tail(top_n)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
datafile_path = "fine_food_reviews_with_embeddings_1k.csv"
|
| 35 |
+
df = pd.read_csv(datafile_path)
|
| 36 |
+
df["embedding"] = df.embedding.apply(eval).apply(np.array)
|
| 37 |
+
|