Spaces:
Runtime error
Runtime error
Initial commit
Browse files
app.py
CHANGED
|
@@ -2,8 +2,7 @@ import warnings
|
|
| 2 |
import torchvision
|
| 3 |
import torch
|
| 4 |
import pandas as pd
|
| 5 |
-
from transformers
|
| 6 |
-
from transformers import AutoTokenizer, AutoModel
|
| 7 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
import streamlit as st
|
| 9 |
|
|
@@ -11,56 +10,41 @@ import streamlit as st
|
|
| 11 |
torchvision.disable_beta_transforms_warning()
|
| 12 |
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
|
| 13 |
|
| 14 |
-
# Initialize fill-mask pipeline and model/tokenizer for embedding
|
| 15 |
-
pipe = pipeline(
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
model = AutoModel.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
|
| 18 |
|
| 19 |
# Function to generate embeddings for full sentences
|
| 20 |
def get_embedding(text):
    """Embed a sentence with the WangchanBERTa model loaded above.

    Tokenizes ``text``, runs the module-level ``model`` on it without
    gradient tracking, and returns the hidden state at sequence position 0.

    Args:
        text: Sentence to embed.

    Returns:
        NumPy array holding ``last_hidden_state[:, 0, :]`` (one row per
        input sequence).
    """
    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length, so arbitrary text-box input cannot overflow the model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
| 25 |
|
| 26 |
# Streamlit app setup
|
| 27 |
st.title("Thai Full Sentence Similarity App")
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
In this example,
|
| 35 |
-
-
|
| 36 |
-
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
### Potential Predictions
|
| 41 |
-
Here are some possible predictions the model might generate for `<mask>`:
|
| 42 |
-
1. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"` - Chiang Mai
|
| 43 |
-
2. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เขาใหญ่ เพื่อสัมผัสธรรมชาติ"` - Khao Yai
|
| 44 |
-
3. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เกาะสมุย เพื่อสัมผัสธรรมชาติ"` - Koh Samui
|
| 45 |
-
4. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน ภูเก็ต เพื่อสัมผัสธรรมชาติ"` - Phuket
|
| 46 |
-
|
| 47 |
-
### Results Table
|
| 48 |
-
For each prediction, the app calculates:
|
| 49 |
-
- **Similarity Score**: Indicates how similar the predicted sentence is to the original input.
|
| 50 |
-
- **Model Score**: Represents the model's confidence in the predicted word for `<mask>`.
|
| 51 |
-
|
| 52 |
-
### Most Similar Prediction
|
| 53 |
-
The app will display the most contextually similar prediction based on the similarity score. For example:
|
| 54 |
-
- **Most Similar Prediction**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"`
|
| 55 |
-
- **Similarity Score**: 0.89
|
| 56 |
-
- **Model Score**: 0.16
|
| 57 |
-
|
| 58 |
-
Feel free to enter your own sentence with `<mask>` and explore the predictions!
|
| 59 |
""")
|
| 60 |
|
| 61 |
# User input box
|
| 62 |
st.subheader("Input Text")
|
| 63 |
-
input_text = st.text_input("Enter a sentence with `<mask>` to find similar predictions:", "
|
| 64 |
|
| 65 |
# Ensure the input includes a `<mask>`
|
| 66 |
if "<mask>" not in input_text:
|
|
|
|
| 2 |
import torchvision
|
| 3 |
import torch
|
| 4 |
import pandas as pd
|
| 5 |
+
from transformers import pipeline, AutoTokenizer, AutoModel
|
|
|
|
| 6 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
import streamlit as st
|
| 8 |
|
|
|
|
| 10 |
torchvision.disable_beta_transforms_warning()
|
| 11 |
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
|
| 12 |
|
| 13 |
+
# Initialize fill-mask pipeline and model/tokenizer for embedding with slow tokenizer
|
| 14 |
+
# One checkpoint id is shared by the fill-mask pipeline, its tokenizer and
# the embedding model, so it is named once here.
_CHECKPOINT = "airesearch/wangchanberta-base-att-spm-uncased"

# use_fast=False requests the slow tokenizer implementation.
_slow_tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, use_fast=False)

pipe = pipeline(
    "fill-mask",
    model=_CHECKPOINT,
    tokenizer=_slow_tokenizer,
    framework="pt",
)

# Loaded separately from the pipeline; get_embedding calls this model directly.
model = AutoModel.from_pretrained(_CHECKPOINT)
|
| 21 |
|
| 22 |
# Function to generate embeddings for full sentences
|
| 23 |
def get_embedding(text):
    """Embed a sentence with the WangchanBERTa model loaded above.

    Tokenizes ``text`` with the fill-mask pipeline's tokenizer, runs the
    module-level ``model`` on it without gradient tracking, and returns the
    hidden state at sequence position 0.

    Args:
        text: Sentence to embed.

    Returns:
        NumPy array holding ``last_hidden_state[:, 0, :]`` (one row per
        input sequence).
    """
    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length, so arbitrary text-box input cannot overflow the model.
    inputs = pipe.tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
| 28 |
|
| 29 |
# Streamlit app setup
|
| 30 |
st.title("Thai Full Sentence Similarity App")
|
| 31 |
+
|
| 32 |
+
# Explanation of example usage
|
| 33 |
+
st.markdown("""
|
| 34 |
+
### Example Sentence with Mask:
|
| 35 |
+
**Input:** `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ"`
|
| 36 |
+
|
| 37 |
+
In this example, the model will replace `<mask>` with possible locations in Thailand, such as:
|
| 38 |
+
- "เชียงใหม่" for "Chiang Mai"
|
| 39 |
+
- "เขาใหญ่" for "Khao Yai"
|
| 40 |
+
- "ภูเก็ต" for "Phuket"
|
| 41 |
+
|
| 42 |
+
The app will compute the similarity between the full sentences generated and the baseline sentence without `<mask>`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
""")
|
| 44 |
|
| 45 |
# User input box
|
| 46 |
st.subheader("Input Text")
|
| 47 |
+
# The default value is the example sentence from the explanation text; the
# mis-encoded characters in its first word are restored to "นักท่องเที่ยว".
input_text = st.text_input(
    "Enter a sentence with `<mask>` to find similar predictions:",
    "นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ",
)
|
| 48 |
|
| 49 |
# Ensure the input includes a `<mask>`
|
| 50 |
if "<mask>" not in input_text:
|