File size: 3,189 Bytes
8e40d74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a74d5ea
37c0d9a
 
 
 
8e40d74
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Demo corpus as (sentence, English gloss) pairs. The gloss is None for
# sentences that are already in English. Keeping every gloss next to its
# sentence lets the sample list and the translation table be derived from a
# single source instead of being maintained separately.
_SAMPLE_PAIRS = (
    ("Molly ate a fish", None),
    ("Давайте посетим море", "Let's visit the seaside"),
    ("I would like to sell you a house", None),
    ("Я пытаюсь купить дачу", "I'm trying to buy a summer home"),
    ("J'aimerais vous louer un grand appartement", "I would like to rent a large apartment to you"),
    ("This is a wonderful investment opportunity", None),
    ("これは素晴らしい投資機会です", "This is a great investment opportunity"),
    ("野球はあなたが思うよりも面白いことがあります", "Baseball can be more interesting than you think"),
)

# Sentences in display order; used to pre-fill the "Sentences" textbox.
sentences = [sentence for sentence, _ in _SAMPLE_PAIRS]

# Non-English sentence -> English gloss, shown next to the sentence in results.
translations = {
    sentence: gloss for sentence, gloss in _SAMPLE_PAIRS if gloss is not None
}

# One sentence per line, the format the search handler splits on.
samples = "\n".join(sentences)

# Both embedding models are loaded once at import time so every request
# reuses them. `model` produces the scores reported as `score_multi`
# (multilingual checkpoint); `model2` produces `score_en`.
model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)
model2 = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L12-v2"
)


def do_action(query, documents):
    """Score every line of ``documents`` against ``query`` with both models.

    Args:
        query: The search text typed by the user.
        documents: Candidate sentences, one per line. Surrounding whitespace
            is stripped and blank lines are ignored.

    Returns:
        A ``(title, table)`` tuple. ``title`` is a Markdown heading naming the
        query. ``table`` has one row per candidate sentence with the columns
        ``text``, ``score_multi`` (score from ``model``) and ``score_en``
        (score from ``model2``), sorted by ``score_multi`` descending. It is
        a styled pandas table, or a plain empty ``DataFrame`` with the same
        columns when ``documents`` contains no non-blank line.
    """
    heading = f"## Matches for \"{query}\""
    columns = ['text', 'score_multi', 'score_en']

    # Drop blank lines (e.g. a trailing newline in the textbox) so they are
    # not embedded and ranked as if they were real sentences.
    candidates = [line.strip() for line in documents.split("\n")]
    candidates = [line for line in candidates if line]
    if not candidates:
        return heading, pd.DataFrame(columns=columns)

    # semantic_search keeps only the 10 best hits by default; ask for all of
    # them so both models report a score for every candidate.
    top_k = len(candidates)
    multi_hits = util.semantic_search(
        model.encode([query]), model.encode(candidates), top_k=top_k
    )[0]
    en_hits = util.semantic_search(
        model2.encode([query]), model2.encode(candidates), top_k=top_k
    )[0]

    # Each hit list is ordered by its own model's score, so the i-th entries
    # of the two lists generally refer to different sentences. Join on
    # corpus_id instead of on position.
    en_score_by_id = {hit['corpus_id']: hit['score'] for hit in en_hits}

    rows = []
    for hit in multi_hits:
        corpus_id = hit['corpus_id']
        text = candidates[corpus_id]
        if text in translations:
            text = f"{text} [english: {translations[text]}]"
        rows.append({
            'text': text,
            'score_multi': hit['score'],
            'score_en': en_score_by_id[corpus_id],
        })

    table = pd.DataFrame(rows, columns=columns) \
        .sort_values(by='score_multi', ascending=False) \
        .style.format(precision=2) \
        .background_gradient('YlGnBu')
    return heading, table

# Input widgets: the query box and the newline-separated candidate sentences,
# both pre-filled so the demo shows results without any typing.
query_input = gr.Textbox(
    value="Get rich quick by flipping cheap houses",
)
docs_input = gr.Textbox(
    value=samples,
    label="Sentences",
)

# Output table holding the per-sentence scores from both models.
output_scores = gr.DataFrame(
    label="",
    wrap=True,
    scale=2,
)

title = "Multilingual Semantic Search/Similarity Comparison"
desc = """
A small demo to compare the [all-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2) and [paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) embedding models. Notice how the multilingual model scores similar sentences higher, even if they aren't in the same language!
"""

# The handler returns (markdown heading, score table), matching `outputs`.
demo = gr.Interface(
    fn=do_action,
    title=title,
    description=desc,
    inputs=[query_input, docs_input],
    outputs=["markdown", output_scores],
)
demo.launch()