Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -116,7 +116,10 @@ class FileHandler:
|
|
| 116 |
def simple_tokenize(text):
|
| 117 |
return text.split()
|
| 118 |
|
| 119 |
-
def preprocess_text(text, lang='german'):
|
|
|
|
|
|
|
|
|
|
| 120 |
text = text.lower()
|
| 121 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
| 122 |
|
|
@@ -141,13 +144,29 @@ def preprocess_text(text, lang='german'):
|
|
| 141 |
|
| 142 |
return ' '.join(tokens)
|
| 143 |
|
| 144 |
-
def phonetic_match(text, query, method='levenshtein_distance'):
|
|
|
|
|
|
|
| 145 |
if method == 'levenshtein_distance':
|
| 146 |
text_phonetic = jellyfish.soundex(text)
|
| 147 |
query_phonetic = jellyfish.soundex(query)
|
| 148 |
return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
|
| 149 |
return 0
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
| 152 |
tokenized_texts = [text.split() for text in texts]
|
| 153 |
|
|
@@ -399,7 +418,7 @@ def rerank_results(results, query, reranker):
|
|
| 399 |
return reranked_results
|
| 400 |
|
| 401 |
# Main Comparison Function
|
| 402 |
-
def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, use_reranking=False):
|
| 403 |
all_results = []
|
| 404 |
all_stats = []
|
| 405 |
settings = {
|
|
@@ -431,6 +450,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
| 431 |
overlap_size,
|
| 432 |
custom_separators.split(',') if custom_separators else None,
|
| 433 |
lang,
|
|
|
|
| 434 |
custom_tokenizer_file,
|
| 435 |
custom_tokenizer_model,
|
| 436 |
int(custom_tokenizer_vocab_size),
|
|
@@ -442,12 +462,7 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
| 442 |
chunks = optimized_chunks
|
| 443 |
|
| 444 |
if use_query_optimization:
|
| 445 |
-
|
| 446 |
-
model_id="google/flan-t5-base",
|
| 447 |
-
task="text2text-generation",
|
| 448 |
-
model_kwargs={"temperature": 0, "max_length": 64},
|
| 449 |
-
)
|
| 450 |
-
optimized_queries = optimize_query(query, llm)
|
| 451 |
query = " ".join(optimized_queries)
|
| 452 |
|
| 453 |
results, search_time, vector_store, results_raw = search_embeddings(
|
|
@@ -458,9 +473,10 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
| 458 |
query,
|
| 459 |
top_k,
|
| 460 |
lang,
|
|
|
|
| 461 |
phonetic_weight
|
| 462 |
)
|
| 463 |
-
|
| 464 |
if use_reranking:
|
| 465 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 466 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
@@ -506,7 +522,15 @@ def launch_interface(share=True):
|
|
| 506 |
with gr.Tab("Simple"):
|
| 507 |
file_input = gr.File(label="Upload File (Optional)")
|
| 508 |
query_input = gr.Textbox(label="Search Query")
|
| 509 |
-
embedding_models_input = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
| 511 |
|
| 512 |
with gr.Tab("Advanced"):
|
|
@@ -520,13 +544,16 @@ def launch_interface(share=True):
|
|
| 520 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
| 521 |
|
| 522 |
with gr.Tab("Optional"):
|
|
|
|
| 523 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
|
|
|
| 524 |
phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
|
| 525 |
custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
|
| 526 |
custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
|
| 527 |
custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
|
| 528 |
custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
|
| 529 |
use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
|
|
|
|
| 530 |
use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
|
| 531 |
|
| 532 |
results_output = gr.Dataframe(label="Results", interactive=False)
|
|
@@ -540,13 +567,15 @@ def launch_interface(share=True):
|
|
| 540 |
file_input, query_input, embedding_models_input, custom_embedding_model_input,
|
| 541 |
split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
|
| 542 |
vector_store_type_input, search_type_input, top_k_input, lang_input,
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
|
|
|
| 546 |
],
|
| 547 |
outputs=[results_output, stats_output, plot_output]
|
| 548 |
)
|
| 549 |
|
|
|
|
| 550 |
tutorial_md = """
|
| 551 |
# Advanced Embedding Comparison Tool Tutorial
|
| 552 |
|
|
|
|
| 116 |
def simple_tokenize(text):
|
| 117 |
return text.split()
|
| 118 |
|
| 119 |
+
def preprocess_text(text, lang='german', apply_preprocessing=True):
|
| 120 |
+
if not apply_preprocessing:
|
| 121 |
+
return text
|
| 122 |
+
|
| 123 |
text = text.lower()
|
| 124 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
| 125 |
|
|
|
|
| 144 |
|
| 145 |
return ' '.join(tokens)
|
| 146 |
|
| 147 |
+
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=True):
|
| 148 |
+
if not apply_phonetic:
|
| 149 |
+
return 0
|
| 150 |
if method == 'levenshtein_distance':
|
| 151 |
text_phonetic = jellyfish.soundex(text)
|
| 152 |
query_phonetic = jellyfish.soundex(query)
|
| 153 |
return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
|
| 154 |
return 0
|
| 155 |
|
| 156 |
+
def optimize_query(query, llm_model):
|
| 157 |
+
llm = HuggingFacePipeline.from_model_id(
|
| 158 |
+
model_id=llm_model,
|
| 159 |
+
task="text2text-generation",
|
| 160 |
+
model_kwargs={"temperature": 0, "max_length": 64},
|
| 161 |
+
)
|
| 162 |
+
multi_query_retriever = MultiQueryRetriever.from_llm(
|
| 163 |
+
retriever=get_retriever(vector_store, search_type, search_kwargs),
|
| 164 |
+
llm=llm
|
| 165 |
+
)
|
| 166 |
+
optimized_queries = multi_query_retriever.generate_queries(query)
|
| 167 |
+
return optimized_queries
|
| 168 |
+
|
| 169 |
+
|
| 170 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
| 171 |
tokenized_texts = [text.split() for text in texts]
|
| 172 |
|
|
|
|
| 418 |
return reranked_results
|
| 419 |
|
| 420 |
# Main Comparison Function
|
| 421 |
+
def compare_embeddings(file, query, embedding_models, custom_embedding_model, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', apply_preprocessing=True, optimize_vocab=False, apply_phonetic=True, phonetic_weight=0.3, custom_tokenizer_file=None, custom_tokenizer_model=None, custom_tokenizer_vocab_size=10000, custom_tokenizer_special_tokens=None, use_query_optimization=False, query_optimization_model="google/flan-t5-base", use_reranking=False):
|
| 422 |
all_results = []
|
| 423 |
all_stats = []
|
| 424 |
settings = {
|
|
|
|
| 450 |
overlap_size,
|
| 451 |
custom_separators.split(',') if custom_separators else None,
|
| 452 |
lang,
|
| 453 |
+
apply_preprocessing,
|
| 454 |
custom_tokenizer_file,
|
| 455 |
custom_tokenizer_model,
|
| 456 |
int(custom_tokenizer_vocab_size),
|
|
|
|
| 462 |
chunks = optimized_chunks
|
| 463 |
|
| 464 |
if use_query_optimization:
|
| 465 |
+
optimized_queries = optimize_query(query, query_optimization_model)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
query = " ".join(optimized_queries)
|
| 467 |
|
| 468 |
results, search_time, vector_store, results_raw = search_embeddings(
|
|
|
|
| 473 |
query,
|
| 474 |
top_k,
|
| 475 |
lang,
|
| 476 |
+
apply_phonetic,
|
| 477 |
phonetic_weight
|
| 478 |
)
|
| 479 |
+
|
| 480 |
if use_reranking:
|
| 481 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 482 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
|
|
| 522 |
with gr.Tab("Simple"):
|
| 523 |
file_input = gr.File(label="Upload File (Optional)")
|
| 524 |
query_input = gr.Textbox(label="Search Query")
|
| 525 |
+
embedding_models_input = gr.CheckboxGroup(
|
| 526 |
+
choices=[
|
| 527 |
+
"HuggingFace:paraphrase-miniLM",
|
| 528 |
+
"HuggingFace:paraphrase-mpnet",
|
| 529 |
+
"OpenAI:text-embedding-ada-002",
|
| 530 |
+
"Cohere:embed-multilingual-v2.0"
|
| 531 |
+
],
|
| 532 |
+
label="Embedding Models"
|
| 533 |
+
)
|
| 534 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
| 535 |
|
| 536 |
with gr.Tab("Advanced"):
|
|
|
|
| 544 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
| 545 |
|
| 546 |
with gr.Tab("Optional"):
|
| 547 |
+
apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=True)
|
| 548 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
| 549 |
+
apply_phonetic_input = gr.Checkbox(label="Apply Phonetic Matching", value=True)
|
| 550 |
phonetic_weight_input = gr.Slider(0, 1, step=0.1, value=0.3, label="Phonetic Matching Weight")
|
| 551 |
custom_tokenizer_file_input = gr.File(label="Custom Tokenizer File (Optional)")
|
| 552 |
custom_tokenizer_model_input = gr.Textbox(label="Custom Tokenizer Model (e.g., WordLevel, BPE, Unigram)")
|
| 553 |
custom_tokenizer_vocab_size_input = gr.Textbox(label="Custom Tokenizer Vocab Size", value="10000")
|
| 554 |
custom_tokenizer_special_tokens_input = gr.Textbox(label="Custom Tokenizer Special Tokens (comma-separated)")
|
| 555 |
use_query_optimization_input = gr.Checkbox(label="Use Query Optimization", value=False)
|
| 556 |
+
query_optimization_model_input = gr.Textbox(label="Query Optimization Model", value="google/flan-t5-base")
|
| 557 |
use_reranking_input = gr.Checkbox(label="Use Reranking", value=False)
|
| 558 |
|
| 559 |
results_output = gr.Dataframe(label="Results", interactive=False)
|
|
|
|
| 567 |
file_input, query_input, embedding_models_input, custom_embedding_model_input,
|
| 568 |
split_strategy_input, chunk_size_input, overlap_size_input, custom_separators_input,
|
| 569 |
vector_store_type_input, search_type_input, top_k_input, lang_input,
|
| 570 |
+
apply_preprocessing_input, optimize_vocab_input, apply_phonetic_input,
|
| 571 |
+
phonetic_weight_input, custom_tokenizer_file_input, custom_tokenizer_model_input,
|
| 572 |
+
custom_tokenizer_vocab_size_input, custom_tokenizer_special_tokens_input,
|
| 573 |
+
use_query_optimization_input, query_optimization_model_input, use_reranking_input
|
| 574 |
],
|
| 575 |
outputs=[results_output, stats_output, plot_output]
|
| 576 |
)
|
| 577 |
|
| 578 |
+
|
| 579 |
tutorial_md = """
|
| 580 |
# Advanced Embedding Comparison Tool Tutorial
|
| 581 |
|