Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -41,8 +41,8 @@ from huggingface_hub import login
|
|
| 41 |
from typing import List, Tuple, Optional
|
| 42 |
|
| 43 |
|
| 44 |
-
hf_token = os.getenv("hf_token")
|
| 45 |
-
login(token=hf_token)
|
| 46 |
|
| 47 |
# Define the model pipeline with additional generation parameters
|
| 48 |
#model_pipeline = pipeline(
|
|
@@ -154,28 +154,28 @@ class ModelManager:
|
|
| 154 |
}
|
| 155 |
}
|
| 156 |
|
| 157 |
-
|
| 158 |
def update_model_ranking(self, model_id: str, score: float, feedback: Optional[str] = None):
    """Update a model's running ranking with a new performance score.

    The ranking is an exponentially weighted moving average: the stored
    score keeps 70% of its previous value and takes 30% of the new one,
    so rankings adapt to recent runs without being dominated by a single
    measurement.

    Args:
        model_id: Identifier of the model being ranked.
        score: New performance score to fold into the ranking.
        feedback: Optional free-text feedback; when given, it is counted
            and stored in ``self.model_stats`` for this model.
            (Was annotated ``str = None``; ``Optional[str]`` is correct.)
    """
    current_score = self.rankings.get(model_id, 0.0)
    # Weighted average of current score and new score (EWMA, alpha = 0.3).
    self.rankings[model_id] = 0.7 * current_score + 0.3 * score

    if feedback:
        # Lazily create the stats bucket the first time a model gets feedback.
        stats = self.model_stats.setdefault(model_id, {"feedback_count": 0, "feedback": []})
        stats["feedback_count"] += 1
        stats["feedback"].append(feedback)
|
| 169 |
-
|
| 170 |
def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
    """Return the top ``n`` models as ``(model_id, score)`` pairs, best first.

    Ties keep their original relative order because the sort is stable.
    """
    ranked = list(self.rankings.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked[:n]
|
| 173 |
-
|
| 174 |
def get_model_stats(self, model_id: str) -> Dict[str, Any]:
    """Return the recorded statistics for ``model_id``.

    Unknown models yield an empty dict rather than raising.
    """
    stats = self.model_stats.get(model_id)
    return stats if stats is not None else {}
|
| 177 |
|
| 178 |
-
|
| 179 |
def add_model(self, provider, name, model_path):
|
| 180 |
if provider not in self.models:
|
| 181 |
self.models[provider] = {}
|
|
@@ -286,29 +286,29 @@ def simple_tokenize(text):
|
|
| 286 |
def preprocess_text(text, lang='german', apply_preprocessing=False):
    """Optionally lowercase, clean, tokenize, de-stopword and stem ``text``.

    When ``apply_preprocessing`` is False (the default) the text is
    returned untouched. Otherwise the pipeline is: lowercase, strip
    punctuation and digits, tokenize with NLTK, drop stopwords for
    ``lang``, and stem with a Snowball stemmer. Every NLTK resource is
    optional — a missing resource downgrades that step with a printed
    warning instead of failing.

    Args:
        text: Raw input text.
        lang: NLTK language name used for the tokenizer, stopwords and stemmer.
        apply_preprocessing: Master switch for the whole pipeline.

    Returns:
        The processed tokens joined with single spaces, or the original
        ``text`` when preprocessing is disabled.
    """
    if not apply_preprocessing:
        return text

    text = text.lower()
    # Drop punctuation and digits but keep Unicode letters: the previous
    # pattern [^a-zA-Z\s] silently deleted umlauts and ß even though the
    # default language is German.
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"[\d_]", "", text)

    try:
        tokens = word_tokenize(text, language=lang)
    except LookupError:
        print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
        tokens = simple_tokenize(text)

    try:
        stop_words = set(stopwords.words(lang))
    except LookupError:
        print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
        stop_words = set()
    tokens = [token for token in tokens if token not in stop_words]

    try:
        stemmer = SnowballStemmer(lang)
        tokens = [stemmer.stem(token) for token in tokens]
    except ValueError:
        print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")

    return ' '.join(tokens)
|
| 313 |
|
| 314 |
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
|
|
@@ -341,7 +341,7 @@ def optimize_query(
|
|
| 341 |
) -> str:
|
| 342 |
"""
|
| 343 |
CPU-optimized version of query expansion using a small language model.
|
| 344 |
-
|
| 345 |
Args:
|
| 346 |
query: Original search query
|
| 347 |
query_optimization_model: Name or path of the model to use for optimization
|
|
@@ -351,17 +351,17 @@ def optimize_query(
|
|
| 351 |
search_type: Type of search being performed
|
| 352 |
top_k: Number of expansion terms to add
|
| 353 |
use_gpu: Whether to use GPU if available (defaults to False for CPU)
|
| 354 |
-
|
| 355 |
Returns:
|
| 356 |
Expanded query string
|
| 357 |
"""
|
| 358 |
try:
|
| 359 |
# Set device
|
| 360 |
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
|
| 361 |
-
|
| 362 |
# 1. Basic text preprocessing (CPU-based)
|
| 363 |
tokens = word_tokenize(query.lower())
|
| 364 |
-
|
| 365 |
# 2. WordNet synonyms expansion (CPU-based)
|
| 366 |
expanded_terms = set()
|
| 367 |
for token in tokens:
|
|
@@ -370,7 +370,7 @@ def optimize_query(
|
|
| 370 |
for syn in synsets:
|
| 371 |
# Limit number of lemmas
|
| 372 |
expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
|
| 373 |
-
|
| 374 |
# 3. Use provided model with reduced complexity
|
| 375 |
try:
|
| 376 |
# Load model with reduced memory footprint
|
|
@@ -384,11 +384,11 @@ def optimize_query(
|
|
| 384 |
low_cpu_mem_usage=True,
|
| 385 |
device_map="cpu"
|
| 386 |
)
|
| 387 |
-
|
| 388 |
# Move model to CPU and eval mode
|
| 389 |
model = model.to(device)
|
| 390 |
model.eval()
|
| 391 |
-
|
| 392 |
# Prepare input with reduced length
|
| 393 |
prompt = f"Enhance this search query with relevant terms: {query}"
|
| 394 |
inputs = tokenizer(
|
|
@@ -398,7 +398,7 @@ def optimize_query(
|
|
| 398 |
truncation=True,
|
| 399 |
padding=True
|
| 400 |
)
|
| 401 |
-
|
| 402 |
# Generate with minimal parameters
|
| 403 |
with torch.no_grad():
|
| 404 |
outputs = model.generate(
|
|
@@ -409,41 +409,41 @@ def optimize_query(
|
|
| 409 |
do_sample=False,
|
| 410 |
early_stopping=True
|
| 411 |
)
|
| 412 |
-
|
| 413 |
enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 414 |
-
|
| 415 |
# Clear CUDA cache if GPU was used
|
| 416 |
if device == "cuda":
|
| 417 |
torch.cuda.empty_cache()
|
| 418 |
-
|
| 419 |
except Exception as model_error:
|
| 420 |
print(f"Model-based expansion failed: {str(model_error)}")
|
| 421 |
enhanced_query = query
|
| 422 |
-
|
| 423 |
# 4. Combine original and expanded terms
|
| 424 |
final_terms = set(tokens)
|
| 425 |
final_terms.update(expanded_terms)
|
| 426 |
if enhanced_query != query:
|
| 427 |
final_terms.update(word_tokenize(enhanced_query.lower()))
|
| 428 |
-
|
| 429 |
# 5. Remove stopwords and select top_k most relevant terms
|
| 430 |
stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
|
| 431 |
final_terms = [term for term in final_terms if term not in stopwords]
|
| 432 |
-
|
| 433 |
# Combine with original query
|
| 434 |
expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
|
| 435 |
-
|
| 436 |
# Clean up
|
| 437 |
del model
|
| 438 |
del tokenizer
|
| 439 |
if device == "cuda":
|
| 440 |
torch.cuda.empty_cache()
|
| 441 |
-
|
| 442 |
-
return [Document(page_content=expanded_query.strip())]
|
| 443 |
-
|
| 444 |
except Exception as e:
|
| 445 |
print(f"Query optimization failed: {str(e)}")
|
| 446 |
-
return [Document(page_content=query)] # Return original query if optimization fails
|
| 447 |
|
| 448 |
|
| 449 |
|
|
@@ -458,27 +458,27 @@ optimized_query = optimize_query(
|
|
| 458 |
use_gpu=False # Explicitly use CPU
|
| 459 |
)
|
| 460 |
"""
|
| 461 |
-
|
| 462 |
|
| 463 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
    """Train a gensim embedding model on whitespace-tokenized ``texts``.

    Args:
        texts: Iterable of raw strings; each is split on whitespace.
        model_type: Either ``'word2vec'`` or ``'fasttext'``.
        vector_size: Dimensionality of the learned vectors.
        window: Context window size.
        min_count: Minimum corpus frequency for a word to be kept.

    Returns:
        The trained ``Word2Vec`` or ``FastText`` model.

    Raises:
        ValueError: If ``model_type`` is not a supported name.
    """
    corpus = [sample.split() for sample in texts]

    if model_type == 'word2vec':
        trainer_cls = Word2Vec
    elif model_type == 'fasttext':
        trainer_cls = FastText
    else:
        raise ValueError("Unsupported model type")

    return trainer_cls(sentences=corpus, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
| 474 |
|
| 475 |
class CustomEmbeddings(HuggingFaceEmbeddings):
    """Embeddings wrapper around a locally trained gensim model.

    NOTE(review): ``__init__`` never calls ``super().__init__()``.
    ``HuggingFaceEmbeddings`` is typically a pydantic model, so skipping
    the base initializer may break attribute handling — confirm against
    the installed langchain version.
    """

    def __init__(self, model_path):
        # Load a pre-trained gensim model from disk.
        self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models

    def embed_documents(self, texts):
        # Indexing ``wv`` with a token list returns one vector per token,
        # so each document maps to a (num_tokens, vector_size) matrix.
        # NOTE(review): most embedding consumers expect ONE fixed-size
        # vector per document (e.g. a mean over token vectors) — verify
        # callers can handle per-token matrices here.
        return [self.model.wv[text.split()] for text in texts]

    def embed_query(self, text):
        # Same per-token lookup as embed_documents, for a single query string.
        return self.model.wv[text.split()]
|
| 484 |
|
|
@@ -520,7 +520,7 @@ def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separator
|
|
| 520 |
chunk_size=chunk_size,
|
| 521 |
chunk_overlap=overlap_size,
|
| 522 |
add_start_index=True, # If `True`, includes chunk's start index in metadata
|
| 523 |
-
strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
|
| 524 |
separators=custom_separators or ["\n\n", "\n", " ", ""]
|
| 525 |
)
|
| 526 |
else:
|
|
@@ -534,7 +534,7 @@ def get_embedding_model(model_type, model_name):
|
|
| 534 |
multi_process=True,
|
| 535 |
# model_kwargs={"device": "cpu"},
|
| 536 |
#encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
|
| 537 |
-
)
|
| 538 |
elif model_type == 'OpenAI':
|
| 539 |
return OpenAIEmbeddings(model=model_path)
|
| 540 |
elif model_type == 'Cohere':
|
|
@@ -566,10 +566,10 @@ def custom_similarity(query_embedding, doc_embedding, query, doc_text, phonetic_
|
|
| 566 |
phonetic_sim = phonetic_match(doc_text, query)
|
| 567 |
combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
|
| 568 |
return combined_sim
|
| 569 |
-
|
| 570 |
def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
|
| 571 |
chunks = list(chunks_tuple)
|
| 572 |
-
|
| 573 |
if vector_store_type == 'FAISS':
|
| 574 |
return FAISS.from_texts(chunks, embedding_model)
|
| 575 |
elif vector_store_type == 'Chroma':
|
|
@@ -587,7 +587,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
|
|
| 587 |
for file in os.listdir(FILES_DIR):
|
| 588 |
file_path = os.path.join(FILES_DIR, file)
|
| 589 |
text += FileHandler.extract_text(file_path)
|
| 590 |
-
|
| 591 |
if custom_tokenizer_file:
|
| 592 |
tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
|
| 593 |
text = ' '.join(custom_tokenize(text, tokenizer))
|
|
@@ -603,7 +603,7 @@ def process_files(file_path, model_type, model_name, split_strategy, chunk_size,
|
|
| 603 |
|
| 604 |
def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
|
| 605 |
preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
|
| 606 |
-
|
| 607 |
vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
|
| 608 |
retriever = get_retriever(vector_store, search_type, {"k": top_k})
|
| 609 |
|
|
@@ -613,10 +613,10 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
|
|
| 613 |
#this should be optional
|
| 614 |
def score_result(doc):
|
| 615 |
base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
|
| 616 |
-
|
| 617 |
# Add bonus for containing expected result
|
| 618 |
expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
|
| 619 |
-
|
| 620 |
if apply_phonetic:
|
| 621 |
phonetic_score = phonetic_match(doc.page_content, query)
|
| 622 |
return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
|
|
@@ -645,7 +645,7 @@ def search_embeddings(chunks, embedding_model, vector_store_type, search_type, q
|
|
| 645 |
# Enhanced Result Analysis
|
| 646 |
class ResultAnalyzer:
|
| 647 |
@staticmethod
|
| 648 |
-
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
|
| 649 |
top_k, expected_result=None, model_feedback=None):
|
| 650 |
stats = {
|
| 651 |
"num_results": len(results),
|
|
@@ -657,7 +657,7 @@ class ResultAnalyzer:
|
|
| 657 |
"embedding_dimension": len(embedding_model.embed_query(query)),
|
| 658 |
"top_k": top_k,
|
| 659 |
}
|
| 660 |
-
|
| 661 |
# Add vector store statistics
|
| 662 |
try:
|
| 663 |
if hasattr(vector_store, '_index'):
|
|
@@ -666,13 +666,13 @@ class ResultAnalyzer:
|
|
| 666 |
stats["vector_store_size"] = len(vector_store._collection.get())
|
| 667 |
except:
|
| 668 |
stats["vector_store_size"] = "N/A"
|
| 669 |
-
|
| 670 |
# Add expected result statistics if provided
|
| 671 |
if expected_result:
|
| 672 |
stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
|
| 673 |
-
stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
|
| 674 |
if expected_result in doc.page_content), -1) + 1
|
| 675 |
-
|
| 676 |
# Calculate diversity metrics for larger result sets
|
| 677 |
if len(results) > 3: # Changed from 1000 to make it more practical
|
| 678 |
embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
|
@@ -681,7 +681,7 @@ class ResultAnalyzer:
|
|
| 681 |
else:
|
| 682 |
stats["result_diversity"] = "N/A"
|
| 683 |
stats["silhouette_score"] = "N/A"
|
| 684 |
-
|
| 685 |
# Add ranking correlation
|
| 686 |
query_embedding = embedding_model.embed_query(query)
|
| 687 |
result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
|
@@ -691,20 +691,20 @@ class ResultAnalyzer:
|
|
| 691 |
stats["rank_correlation"] = rank_correlation
|
| 692 |
else:
|
| 693 |
stats["rank_correlation"] = "N/A"
|
| 694 |
-
|
| 695 |
# Add model feedback if provided
|
| 696 |
if model_feedback:
|
| 697 |
stats["model_feedback"] = model_feedback
|
| 698 |
-
|
| 699 |
return stats
|
| 700 |
-
|
| 701 |
@staticmethod
|
| 702 |
def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
|
| 703 |
"""Calculate diversity score for embeddings"""
|
| 704 |
embeddings_array = np.array(embeddings)
|
| 705 |
pairwise_similarities = np.inner(embeddings_array, embeddings_array)
|
| 706 |
return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
|
| 707 |
-
|
| 708 |
@staticmethod
|
| 709 |
def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
|
| 710 |
"""Calculate silhouette score for embeddings"""
|
|
@@ -724,13 +724,13 @@ def visualize_results(results_df, stats_df):
|
|
| 724 |
# Add model column if not present
|
| 725 |
if 'model' not in stats_df.columns:
|
| 726 |
stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
|
| 727 |
-
|
| 728 |
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
|
| 729 |
-
|
| 730 |
# Handle empty dataframe case
|
| 731 |
if len(stats_df) == 0:
|
| 732 |
return fig
|
| 733 |
-
|
| 734 |
# Create plots with error handling
|
| 735 |
try:
|
| 736 |
sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
|
|
@@ -738,36 +738,36 @@ def visualize_results(results_df, stats_df):
|
|
| 738 |
axs[0, 0].tick_params(axis='x', rotation=45)
|
| 739 |
except Exception as e:
|
| 740 |
print(f"Error in search time plot: {e}")
|
| 741 |
-
|
| 742 |
try:
|
| 743 |
-
sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
|
| 744 |
hue='model', ax=axs[0, 1])
|
| 745 |
axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
|
| 746 |
except Exception as e:
|
| 747 |
print(f"Error in diversity plot: {e}")
|
| 748 |
-
|
| 749 |
try:
|
| 750 |
sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
|
| 751 |
axs[1, 0].set_title('Distribution of Result Content Lengths')
|
| 752 |
axs[1, 0].tick_params(axis='x', rotation=45)
|
| 753 |
except Exception as e:
|
| 754 |
print(f"Error in content length plot: {e}")
|
| 755 |
-
|
| 756 |
try:
|
| 757 |
valid_embeddings = results_df['embedding'].dropna().values
|
| 758 |
if len(valid_embeddings) > 1:
|
| 759 |
tsne = TSNE(n_components=2, random_state=42)
|
| 760 |
embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
|
| 761 |
-
sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
|
| 762 |
-
hue=results_df['Model'][:len(valid_embeddings)],
|
| 763 |
ax=axs[1, 1])
|
| 764 |
axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
|
| 765 |
else:
|
| 766 |
-
axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
|
| 767 |
ha='center', va='center')
|
| 768 |
except Exception as e:
|
| 769 |
print(f"Error in embedding visualization: {e}")
|
| 770 |
-
|
| 771 |
plt.tight_layout()
|
| 772 |
return fig
|
| 773 |
|
|
@@ -778,56 +778,56 @@ def visualize_results(results_df, stats_df):
|
|
| 778 |
#plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
|
| 779 |
#plt.show()
|
| 780 |
|
| 781 |
-
|
| 782 |
def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
    """Prune rare words from ``texts`` and fit a BPE tokenizer on the result.

    Words occurring fewer than ``min_frequency`` times across the whole
    corpus are dropped before the tokenizer is trained.

    Returns:
        A ``(tokenizer, optimized_texts)`` tuple: the trained
        ``tokenizers.Tokenizer`` and the filtered corpus it saw.
    """
    # Corpus-wide word frequencies over whitespace tokens.
    word_freq = Counter(word for text in texts for word in text.split())

    def frequent_enough(word):
        # Retain only words common enough to deserve a vocabulary slot.
        return word_freq[word] >= min_frequency

    optimized_texts = [' '.join(filter(frequent_enough, text.split())) for text in texts]

    bpe_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    bpe_tokenizer.train_from_iterator(optimized_texts, bpe_trainer)

    return bpe_tokenizer, optimized_texts
|
| 796 |
-
|
| 797 |
import numpy as np
|
| 798 |
from transformers import TextClassificationPipeline
|
| 799 |
from typing import List, Union, Any
|
| 800 |
|
| 801 |
-
|
| 802 |
|
| 803 |
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 804 |
|
| 805 |
|
| 806 |
def rerank_results(
    results: List[Any],
    query: str,
    reranker: Union[TextClassificationPipeline, Any]
) -> List[Any]:
    """Reorder ``results`` by semantic similarity to ``query``, best first.

    Similarity is the cosine similarity between SentenceTransformer
    embeddings of the query and each document's ``page_content``.

    NOTE(review): the ``reranker`` argument is accepted but never used —
    scoring relies on the module-level SentenceTransformer ``model``.
    Confirm whether callers expect the cross-encoder to be applied.
    """
    if not results:
        return results

    # Embed the query and all document bodies (one batch each).
    q_emb = model.encode(query, convert_to_tensor=True)
    contents = [doc.page_content for doc in results]
    d_embs = model.encode(contents, convert_to_tensor=True)

    # Cosine similarity of the query against every document.
    scores = util.cos_sim(q_emb, d_embs)[0]

    # Indices sorted by descending similarity, then applied to the inputs.
    order = np.argsort(scores.numpy())[::-1]
    return [results[pos] for pos in order]
|
| 832 |
|
| 833 |
|
|
@@ -878,13 +878,13 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
| 878 |
if optimize_vocab:
|
| 879 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
| 880 |
chunks = optimized_chunks
|
| 881 |
-
|
| 882 |
search_query = query
|
| 883 |
-
|
| 884 |
if use_query_optimization:
|
| 885 |
optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
|
| 886 |
#query = " ".join(optimized_queries)
|
| 887 |
-
search_query = " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
|
| 888 |
|
| 889 |
results, search_time, vector_store, results_raw = search_embeddings(
|
| 890 |
chunks,
|
|
@@ -897,8 +897,8 @@ def compare_embeddings(file, query, embedding_models, custom_embedding_model, sp
|
|
| 897 |
lang,
|
| 898 |
apply_phonetic,
|
| 899 |
phonetic_weight
|
| 900 |
-
)
|
| 901 |
-
|
| 902 |
if use_reranking:
|
| 903 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 904 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
@@ -953,7 +953,7 @@ from tqdm import tqdm
|
|
| 953 |
def automated_testing(file, query, test_params, expected_result=None):
|
| 954 |
all_results = []
|
| 955 |
all_stats = []
|
| 956 |
-
|
| 957 |
param_grid = ParameterGrid(test_params)
|
| 958 |
print(param_grid)
|
| 959 |
for params in tqdm(param_grid, desc="Running tests"):
|
|
@@ -995,7 +995,7 @@ def automated_testing(file, query, test_params, expected_result=None):
|
|
| 995 |
params['apply_phonetic'],
|
| 996 |
params['phonetic_weight']
|
| 997 |
)
|
| 998 |
-
|
| 999 |
if params['use_reranking']:
|
| 1000 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 1001 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
@@ -1022,17 +1022,27 @@ def analyze_results(stats_df):
|
|
| 1022 |
'contains_expected': 0.5, # High weight for containing the expected result
|
| 1023 |
'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
|
| 1024 |
}
|
| 1025 |
-
|
|
|
|
|
|
|
|
|
|
| 1026 |
for metric in metric_weights.keys():
|
| 1027 |
-
|
| 1028 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1029 |
stats_df['weighted_score'] = sum(
|
| 1030 |
-
stats_df[metric].fillna(0) * weight
|
| 1031 |
for metric, weight in metric_weights.items()
|
| 1032 |
)
|
| 1033 |
-
|
| 1034 |
best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
|
| 1035 |
-
|
| 1036 |
recommendations = {
|
| 1037 |
'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
|
| 1038 |
'best_settings': {
|
|
@@ -1059,7 +1069,7 @@ def analyze_results(stats_df):
|
|
| 1059 |
'expected_result_rank': int(best_config['expected_result_rank'])
|
| 1060 |
}
|
| 1061 |
}
|
| 1062 |
-
|
| 1063 |
return recommendations
|
| 1064 |
|
| 1065 |
####
|
|
@@ -1069,72 +1079,85 @@ def get_llm_suggested_settings(file, num_chunks=1):
|
|
| 1069 |
return {"error": "No file uploaded"}
|
| 1070 |
|
| 1071 |
chunks, _, _ = process_files(
|
| 1072 |
-
file.name,
|
| 1073 |
-
'HuggingFace',
|
| 1074 |
-
'paraphrase-miniLM',
|
| 1075 |
-
'recursive',
|
| 1076 |
-
250,
|
| 1077 |
50,
|
| 1078 |
custom_separators=None
|
| 1079 |
)
|
| 1080 |
-
|
| 1081 |
# Select a few random chunks
|
| 1082 |
sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
-
|
| 1110 |
-
|
| 1111 |
-
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
-
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
|
| 1137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
print("setting suggested")
|
| 1139 |
print(suggested_settings)
|
| 1140 |
# Parse the generated text to extract the dictionary
|
|
@@ -1160,7 +1183,7 @@ Provide your suggestions in a Python dictionary format."""
|
|
| 1160 |
def update_inputs_with_llm_suggestions(suggestions):
|
| 1161 |
if suggestions is None or "error" in suggestions:
|
| 1162 |
return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
|
| 1163 |
-
|
| 1164 |
return [
|
| 1165 |
gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
|
| 1166 |
gr.update(value=suggestions["split_strategy"]), # split_strategy_input
|
|
@@ -1178,16 +1201,16 @@ def update_inputs_with_llm_suggestions(suggestions):
|
|
| 1178 |
def parse_model_selections(default_models, custom_models):
|
| 1179 |
"""
|
| 1180 |
Parse selected default models and custom models into model configurations
|
| 1181 |
-
|
| 1182 |
Args:
|
| 1183 |
default_models (List[str]): Selected default models in format "type:name"
|
| 1184 |
custom_models (str): Custom models string with one model per line in format "type:name"
|
| 1185 |
-
|
| 1186 |
Returns:
|
| 1187 |
List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
|
| 1188 |
"""
|
| 1189 |
model_configs = []
|
| 1190 |
-
|
| 1191 |
# Process default models
|
| 1192 |
if default_models:
|
| 1193 |
for model in default_models:
|
|
@@ -1196,7 +1219,7 @@ def parse_model_selections(default_models, custom_models):
|
|
| 1196 |
'type': model_type,
|
| 1197 |
'name': model_name
|
| 1198 |
})
|
| 1199 |
-
|
| 1200 |
# Process custom models
|
| 1201 |
if custom_models:
|
| 1202 |
custom_model_lines = custom_models.strip().split('\n')
|
|
@@ -1207,7 +1230,7 @@ def parse_model_selections(default_models, custom_models):
|
|
| 1207 |
'type': model_type.strip(),
|
| 1208 |
'name': model_name.strip()
|
| 1209 |
})
|
| 1210 |
-
|
| 1211 |
return model_configs
|
| 1212 |
|
| 1213 |
def parse_comma_separated(text):
|
|
@@ -1217,12 +1240,12 @@ def parse_comma_separated(text):
|
|
| 1217 |
return [x.strip() for x in text.split(',') if x.strip()]
|
| 1218 |
|
| 1219 |
|
| 1220 |
-
|
| 1221 |
# Gradio Interface
|
| 1222 |
def launch_interface(debug=True):
|
| 1223 |
with gr.Blocks() as iface:
|
| 1224 |
gr.Markdown("# Advanced Embedding Comparison Tool")
|
| 1225 |
-
|
| 1226 |
with gr.Tab("Simple"):
|
| 1227 |
file_input = gr.File(label="Upload File (Optional)")
|
| 1228 |
query_input = gr.Textbox(label="Search Query")
|
|
@@ -1237,7 +1260,7 @@ def launch_interface(debug=True):
|
|
| 1237 |
label="Embedding Models"
|
| 1238 |
)
|
| 1239 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
| 1240 |
-
|
| 1241 |
with gr.Tab("Advanced"):
|
| 1242 |
custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
|
| 1243 |
split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
|
|
@@ -1247,7 +1270,7 @@ def launch_interface(debug=True):
|
|
| 1247 |
vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
|
| 1248 |
search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
|
| 1249 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
| 1250 |
-
|
| 1251 |
with gr.Tab("Expert"):
|
| 1252 |
apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
|
| 1253 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
|
@@ -1265,7 +1288,7 @@ def launch_interface(debug=True):
|
|
| 1265 |
with gr.Row():
|
| 1266 |
auto_file_input = gr.File(label="Upload File (Optional)")
|
| 1267 |
auto_query_input = gr.Textbox(label="Search Query")
|
| 1268 |
-
|
| 1269 |
with gr.Row():
|
| 1270 |
auto_expected_result_input = gr.Textbox(
|
| 1271 |
label="Expected Result (Optional)",
|
|
@@ -1275,18 +1298,18 @@ def launch_interface(debug=True):
|
|
| 1275 |
label="Model Feedback (Optional)",
|
| 1276 |
placeholder="Enter any feedback about model performance"
|
| 1277 |
)
|
| 1278 |
-
|
| 1279 |
with gr.Row():
|
| 1280 |
with gr.Column():
|
| 1281 |
# Default model selection
|
| 1282 |
default_models_input = gr.CheckboxGroup(
|
| 1283 |
-
choices=[f"{type}:{name}"
|
| 1284 |
-
for type, names in DEFAULT_MODELS.items()
|
| 1285 |
for name in names],
|
| 1286 |
label="Default Models",
|
| 1287 |
value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
|
| 1288 |
)
|
| 1289 |
-
|
| 1290 |
with gr.Column():
|
| 1291 |
# Custom model input
|
| 1292 |
custom_models_input = gr.TextArea(
|
|
@@ -1294,7 +1317,7 @@ def launch_interface(debug=True):
|
|
| 1294 |
placeholder="Enter one model per line in format: type:name",
|
| 1295 |
lines=3
|
| 1296 |
)
|
| 1297 |
-
|
| 1298 |
auto_split_strategies = gr.CheckboxGroup(
|
| 1299 |
choices=["token", "recursive"],
|
| 1300 |
label="Split Strategies to Test"
|
|
@@ -1313,21 +1336,21 @@ def launch_interface(debug=True):
|
|
| 1313 |
auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
|
| 1314 |
auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
|
| 1315 |
auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
|
| 1316 |
-
|
| 1317 |
auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
|
| 1318 |
auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
|
| 1319 |
recommendations_output = gr.JSON(label="Recommendations")
|
| 1320 |
-
|
| 1321 |
def run_automation(file_input, query_input, expected_result, default_models, custom_models,
|
| 1322 |
split_strategies, chunk_sizes, overlap_sizes,
|
| 1323 |
vector_store_types, search_types, top_k_values,
|
| 1324 |
optimize_vocab, use_query_optimization, use_reranking,
|
| 1325 |
model_feedback):
|
| 1326 |
"""Wrapper function to handle Gradio inputs and run automated tests"""
|
| 1327 |
-
|
| 1328 |
# Parse model configurations
|
| 1329 |
model_configs = parse_model_selections(default_models, custom_models)
|
| 1330 |
-
|
| 1331 |
# Parse test parameters
|
| 1332 |
test_params = {
|
| 1333 |
'split_strategy': split_strategies,
|
|
@@ -1346,7 +1369,7 @@ def launch_interface(debug=True):
|
|
| 1346 |
'custom_separators': [None],
|
| 1347 |
'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
|
| 1348 |
}
|
| 1349 |
-
|
| 1350 |
# Run automated tests
|
| 1351 |
results_df, stats_df = run_automated_tests(
|
| 1352 |
file_input.name if file_input else None,
|
|
@@ -1356,12 +1379,12 @@ def launch_interface(debug=True):
|
|
| 1356 |
expected_result if expected_result else None,
|
| 1357 |
model_feedback if model_feedback else None
|
| 1358 |
)
|
| 1359 |
-
|
| 1360 |
# Generate recommendations based on results
|
| 1361 |
recommendations = analyze_results(stats_df)
|
| 1362 |
-
|
| 1363 |
return results_df, stats_df, recommendations
|
| 1364 |
-
|
| 1365 |
auto_submit_button = gr.Button("Run Automated Tests")
|
| 1366 |
auto_submit_button.click(
|
| 1367 |
fn=run_automation,
|
|
@@ -1376,25 +1399,25 @@ def launch_interface(debug=True):
|
|
| 1376 |
outputs=[auto_results_output, auto_stats_output, recommendations_output]
|
| 1377 |
)
|
| 1378 |
###
|
| 1379 |
-
|
| 1380 |
with gr.Tab("Results"):
|
| 1381 |
with gr.Row():
|
| 1382 |
results_output = gr.DataFrame(label="Results")
|
| 1383 |
stats_output = gr.DataFrame(label="Statistics")
|
| 1384 |
-
|
| 1385 |
with gr.Row():
|
| 1386 |
plot_output = gr.Plot(label="Visualizations")
|
| 1387 |
model_rankings_output = gr.JSON(label="Model Rankings")
|
| 1388 |
-
|
| 1389 |
with gr.Row():
|
| 1390 |
recommendations_output = gr.JSON(label="Recommendations")
|
| 1391 |
-
|
| 1392 |
with gr.Tab("LLM Suggestions"):
|
| 1393 |
llm_file_input = gr.File(label="Upload File for LLM Suggestions")
|
| 1394 |
llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
|
| 1395 |
llm_suggest_button = gr.Button("Get LLM Suggestions")
|
| 1396 |
llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
|
| 1397 |
-
|
| 1398 |
llm_suggest_button.click(
|
| 1399 |
fn=get_llm_suggested_settings,
|
| 1400 |
inputs=[llm_file_input, llm_num_chunks],
|
|
@@ -1403,9 +1426,9 @@ def launch_interface(debug=True):
|
|
| 1403 |
fn=update_inputs_with_llm_suggestions,
|
| 1404 |
inputs=[llm_suggestions_output],
|
| 1405 |
outputs=[
|
| 1406 |
-
embedding_models_input, split_strategy_input, chunk_size_input,
|
| 1407 |
-
overlap_size_input, vector_store_type_input, search_type_input,
|
| 1408 |
-
top_k_input, apply_preprocessing_input, optimize_vocab_input,
|
| 1409 |
apply_phonetic_input, phonetic_weight_input
|
| 1410 |
]
|
| 1411 |
)
|
|
@@ -1526,7 +1549,7 @@ Create a simple chat interface and test with various queries about the AI Act. F
|
|
| 1526 |
User: "Was sind die Hauptziele des KI-Gesetzes?"
|
| 1527 |
"""
|
| 1528 |
|
| 1529 |
-
|
| 1530 |
tutorial_md = """
|
| 1531 |
# Advanced Embedding Comparison Tool Tutorial
|
| 1532 |
|
|
@@ -1675,13 +1698,13 @@ Measures how well an object fits within its own cluster compared to others. Scor
|
|
| 1675 |
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
|
| 1676 |
with open(file_path, 'r', encoding='utf-8') as f:
|
| 1677 |
text = f.read()
|
| 1678 |
-
|
| 1679 |
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]")) if model_type == 'WordLevel' else Tokenizer(models.BPE(unk_token="[UNK]"))
|
| 1680 |
tokenizer.pre_tokenizer = Whitespace()
|
| 1681 |
-
|
| 1682 |
trainer = trainers.WordLevelTrainer(special_tokens=special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=vocab_size)
|
| 1683 |
tokenizer.train_from_iterator([text], trainer)
|
| 1684 |
-
|
| 1685 |
return tokenizer
|
| 1686 |
````
|
| 1687 |
|
|
@@ -1713,39 +1736,39 @@ def rerank_results(results, query, reranker):
|
|
| 1713 |
|
| 1714 |
|
| 1715 |
## Useful Resources and Links
|
| 1716 |
-
|
| 1717 |
Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
|
| 1718 |
-
|
| 1719 |
### Embeddings and Vector Databases
|
| 1720 |
- [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
|
| 1721 |
- [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
|
| 1722 |
- [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
|
| 1723 |
-
|
| 1724 |
### Natural Language Processing
|
| 1725 |
- [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
|
| 1726 |
- [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
|
| 1727 |
- [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
|
| 1728 |
-
|
| 1729 |
### Retrieval-Augmented Generation (RAG)
|
| 1730 |
- [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
|
| 1731 |
- [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
|
| 1732 |
-
|
| 1733 |
### German Language Processing
|
| 1734 |
- [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
|
| 1735 |
- [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
|
| 1736 |
-
|
| 1737 |
### Benchmarks and Evaluation
|
| 1738 |
- [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
|
| 1739 |
- [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
|
| 1740 |
-
|
| 1741 |
### Tools and Libraries
|
| 1742 |
- [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
|
| 1743 |
- [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
|
| 1744 |
-
|
| 1745 |
### Support me
|
| 1746 |
- [Visual Crew Builder](https://visual-crew.builder.ai/): Tool for create AI systems, workflows and api. Or just a notebook.
|
| 1747 |
-
|
| 1748 |
-
|
| 1749 |
|
| 1750 |
This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
|
| 1751 |
|
|
@@ -1768,7 +1791,7 @@ def create_chat_app(settings):
|
|
| 1768 |
settings['lang'],
|
| 1769 |
settings['apply_preprocessing']
|
| 1770 |
)
|
| 1771 |
-
|
| 1772 |
results, _, _, _ = search_embeddings(
|
| 1773 |
chunks,
|
| 1774 |
embedding_model,
|
|
@@ -1780,12 +1803,12 @@ def create_chat_app(settings):
|
|
| 1780 |
apply_phonetic=settings['apply_phonetic'],
|
| 1781 |
phonetic_weight=settings['phonetic_weight']
|
| 1782 |
)
|
| 1783 |
-
|
| 1784 |
# Generate a response based on the retrieved results
|
| 1785 |
response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
|
| 1786 |
for i, result in enumerate(results[:settings['top_k']]):
|
| 1787 |
response += f"{i+1}. {result['content'][:100]}...\n\n"
|
| 1788 |
-
|
| 1789 |
return response
|
| 1790 |
|
| 1791 |
with gr.Blocks() as chat_interface:
|
|
@@ -1823,7 +1846,7 @@ if __name__ == "__main__":
|
|
| 1823 |
launch_interface()
|
| 1824 |
# Uncomment the following line to launch the sample chat app
|
| 1825 |
´´´
|
| 1826 |
-
|
| 1827 |
"""
|
| 1828 |
|
| 1829 |
|
|
@@ -1832,10 +1855,10 @@ if __name__ == "__main__":
|
|
| 1832 |
["Embedding Comparison", "Tutorial", "Use Case"]
|
| 1833 |
)
|
| 1834 |
|
| 1835 |
-
iface.launch(debug=
|
| 1836 |
|
| 1837 |
# Enhanced Automated Testing
|
| 1838 |
-
def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
|
| 1839 |
test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
|
| 1840 |
model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
| 1841 |
"""
|
|
@@ -1844,16 +1867,16 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
| 1844 |
all_results = []
|
| 1845 |
all_stats = []
|
| 1846 |
model_manager = ModelManager()
|
| 1847 |
-
|
| 1848 |
# Create parameter grid excluding model configurations
|
| 1849 |
base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
|
| 1850 |
param_grid = ParameterGrid(base_params)
|
| 1851 |
-
|
| 1852 |
# Test each model configuration with all parameter combinations
|
| 1853 |
for model_config in tqdm(model_configs, desc="Testing models"):
|
| 1854 |
model_type = model_config['type']
|
| 1855 |
model_name = model_config['name']
|
| 1856 |
-
|
| 1857 |
for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
|
| 1858 |
try:
|
| 1859 |
# Process files and get chunks
|
|
@@ -1868,11 +1891,11 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
| 1868 |
params['lang'],
|
| 1869 |
params['apply_preprocessing']
|
| 1870 |
)
|
| 1871 |
-
|
| 1872 |
# Apply vocabulary optimization if specified
|
| 1873 |
if params['optimize_vocab']:
|
| 1874 |
tokenizer, chunks = optimize_vocabulary(chunks)
|
| 1875 |
-
|
| 1876 |
# Apply query optimization if specified
|
| 1877 |
current_query = query
|
| 1878 |
if params['use_query_optimization']:
|
|
@@ -1886,7 +1909,7 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
| 1886 |
params['top_k']
|
| 1887 |
)
|
| 1888 |
current_query = " ".join(optimized_queries)
|
| 1889 |
-
|
| 1890 |
# Perform search
|
| 1891 |
results, search_time, vector_store, raw_results = search_embeddings(
|
| 1892 |
chunks,
|
|
@@ -1900,25 +1923,25 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
| 1900 |
params['apply_phonetic'],
|
| 1901 |
params['phonetic_weight']
|
| 1902 |
)
|
| 1903 |
-
|
| 1904 |
# Apply reranking if specified
|
| 1905 |
if params['use_reranking']:
|
| 1906 |
-
reranker = pipeline("text-classification",
|
| 1907 |
model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 1908 |
raw_results = rerank_results(raw_results, current_query, reranker)
|
| 1909 |
-
|
| 1910 |
# Calculate statistics
|
| 1911 |
stats = ResultAnalyzer.calculate_statistics(
|
| 1912 |
raw_results, search_time, vector_store, num_tokens,
|
| 1913 |
embedding_model, current_query, params['top_k'],
|
| 1914 |
expected_result, model_feedback
|
| 1915 |
)
|
| 1916 |
-
|
| 1917 |
# Update model rankings
|
| 1918 |
model_id = f"{model_type}:{model_name}"
|
| 1919 |
ranking_score = calculate_model_ranking_score(stats)
|
| 1920 |
model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
|
| 1921 |
-
|
| 1922 |
# Add model information to stats
|
| 1923 |
stats.update({
|
| 1924 |
"model_type": model_type,
|
|
@@ -1926,15 +1949,15 @@ def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str
|
|
| 1926 |
"model": f"{model_type} - {model_name}",
|
| 1927 |
**params
|
| 1928 |
})
|
| 1929 |
-
|
| 1930 |
# Format and store results
|
| 1931 |
all_results.extend(format_results(raw_results, stats))
|
| 1932 |
all_stats.append(stats)
|
| 1933 |
-
|
| 1934 |
except Exception as e:
|
| 1935 |
print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
|
| 1936 |
continue
|
| 1937 |
-
|
| 1938 |
return pd.DataFrame(all_results), pd.DataFrame(all_stats)
|
| 1939 |
|
| 1940 |
# Helper function to calculate model ranking score
|
|
@@ -1947,7 +1970,7 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
|
|
| 1947 |
'contains_expected': 0.3,
|
| 1948 |
'expected_result_rank': -0.2 # Negative weight because lower rank is better
|
| 1949 |
}
|
| 1950 |
-
|
| 1951 |
score = 0.0
|
| 1952 |
for metric, weight in weights.items():
|
| 1953 |
if metric in stats and not isinstance(stats[metric], str):
|
|
@@ -1958,9 +1981,8 @@ def calculate_model_ranking_score(stats: Dict[str, Any]) -> float:
|
|
| 1958 |
else:
|
| 1959 |
value = float(stats[metric])
|
| 1960 |
score += weight * value
|
| 1961 |
-
|
| 1962 |
return score
|
| 1963 |
|
| 1964 |
if __name__ == "__main__":
|
| 1965 |
launch_interface()
|
| 1966 |
-
|
|
|
|
| 41 |
from typing import List, Tuple, Optional
|
| 42 |
|
| 43 |
|
| 44 |
+
#hf_token = os.getenv("hf_token")
|
| 45 |
+
#login(token=hf_token)
|
| 46 |
|
| 47 |
# Define the model pipeline with additional generation parameters
|
| 48 |
#model_pipeline = pipeline(
|
|
|
|
| 154 |
}
|
| 155 |
}
|
| 156 |
|
| 157 |
+
|
| 158 |
def update_model_ranking(self, model_id: str, score: float, feedback: str = None):
|
| 159 |
"""Update model ranking based on performance and optional feedback"""
|
| 160 |
current_score = self.rankings.get(model_id, 0.0)
|
| 161 |
# Weighted average of current score and new score
|
| 162 |
self.rankings[model_id] = 0.7 * current_score + 0.3 * score
|
| 163 |
+
|
| 164 |
if feedback:
|
| 165 |
if model_id not in self.model_stats:
|
| 166 |
self.model_stats[model_id] = {"feedback_count": 0, "feedback": []}
|
| 167 |
self.model_stats[model_id]["feedback_count"] += 1
|
| 168 |
self.model_stats[model_id]["feedback"].append(feedback)
|
| 169 |
+
|
| 170 |
def get_top_models(self, n: int = 5) -> List[Tuple[str, float]]:
|
| 171 |
"""Get top n ranked models"""
|
| 172 |
return sorted(self.rankings.items(), key=lambda x: x[1], reverse=True)[:n]
|
| 173 |
+
|
| 174 |
def get_model_stats(self, model_id: str) -> Dict[str, Any]:
|
| 175 |
"""Get statistics for a specific model"""
|
| 176 |
return self.model_stats.get(model_id, {})
|
| 177 |
|
| 178 |
+
|
| 179 |
def add_model(self, provider, name, model_path):
|
| 180 |
if provider not in self.models:
|
| 181 |
self.models[provider] = {}
|
|
|
|
| 286 |
def preprocess_text(text, lang='german', apply_preprocessing=False):
|
| 287 |
if not apply_preprocessing:
|
| 288 |
return text
|
| 289 |
+
|
| 290 |
text = text.lower()
|
| 291 |
text = re.sub(r'[^a-zA-Z\s]', '', text)
|
| 292 |
+
|
| 293 |
try:
|
| 294 |
tokens = word_tokenize(text, language=lang)
|
| 295 |
except LookupError:
|
| 296 |
print(f"Warning: NLTK punkt tokenizer for {lang} not found. Using simple tokenization.")
|
| 297 |
tokens = simple_tokenize(text)
|
| 298 |
+
|
| 299 |
try:
|
| 300 |
stop_words = set(stopwords.words(lang))
|
| 301 |
except LookupError:
|
| 302 |
print(f"Warning: Stopwords for {lang} not found. Skipping stopword removal.")
|
| 303 |
stop_words = set()
|
| 304 |
tokens = [token for token in tokens if token not in stop_words]
|
| 305 |
+
|
| 306 |
try:
|
| 307 |
stemmer = SnowballStemmer(lang)
|
| 308 |
tokens = [stemmer.stem(token) for token in tokens]
|
| 309 |
except ValueError:
|
| 310 |
print(f"Warning: SnowballStemmer for {lang} not available. Skipping stemming.")
|
| 311 |
+
|
| 312 |
return ' '.join(tokens)
|
| 313 |
|
| 314 |
def phonetic_match(text, query, method='levenshtein_distance', apply_phonetic=False):
|
|
|
|
| 341 |
) -> str:
|
| 342 |
"""
|
| 343 |
CPU-optimized version of query expansion using a small language model.
|
| 344 |
+
|
| 345 |
Args:
|
| 346 |
query: Original search query
|
| 347 |
query_optimization_model: Name or path of the model to use for optimization
|
|
|
|
| 351 |
search_type: Type of search being performed
|
| 352 |
top_k: Number of expansion terms to add
|
| 353 |
use_gpu: Whether to use GPU if available (defaults to False for CPU)
|
| 354 |
+
|
| 355 |
Returns:
|
| 356 |
Expanded query string
|
| 357 |
"""
|
| 358 |
try:
|
| 359 |
# Set device
|
| 360 |
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
|
| 361 |
+
|
| 362 |
# 1. Basic text preprocessing (CPU-based)
|
| 363 |
tokens = word_tokenize(query.lower())
|
| 364 |
+
|
| 365 |
# 2. WordNet synonyms expansion (CPU-based)
|
| 366 |
expanded_terms = set()
|
| 367 |
for token in tokens:
|
|
|
|
| 370 |
for syn in synsets:
|
| 371 |
# Limit number of lemmas
|
| 372 |
expanded_terms.update([lemma.name() for lemma in syn.lemmas()[:2]])
|
| 373 |
+
|
| 374 |
# 3. Use provided model with reduced complexity
|
| 375 |
try:
|
| 376 |
# Load model with reduced memory footprint
|
|
|
|
| 384 |
low_cpu_mem_usage=True,
|
| 385 |
device_map="cpu"
|
| 386 |
)
|
| 387 |
+
|
| 388 |
# Move model to CPU and eval mode
|
| 389 |
model = model.to(device)
|
| 390 |
model.eval()
|
| 391 |
+
|
| 392 |
# Prepare input with reduced length
|
| 393 |
prompt = f"Enhance this search query with relevant terms: {query}"
|
| 394 |
inputs = tokenizer(
|
|
|
|
| 398 |
truncation=True,
|
| 399 |
padding=True
|
| 400 |
)
|
| 401 |
+
|
| 402 |
# Generate with minimal parameters
|
| 403 |
with torch.no_grad():
|
| 404 |
outputs = model.generate(
|
|
|
|
| 409 |
do_sample=False,
|
| 410 |
early_stopping=True
|
| 411 |
)
|
| 412 |
+
|
| 413 |
enhanced_query = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 414 |
+
|
| 415 |
# Clear CUDA cache if GPU was used
|
| 416 |
if device == "cuda":
|
| 417 |
torch.cuda.empty_cache()
|
| 418 |
+
|
| 419 |
except Exception as model_error:
|
| 420 |
print(f"Model-based expansion failed: {str(model_error)}")
|
| 421 |
enhanced_query = query
|
| 422 |
+
|
| 423 |
# 4. Combine original and expanded terms
|
| 424 |
final_terms = set(tokens)
|
| 425 |
final_terms.update(expanded_terms)
|
| 426 |
if enhanced_query != query:
|
| 427 |
final_terms.update(word_tokenize(enhanced_query.lower()))
|
| 428 |
+
|
| 429 |
# 5. Remove stopwords and select top_k most relevant terms
|
| 430 |
stopwords = set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to'])
|
| 431 |
final_terms = [term for term in final_terms if term not in stopwords]
|
| 432 |
+
|
| 433 |
# Combine with original query
|
| 434 |
expanded_query = f"{query} {' '.join(list(final_terms)[:top_k])}"
|
| 435 |
+
|
| 436 |
# Clean up
|
| 437 |
del model
|
| 438 |
del tokenizer
|
| 439 |
if device == "cuda":
|
| 440 |
torch.cuda.empty_cache()
|
| 441 |
+
|
| 442 |
+
return expanded_query.strip() #[Document(page_content=expanded_query.strip())]
|
| 443 |
+
|
| 444 |
except Exception as e:
|
| 445 |
print(f"Query optimization failed: {str(e)}")
|
| 446 |
+
return query #[Document(page_content=query)] # Return original query if optimization fails
|
| 447 |
|
| 448 |
|
| 449 |
|
|
|
|
| 458 |
use_gpu=False # Explicitly use CPU
|
| 459 |
)
|
| 460 |
"""
|
| 461 |
+
|
| 462 |
|
| 463 |
def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
|
| 464 |
tokenized_texts = [text.split() for text in texts]
|
| 465 |
+
|
| 466 |
if model_type == 'word2vec':
|
| 467 |
model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
| 468 |
elif model_type == 'fasttext':
|
| 469 |
model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
|
| 470 |
else:
|
| 471 |
raise ValueError("Unsupported model type")
|
| 472 |
+
|
| 473 |
return model
|
| 474 |
|
| 475 |
class CustomEmbeddings(HuggingFaceEmbeddings):
|
| 476 |
def __init__(self, model_path):
|
| 477 |
self.model = Word2Vec.load(model_path) # or FastText.load() for FastText models
|
| 478 |
+
|
| 479 |
def embed_documents(self, texts):
|
| 480 |
return [self.model.wv[text.split()] for text in texts]
|
| 481 |
+
|
| 482 |
def embed_query(self, text):
|
| 483 |
return self.model.wv[text.split()]
|
| 484 |
|
|
|
|
| 520 |
chunk_size=chunk_size,
|
| 521 |
chunk_overlap=overlap_size,
|
| 522 |
add_start_index=True, # If `True`, includes chunk's start index in metadata
|
| 523 |
+
strip_whitespace=True, # If `True`, strips whitespace from the start and end of every document
|
| 524 |
separators=custom_separators or ["\n\n", "\n", " ", ""]
|
| 525 |
)
|
| 526 |
else:
|
|
|
|
| 534 |
multi_process=True,
|
| 535 |
# model_kwargs={"device": "cpu"},
|
| 536 |
#encode_kwargs={"normalize_embeddings": True}, # Set `True` for cosine similarity
|
| 537 |
+
)
|
| 538 |
elif model_type == 'OpenAI':
|
| 539 |
return OpenAIEmbeddings(model=model_path)
|
| 540 |
elif model_type == 'Cohere':
|
|
|
|
| 566 |
phonetic_sim = phonetic_match(doc_text, query)
|
| 567 |
combined_sim = (1 - phonetic_weight) * embedding_sim + phonetic_weight * phonetic_sim
|
| 568 |
return combined_sim
|
| 569 |
+
|
| 570 |
def _create_vector_store(vector_store_type, chunks_tuple, embedding_model):
|
| 571 |
chunks = list(chunks_tuple)
|
| 572 |
+
|
| 573 |
if vector_store_type == 'FAISS':
|
| 574 |
return FAISS.from_texts(chunks, embedding_model)
|
| 575 |
elif vector_store_type == 'Chroma':
|
|
|
|
| 587 |
for file in os.listdir(FILES_DIR):
|
| 588 |
file_path = os.path.join(FILES_DIR, file)
|
| 589 |
text += FileHandler.extract_text(file_path)
|
| 590 |
+
|
| 591 |
if custom_tokenizer_file:
|
| 592 |
tokenizer = create_custom_tokenizer(custom_tokenizer_file, custom_tokenizer_model, custom_tokenizer_vocab_size, custom_tokenizer_special_tokens)
|
| 593 |
text = ' '.join(custom_tokenize(text, tokenizer))
|
|
|
|
| 603 |
|
| 604 |
def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k, expected_result=None, lang='german', apply_phonetic=False, phonetic_weight=0.3):
|
| 605 |
preprocessed_query = preprocess_text(query, lang) if apply_phonetic else query
|
| 606 |
+
|
| 607 |
vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
|
| 608 |
retriever = get_retriever(vector_store, search_type, {"k": top_k})
|
| 609 |
|
|
|
|
| 613 |
#this should be optional
|
| 614 |
def score_result(doc):
|
| 615 |
base_score = vector_store.similarity_search_with_score(doc.page_content, k=1)[0][1]
|
| 616 |
+
|
| 617 |
# Add bonus for containing expected result
|
| 618 |
expected_bonus = 0.3 if expected_result and expected_result in doc.page_content else 0
|
| 619 |
+
|
| 620 |
if apply_phonetic:
|
| 621 |
phonetic_score = phonetic_match(doc.page_content, query)
|
| 622 |
return (1 - phonetic_weight) * base_score + phonetic_weight * phonetic_score + expected_bonus
|
|
|
|
| 645 |
# Enhanced Result Analysis
|
| 646 |
class ResultAnalyzer:
|
| 647 |
@staticmethod
|
| 648 |
+
def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model, query,
|
| 649 |
top_k, expected_result=None, model_feedback=None):
|
| 650 |
stats = {
|
| 651 |
"num_results": len(results),
|
|
|
|
| 657 |
"embedding_dimension": len(embedding_model.embed_query(query)),
|
| 658 |
"top_k": top_k,
|
| 659 |
}
|
| 660 |
+
|
| 661 |
# Add vector store statistics
|
| 662 |
try:
|
| 663 |
if hasattr(vector_store, '_index'):
|
|
|
|
| 666 |
stats["vector_store_size"] = len(vector_store._collection.get())
|
| 667 |
except:
|
| 668 |
stats["vector_store_size"] = "N/A"
|
| 669 |
+
|
| 670 |
# Add expected result statistics if provided
|
| 671 |
if expected_result:
|
| 672 |
stats["contains_expected"] = any(expected_result in doc.page_content for doc in results)
|
| 673 |
+
stats["expected_result_rank"] = next((i for i, doc in enumerate(results)
|
| 674 |
if expected_result in doc.page_content), -1) + 1
|
| 675 |
+
|
| 676 |
# Calculate diversity metrics for larger result sets
|
| 677 |
if len(results) > 3: # Changed from 1000 to make it more practical
|
| 678 |
embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
|
|
|
| 681 |
else:
|
| 682 |
stats["result_diversity"] = "N/A"
|
| 683 |
stats["silhouette_score"] = "N/A"
|
| 684 |
+
|
| 685 |
# Add ranking correlation
|
| 686 |
query_embedding = embedding_model.embed_query(query)
|
| 687 |
result_embeddings = [embedding_model.embed_query(doc.page_content) for doc in results]
|
|
|
|
| 691 |
stats["rank_correlation"] = rank_correlation
|
| 692 |
else:
|
| 693 |
stats["rank_correlation"] = "N/A"
|
| 694 |
+
|
| 695 |
# Add model feedback if provided
|
| 696 |
if model_feedback:
|
| 697 |
stats["model_feedback"] = model_feedback
|
| 698 |
+
|
| 699 |
return stats
|
| 700 |
+
|
| 701 |
@staticmethod
|
| 702 |
def _calculate_diversity(embeddings: List[np.ndarray]) -> float:
|
| 703 |
"""Calculate diversity score for embeddings"""
|
| 704 |
embeddings_array = np.array(embeddings)
|
| 705 |
pairwise_similarities = np.inner(embeddings_array, embeddings_array)
|
| 706 |
return 1 - np.mean(pairwise_similarities[np.triu_indices(len(embeddings), k=1)])
|
| 707 |
+
|
| 708 |
@staticmethod
|
| 709 |
def _calculate_silhouette(embeddings: List[np.ndarray]) -> float:
|
| 710 |
"""Calculate silhouette score for embeddings"""
|
|
|
|
| 724 |
# Add model column if not present
|
| 725 |
if 'model' not in stats_df.columns:
|
| 726 |
stats_df['model'] = stats_df['model_type'] + ' - ' + stats_df['model_name']
|
| 727 |
+
|
| 728 |
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
|
| 729 |
+
|
| 730 |
# Handle empty dataframe case
|
| 731 |
if len(stats_df) == 0:
|
| 732 |
return fig
|
| 733 |
+
|
| 734 |
# Create plots with error handling
|
| 735 |
try:
|
| 736 |
sns.barplot(data=stats_df, x='model', y='search_time', ax=axs[0, 0])
|
|
|
|
| 738 |
axs[0, 0].tick_params(axis='x', rotation=45)
|
| 739 |
except Exception as e:
|
| 740 |
print(f"Error in search time plot: {e}")
|
| 741 |
+
|
| 742 |
try:
|
| 743 |
+
sns.scatterplot(data=stats_df, x='result_diversity', y='rank_correlation',
|
| 744 |
hue='model', ax=axs[0, 1])
|
| 745 |
axs[0, 1].set_title('Result Diversity vs. Rank Correlation')
|
| 746 |
except Exception as e:
|
| 747 |
print(f"Error in diversity plot: {e}")
|
| 748 |
+
|
| 749 |
try:
|
| 750 |
sns.boxplot(data=stats_df, x='model', y='avg_content_length', ax=axs[1, 0])
|
| 751 |
axs[1, 0].set_title('Distribution of Result Content Lengths')
|
| 752 |
axs[1, 0].tick_params(axis='x', rotation=45)
|
| 753 |
except Exception as e:
|
| 754 |
print(f"Error in content length plot: {e}")
|
| 755 |
+
|
| 756 |
try:
|
| 757 |
valid_embeddings = results_df['embedding'].dropna().values
|
| 758 |
if len(valid_embeddings) > 1:
|
| 759 |
tsne = TSNE(n_components=2, random_state=42)
|
| 760 |
embeddings_2d = tsne.fit_transform(np.vstack(valid_embeddings))
|
| 761 |
+
sns.scatterplot(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1],
|
| 762 |
+
hue=results_df['Model'][:len(valid_embeddings)],
|
| 763 |
ax=axs[1, 1])
|
| 764 |
axs[1, 1].set_title('t-SNE Visualization of Result Embeddings')
|
| 765 |
else:
|
| 766 |
+
axs[1, 1].text(0.5, 0.5, "Not enough embeddings for visualization",
|
| 767 |
ha='center', va='center')
|
| 768 |
except Exception as e:
|
| 769 |
print(f"Error in embedding visualization: {e}")
|
| 770 |
+
|
| 771 |
plt.tight_layout()
|
| 772 |
return fig
|
| 773 |
|
|
|
|
| 778 |
#plt.title("Distribution of document lengths in the knowledge base (in count of tokens)")
|
| 779 |
#plt.show()
|
| 780 |
|
| 781 |
+
|
| 782 |
def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
|
| 783 |
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
|
| 784 |
|
| 785 |
word_freq = Counter(word for text in texts for word in text.split())
|
| 786 |
+
|
| 787 |
optimized_texts = [
|
| 788 |
' '.join(word for word in text.split() if word_freq[word] >= min_frequency)
|
| 789 |
for text in texts
|
| 790 |
]
|
| 791 |
+
|
| 792 |
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
| 793 |
tokenizer.train_from_iterator(optimized_texts, trainer)
|
| 794 |
+
|
| 795 |
return tokenizer, optimized_texts
|
| 796 |
+
|
| 797 |
import numpy as np
|
| 798 |
from transformers import TextClassificationPipeline
|
| 799 |
from typing import List, Union, Any
|
| 800 |
|
| 801 |
+
|
| 802 |
|
| 803 |
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
| 804 |
|
| 805 |
|
| 806 |
def rerank_results(
|
| 807 |
+
results: List[Any],
|
| 808 |
+
query: str,
|
| 809 |
reranker: Union[TextClassificationPipeline, Any]
|
| 810 |
) -> List[Any]:
|
| 811 |
"""
|
| 812 |
+
|
| 813 |
"""
|
| 814 |
if not results:
|
| 815 |
return results
|
| 816 |
+
|
| 817 |
# Step 1: Encode the query and documents using SentenceTransformer
|
| 818 |
query_embedding = model.encode(query, convert_to_tensor=True)
|
| 819 |
doc_contents = [doc.page_content for doc in results] # Assuming each result has a `page_content` attribute
|
| 820 |
doc_embeddings = model.encode(doc_contents, convert_to_tensor=True)
|
| 821 |
+
|
| 822 |
# Step 2: Compute cosine similarities between query and document embeddings
|
| 823 |
cosine_scores = util.cos_sim(query_embedding, doc_embeddings)[0] # Shape: (number of documents,)
|
| 824 |
+
|
| 825 |
# Step 3: Sort documents by similarity score in descending order
|
| 826 |
+
reranked_idx = np.argsort(cosine_scores.cpu().numpy())[::-1]
|
| 827 |
+
|
| 828 |
# Step 4: Return the reranked documents
|
| 829 |
reranked_results = [results[i] for i in reranked_idx]
|
| 830 |
+
|
| 831 |
return reranked_results
|
| 832 |
|
| 833 |
|
|
|
|
| 878 |
if optimize_vocab:
|
| 879 |
tokenizer, optimized_chunks = optimize_vocabulary(chunks)
|
| 880 |
chunks = optimized_chunks
|
| 881 |
+
|
| 882 |
search_query = query
|
| 883 |
+
|
| 884 |
if use_query_optimization:
|
| 885 |
optimized_queries = optimize_query(query, query_optimization_model, chunks, embedding_model, vector_store_type, search_type, top_k)
|
| 886 |
#query = " ".join(optimized_queries)
|
| 887 |
+
search_query = optimized_queries # " ".join([doc.page_content for doc in optimized_queries]) # Extract text from Document objects
|
| 888 |
|
| 889 |
results, search_time, vector_store, results_raw = search_embeddings(
|
| 890 |
chunks,
|
|
|
|
| 897 |
lang,
|
| 898 |
apply_phonetic,
|
| 899 |
phonetic_weight
|
| 900 |
+
)
|
| 901 |
+
|
| 902 |
if use_reranking:
|
| 903 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 904 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
|
|
| 953 |
def automated_testing(file, query, test_params, expected_result=None):
|
| 954 |
all_results = []
|
| 955 |
all_stats = []
|
| 956 |
+
|
| 957 |
param_grid = ParameterGrid(test_params)
|
| 958 |
print(param_grid)
|
| 959 |
for params in tqdm(param_grid, desc="Running tests"):
|
|
|
|
| 995 |
params['apply_phonetic'],
|
| 996 |
params['phonetic_weight']
|
| 997 |
)
|
| 998 |
+
|
| 999 |
if params['use_reranking']:
|
| 1000 |
reranker = pipeline("text-classification", model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 1001 |
results_raw = rerank_results(results_raw, query, reranker)
|
|
|
|
| 1022 |
'contains_expected': 0.5, # High weight for containing the expected result
|
| 1023 |
'expected_result_rank': -0.4 # Lower rank (closer to 1) is better
|
| 1024 |
}
|
| 1025 |
+
if stats_df.empty:
|
| 1026 |
+
print("stats_df is empty. Cannot compute best configuration.")
|
| 1027 |
+
return None
|
| 1028 |
+
|
| 1029 |
for metric in metric_weights.keys():
|
| 1030 |
+
|
| 1031 |
+
if metric in stats_df.columns:
|
| 1032 |
+
stats_df[metric] = pd.to_numeric(stats_df[metric], errors='coerce')
|
| 1033 |
+
else:
|
| 1034 |
+
stats_df[metric] = 0
|
| 1035 |
+
print("Column 'search_time' is missing in stats_df.")
|
| 1036 |
+
|
| 1037 |
+
|
| 1038 |
+
|
| 1039 |
stats_df['weighted_score'] = sum(
|
| 1040 |
+
stats_df[metric].fillna(0) * weight
|
| 1041 |
for metric, weight in metric_weights.items()
|
| 1042 |
)
|
| 1043 |
+
|
| 1044 |
best_config = stats_df.loc[stats_df['weighted_score'].idxmax()]
|
| 1045 |
+
|
| 1046 |
recommendations = {
|
| 1047 |
'best_model': f"{best_config['model_type']} - {best_config['model_name']}",
|
| 1048 |
'best_settings': {
|
|
|
|
| 1069 |
'expected_result_rank': int(best_config['expected_result_rank'])
|
| 1070 |
}
|
| 1071 |
}
|
| 1072 |
+
|
| 1073 |
return recommendations
|
| 1074 |
|
| 1075 |
####
|
|
|
|
| 1079 |
return {"error": "No file uploaded"}
|
| 1080 |
|
| 1081 |
chunks, _, _ = process_files(
|
| 1082 |
+
file.name,
|
| 1083 |
+
'HuggingFace',
|
| 1084 |
+
'paraphrase-miniLM',
|
| 1085 |
+
'recursive',
|
| 1086 |
+
250,
|
| 1087 |
50,
|
| 1088 |
custom_separators=None
|
| 1089 |
)
|
| 1090 |
+
|
| 1091 |
# Select a few random chunks
|
| 1092 |
sample_chunks = random.sample(chunks, min(num_chunks, len(chunks)))
|
| 1093 |
+
|
| 1094 |
+
|
| 1095 |
+
llm_pipeline = pipeline(model="meta-llama/Llama-3.2-1B-Instruct", device='cuda')
|
| 1096 |
+
|
| 1097 |
+
|
| 1098 |
+
prompt=f'''
|
| 1099 |
+
<|start_header_id|>system<|end_header_id|>
|
| 1100 |
+
You are an expert in information retrieval.
|
| 1101 |
+
You know about strenghs and weaknesses of all models.
|
| 1102 |
+
|
| 1103 |
+
Given the following text chunks from a document,
|
| 1104 |
+
suggest optimal settings for an embedding-based search system. The settings should include:
|
| 1105 |
+
|
| 1106 |
+
1. Embedding model type and name
|
| 1107 |
+
2. Split strategy (token or recursive)
|
| 1108 |
+
3. Chunk size
|
| 1109 |
+
4. Overlap size
|
| 1110 |
+
5. Vector store type (FAISS or Chroma)
|
| 1111 |
+
6. Search type (similarity, mmr, or custom)
|
| 1112 |
+
7. Top K results to retrieve
|
| 1113 |
+
8. Whether to apply preprocessing
|
| 1114 |
+
9. Whether to optimize vocabulary
|
| 1115 |
+
10. Whether to apply phonetic matching
|
| 1116 |
+
|
| 1117 |
+
Expected output format:
|
| 1118 |
+
{{
|
| 1119 |
+
"embedding_models": "embedding_model_type:embedding_model_name",
|
| 1120 |
+
"split_strategy": "token or recursive",
|
| 1121 |
+
"chunk_size": 250,
|
| 1122 |
+
"overlap_size": 50,
|
| 1123 |
+
"vector_store_type": "FAISS or Chroma",
|
| 1124 |
+
"search_type": "similarity, mmr, or custom",
|
| 1125 |
+
"top_k": 5,
|
| 1126 |
+
"apply_preprocessing": True,
|
| 1127 |
+
"optimize_vocab": True,
|
| 1128 |
+
"apply_phonetic": False,
|
| 1129 |
+
"phonetic_weight": 0.3 #
|
| 1130 |
+
}}
|
| 1131 |
+
|
| 1132 |
+
Provide your suggestions in a Python dictionary format.
|
| 1133 |
+
|
| 1134 |
+
show me settings You SHOULD NOT include any other text in the response.
|
| 1135 |
+
Fill out the seeting and chose usefull values.
|
| 1136 |
+
Respect the users use cases and content snipet. Choose the setting based on the chunks
|
| 1137 |
+
|
| 1138 |
+
<|eot_id|><|start_header_id|>user<|end_header_id|>
|
| 1139 |
+
User user case:
|
| 1140 |
+
{"small local", "large total context", ...}
|
| 1141 |
+
|
| 1142 |
+
total content lenght:
|
| 1143 |
+
{len(' '.join(chunks))}
|
| 1144 |
+
|
| 1145 |
+
Content snipet:
|
| 1146 |
+
{' '.join(sample_chunks)}
|
| 1147 |
+
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
| 1148 |
+
'''
|
| 1149 |
+
suggested_settings = llm_pipeline(
|
| 1150 |
+
prompt,
|
| 1151 |
+
do_sample=True,
|
| 1152 |
+
top_k=10,
|
| 1153 |
+
num_return_sequences=1,
|
| 1154 |
+
return_full_text=False,
|
| 1155 |
+
max_new_tokens=1900, # Control the length of the output,
|
| 1156 |
+
truncation=True, # Enable truncation
|
| 1157 |
+
)
|
| 1158 |
+
|
| 1159 |
+
|
| 1160 |
+
#suggested_settings = llm.invoke(prompt)
|
| 1161 |
print("setting suggested")
|
| 1162 |
print(suggested_settings)
|
| 1163 |
# Parse the generated text to extract the dictionary
|
|
|
|
| 1183 |
def update_inputs_with_llm_suggestions(suggestions):
|
| 1184 |
if suggestions is None or "error" in suggestions:
|
| 1185 |
return [gr.update() for _ in range(11)] # Return no updates if there's an error or None
|
| 1186 |
+
|
| 1187 |
return [
|
| 1188 |
gr.update(value=[suggestions["embedding_models"]]), # embedding_models_input
|
| 1189 |
gr.update(value=suggestions["split_strategy"]), # split_strategy_input
|
|
|
|
| 1201 |
def parse_model_selections(default_models, custom_models):
|
| 1202 |
"""
|
| 1203 |
Parse selected default models and custom models into model configurations
|
| 1204 |
+
|
| 1205 |
Args:
|
| 1206 |
default_models (List[str]): Selected default models in format "type:name"
|
| 1207 |
custom_models (str): Custom models string with one model per line in format "type:name"
|
| 1208 |
+
|
| 1209 |
Returns:
|
| 1210 |
List[Dict[str, str]]: List of model configurations with 'type' and 'name' keys
|
| 1211 |
"""
|
| 1212 |
model_configs = []
|
| 1213 |
+
|
| 1214 |
# Process default models
|
| 1215 |
if default_models:
|
| 1216 |
for model in default_models:
|
|
|
|
| 1219 |
'type': model_type,
|
| 1220 |
'name': model_name
|
| 1221 |
})
|
| 1222 |
+
|
| 1223 |
# Process custom models
|
| 1224 |
if custom_models:
|
| 1225 |
custom_model_lines = custom_models.strip().split('\n')
|
|
|
|
| 1230 |
'type': model_type.strip(),
|
| 1231 |
'name': model_name.strip()
|
| 1232 |
})
|
| 1233 |
+
|
| 1234 |
return model_configs
|
| 1235 |
|
| 1236 |
def parse_comma_separated(text):
|
|
|
|
| 1240 |
return [x.strip() for x in text.split(',') if x.strip()]
|
| 1241 |
|
| 1242 |
|
| 1243 |
+
|
| 1244 |
# Gradio Interface
|
| 1245 |
def launch_interface(debug=True):
|
| 1246 |
with gr.Blocks() as iface:
|
| 1247 |
gr.Markdown("# Advanced Embedding Comparison Tool")
|
| 1248 |
+
|
| 1249 |
with gr.Tab("Simple"):
|
| 1250 |
file_input = gr.File(label="Upload File (Optional)")
|
| 1251 |
query_input = gr.Textbox(label="Search Query")
|
|
|
|
| 1260 |
label="Embedding Models"
|
| 1261 |
)
|
| 1262 |
top_k_input = gr.Slider(1, 10, step=1, value=5, label="Top K")
|
| 1263 |
+
|
| 1264 |
with gr.Tab("Advanced"):
|
| 1265 |
custom_embedding_model_input = gr.Textbox(label="Custom Embedding Model (optional, format: type:name)")
|
| 1266 |
split_strategy_input = gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive")
|
|
|
|
| 1270 |
vector_store_type_input = gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS")
|
| 1271 |
search_type_input = gr.Radio(choices=["similarity", "mmr", "custom"], label="Search Type", value="similarity")
|
| 1272 |
lang_input = gr.Dropdown(choices=["german", "english", "french"], label="Language", value="german")
|
| 1273 |
+
|
| 1274 |
with gr.Tab("Expert"):
|
| 1275 |
apply_preprocessing_input = gr.Checkbox(label="Apply Text Preprocessing", value=False)
|
| 1276 |
optimize_vocab_input = gr.Checkbox(label="Optimize Vocabulary", value=False)
|
|
|
|
| 1288 |
with gr.Row():
|
| 1289 |
auto_file_input = gr.File(label="Upload File (Optional)")
|
| 1290 |
auto_query_input = gr.Textbox(label="Search Query")
|
| 1291 |
+
|
| 1292 |
with gr.Row():
|
| 1293 |
auto_expected_result_input = gr.Textbox(
|
| 1294 |
label="Expected Result (Optional)",
|
|
|
|
| 1298 |
label="Model Feedback (Optional)",
|
| 1299 |
placeholder="Enter any feedback about model performance"
|
| 1300 |
)
|
| 1301 |
+
|
| 1302 |
with gr.Row():
|
| 1303 |
with gr.Column():
|
| 1304 |
# Default model selection
|
| 1305 |
default_models_input = gr.CheckboxGroup(
|
| 1306 |
+
choices=[f"{type}:{name}"
|
| 1307 |
+
for type, names in DEFAULT_MODELS.items()
|
| 1308 |
for name in names],
|
| 1309 |
label="Default Models",
|
| 1310 |
value=[f"HuggingFace:{DEFAULT_MODELS['HuggingFace'][0]}"]
|
| 1311 |
)
|
| 1312 |
+
|
| 1313 |
with gr.Column():
|
| 1314 |
# Custom model input
|
| 1315 |
custom_models_input = gr.TextArea(
|
|
|
|
| 1317 |
placeholder="Enter one model per line in format: type:name",
|
| 1318 |
lines=3
|
| 1319 |
)
|
| 1320 |
+
|
| 1321 |
auto_split_strategies = gr.CheckboxGroup(
|
| 1322 |
choices=["token", "recursive"],
|
| 1323 |
label="Split Strategies to Test"
|
|
|
|
| 1336 |
auto_optimize_vocab = gr.Checkbox(label="Test Vocabulary Optimization", value=True)
|
| 1337 |
auto_use_query_optimization = gr.Checkbox(label="Test Query Optimization", value=True)
|
| 1338 |
auto_use_reranking = gr.Checkbox(label="Test Reranking", value=True)
|
| 1339 |
+
|
| 1340 |
auto_results_output = gr.Dataframe(label="Automated Test Results", interactive=False)
|
| 1341 |
auto_stats_output = gr.Dataframe(label="Automated Test Statistics", interactive=False)
|
| 1342 |
recommendations_output = gr.JSON(label="Recommendations")
|
| 1343 |
+
|
| 1344 |
def run_automation(file_input, query_input, expected_result, default_models, custom_models,
|
| 1345 |
split_strategies, chunk_sizes, overlap_sizes,
|
| 1346 |
vector_store_types, search_types, top_k_values,
|
| 1347 |
optimize_vocab, use_query_optimization, use_reranking,
|
| 1348 |
model_feedback):
|
| 1349 |
"""Wrapper function to handle Gradio inputs and run automated tests"""
|
| 1350 |
+
|
| 1351 |
# Parse model configurations
|
| 1352 |
model_configs = parse_model_selections(default_models, custom_models)
|
| 1353 |
+
|
| 1354 |
# Parse test parameters
|
| 1355 |
test_params = {
|
| 1356 |
'split_strategy': split_strategies,
|
|
|
|
| 1369 |
'custom_separators': [None],
|
| 1370 |
'query_optimization_model': ['google/flan-t5-base'] # Default query optimization model
|
| 1371 |
}
|
| 1372 |
+
|
| 1373 |
# Run automated tests
|
| 1374 |
results_df, stats_df = run_automated_tests(
|
| 1375 |
file_input.name if file_input else None,
|
|
|
|
| 1379 |
expected_result if expected_result else None,
|
| 1380 |
model_feedback if model_feedback else None
|
| 1381 |
)
|
| 1382 |
+
|
| 1383 |
# Generate recommendations based on results
|
| 1384 |
recommendations = analyze_results(stats_df)
|
| 1385 |
+
|
| 1386 |
return results_df, stats_df, recommendations
|
| 1387 |
+
|
| 1388 |
auto_submit_button = gr.Button("Run Automated Tests")
|
| 1389 |
auto_submit_button.click(
|
| 1390 |
fn=run_automation,
|
|
|
|
| 1399 |
outputs=[auto_results_output, auto_stats_output, recommendations_output]
|
| 1400 |
)
|
| 1401 |
###
|
| 1402 |
+
|
| 1403 |
with gr.Tab("Results"):
|
| 1404 |
with gr.Row():
|
| 1405 |
results_output = gr.DataFrame(label="Results")
|
| 1406 |
stats_output = gr.DataFrame(label="Statistics")
|
| 1407 |
+
|
| 1408 |
with gr.Row():
|
| 1409 |
plot_output = gr.Plot(label="Visualizations")
|
| 1410 |
model_rankings_output = gr.JSON(label="Model Rankings")
|
| 1411 |
+
|
| 1412 |
with gr.Row():
|
| 1413 |
recommendations_output = gr.JSON(label="Recommendations")
|
| 1414 |
+
|
| 1415 |
with gr.Tab("LLM Suggestions"):
|
| 1416 |
llm_file_input = gr.File(label="Upload File for LLM Suggestions")
|
| 1417 |
llm_num_chunks = gr.Slider(1, 10, step=1, value=5, label="Number of Sample Chunks")
|
| 1418 |
llm_suggest_button = gr.Button("Get LLM Suggestions")
|
| 1419 |
llm_suggestions_output = gr.JSON(label="LLM-suggested Settings")
|
| 1420 |
+
|
| 1421 |
llm_suggest_button.click(
|
| 1422 |
fn=get_llm_suggested_settings,
|
| 1423 |
inputs=[llm_file_input, llm_num_chunks],
|
|
|
|
| 1426 |
fn=update_inputs_with_llm_suggestions,
|
| 1427 |
inputs=[llm_suggestions_output],
|
| 1428 |
outputs=[
|
| 1429 |
+
embedding_models_input, split_strategy_input, chunk_size_input,
|
| 1430 |
+
overlap_size_input, vector_store_type_input, search_type_input,
|
| 1431 |
+
top_k_input, apply_preprocessing_input, optimize_vocab_input,
|
| 1432 |
apply_phonetic_input, phonetic_weight_input
|
| 1433 |
]
|
| 1434 |
)
|
|
|
|
| 1549 |
User: "Was sind die Hauptziele des KI-Gesetzes?"
|
| 1550 |
"""
|
| 1551 |
|
| 1552 |
+
|
| 1553 |
tutorial_md = """
|
| 1554 |
# Advanced Embedding Comparison Tool Tutorial
|
| 1555 |
|
|
|
|
| 1698 |
def create_custom_tokenizer(file_path, model_type='WordLevel', vocab_size=10000, special_tokens=None):
    """Train a custom tokenizer on the text contained in *file_path*.

    Args:
        file_path: Path to a UTF-8 text file used as the training corpus.
        model_type: 'WordLevel' or 'BPE' — selects the tokenizer model
            (any value other than 'WordLevel' falls through to BPE, matching
            the original behavior).
        vocab_size: Maximum vocabulary size for the trained tokenizer.
        special_tokens: Optional list of special tokens; defaults to
            ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"].

    Returns:
        A trained ``tokenizers.Tokenizer`` instance.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    special_tokens = special_tokens or ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    # Fix: the original always used WordLevelTrainer, even when a BPE model
    # was selected, which trains the wrong vocabulary type for BPE. Pair each
    # model with its matching trainer instead.
    if model_type == 'WordLevel':
        tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
        trainer = trainers.WordLevelTrainer(special_tokens=special_tokens, vocab_size=vocab_size)
    else:
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        trainer = trainers.BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator([text], trainer)

    return tokenizer
|
| 1709 |
````
|
| 1710 |
|
|
|
|
| 1736 |
|
| 1737 |
|
| 1738 |
## Useful Resources and Links
|
| 1739 |
+
|
| 1740 |
Here are some valuable resources to help you better understand and work with embeddings, retrieval systems, and natural language processing:
|
| 1741 |
+
|
| 1742 |
### Embeddings and Vector Databases
|
| 1743 |
- [Understanding Embeddings](https://www.tensorflow.org/text/guide/word_embeddings): A guide by TensorFlow on word embeddings
|
| 1744 |
- [FAISS: A Library for Efficient Similarity Search](https://github.com/facebookresearch/faiss): Facebook AI's vector similarity search library
|
| 1745 |
- [Chroma: The AI-native open-source embedding database](https://www.trychroma.com/): An embedding database designed for AI applications
|
| 1746 |
+
|
| 1747 |
### Natural Language Processing
|
| 1748 |
- [NLTK (Natural Language Toolkit)](https://www.nltk.org/): A leading platform for building Python programs to work with human language data
|
| 1749 |
- [spaCy](https://spacy.io/): Industrial-strength Natural Language Processing in Python
|
| 1750 |
- [Hugging Face Transformers](https://huggingface.co/transformers/): State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
|
| 1751 |
+
|
| 1752 |
### Retrieval-Augmented Generation (RAG)
|
| 1753 |
- [LangChain](https://python.langchain.com/docs/get_started/introduction): A framework for developing applications powered by language models
|
| 1754 |
- [OpenAI's RAG Tutorial](https://platform.openai.com/docs/tutorials/web-qa-embeddings): A guide on building a QA system with embeddings
|
| 1755 |
+
|
| 1756 |
### German Language Processing
|
| 1757 |
- [Kölner Phonetik](https://en.wikipedia.org/wiki/Cologne_phonetics): Information about the Kölner Phonetik algorithm
|
| 1758 |
- [German NLP Resources](https://github.com/adbar/German-NLP): A curated list of open-access resources for German NLP
|
| 1759 |
+
|
| 1760 |
### Benchmarks and Evaluation
|
| 1761 |
- [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard): Massive Text Embedding Benchmark leaderboard
|
| 1762 |
- [GLUE Benchmark](https://gluebenchmark.com/): General Language Understanding Evaluation benchmark
|
| 1763 |
+
|
| 1764 |
### Tools and Libraries
|
| 1765 |
- [Gensim](https://radimrehurek.com/gensim/): Topic modelling for humans
|
| 1766 |
- [Sentence-Transformers](https://www.sbert.net/): A Python framework for state-of-the-art sentence, text and image embeddings
|
| 1767 |
+
|
| 1768 |
### Support me
|
| 1769 |
- [Visual Crew Builder](https://visual-crew.builder.ai/): A tool for creating AI systems, workflows, and APIs — or just a notebook.
|
| 1770 |
+
|
| 1771 |
+
|
| 1772 |
|
| 1773 |
This tool empowers you to fine-tune your RAG system for optimal performance. Experiment with different settings, run automated tests, and use insights to create an efficient information retrieval and generation system.
|
| 1774 |
|
|
|
|
| 1791 |
settings['lang'],
|
| 1792 |
settings['apply_preprocessing']
|
| 1793 |
)
|
| 1794 |
+
|
| 1795 |
results, _, _, _ = search_embeddings(
|
| 1796 |
chunks,
|
| 1797 |
embedding_model,
|
|
|
|
| 1803 |
apply_phonetic=settings['apply_phonetic'],
|
| 1804 |
phonetic_weight=settings['phonetic_weight']
|
| 1805 |
)
|
| 1806 |
+
|
| 1807 |
# Generate a response based on the retrieved results
|
| 1808 |
response = f"Based on the query '{message}', here are the top {settings['top_k']} relevant results:\n\n"
|
| 1809 |
for i, result in enumerate(results[:settings['top_k']]):
|
| 1810 |
response += f"{i+1}. {result['content'][:100]}...\n\n"
|
| 1811 |
+
|
| 1812 |
return response
|
| 1813 |
|
| 1814 |
with gr.Blocks() as chat_interface:
|
|
|
|
| 1846 |
launch_interface()
|
| 1847 |
# Uncomment the following line to launch the sample chat app
|
| 1848 |
```
|
| 1849 |
+
|
| 1850 |
"""
|
| 1851 |
|
| 1852 |
|
|
|
|
| 1855 |
["Embedding Comparison", "Tutorial", "Use Case"]
|
| 1856 |
)
|
| 1857 |
|
| 1858 |
+
iface.launch(debug=True, share=True)
|
| 1859 |
|
| 1860 |
# Enhanced Automated Testing
|
| 1861 |
+
def run_automated_tests(file_path: str, query: str, model_configs: List[Dict[str, str]],
|
| 1862 |
test_params: Dict[str, List[Any]], expected_result: Optional[str] = None,
|
| 1863 |
model_feedback: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
| 1864 |
"""
|
|
|
|
| 1867 |
all_results = []
|
| 1868 |
all_stats = []
|
| 1869 |
model_manager = ModelManager()
|
| 1870 |
+
|
| 1871 |
# Create parameter grid excluding model configurations
|
| 1872 |
base_params = {k: v for k, v in test_params.items() if k not in ['model_type', 'model_name']}
|
| 1873 |
param_grid = ParameterGrid(base_params)
|
| 1874 |
+
|
| 1875 |
# Test each model configuration with all parameter combinations
|
| 1876 |
for model_config in tqdm(model_configs, desc="Testing models"):
|
| 1877 |
model_type = model_config['type']
|
| 1878 |
model_name = model_config['name']
|
| 1879 |
+
|
| 1880 |
for params in tqdm(param_grid, desc=f"Testing parameters for {model_type}:{model_name}"):
|
| 1881 |
try:
|
| 1882 |
# Process files and get chunks
|
|
|
|
| 1891 |
params['lang'],
|
| 1892 |
params['apply_preprocessing']
|
| 1893 |
)
|
| 1894 |
+
|
| 1895 |
# Apply vocabulary optimization if specified
|
| 1896 |
if params['optimize_vocab']:
|
| 1897 |
tokenizer, chunks = optimize_vocabulary(chunks)
|
| 1898 |
+
|
| 1899 |
# Apply query optimization if specified
|
| 1900 |
current_query = query
|
| 1901 |
if params['use_query_optimization']:
|
|
|
|
| 1909 |
params['top_k']
|
| 1910 |
)
|
| 1911 |
current_query = " ".join(optimized_queries)
|
| 1912 |
+
|
| 1913 |
# Perform search
|
| 1914 |
results, search_time, vector_store, raw_results = search_embeddings(
|
| 1915 |
chunks,
|
|
|
|
| 1923 |
params['apply_phonetic'],
|
| 1924 |
params['phonetic_weight']
|
| 1925 |
)
|
| 1926 |
+
|
| 1927 |
# Apply reranking if specified
|
| 1928 |
if params['use_reranking']:
|
| 1929 |
+
reranker = pipeline("text-classification",
|
| 1930 |
model="cross-encoder/ms-marco-MiniLM-L-12-v2")
|
| 1931 |
raw_results = rerank_results(raw_results, current_query, reranker)
|
| 1932 |
+
|
| 1933 |
# Calculate statistics
|
| 1934 |
stats = ResultAnalyzer.calculate_statistics(
|
| 1935 |
raw_results, search_time, vector_store, num_tokens,
|
| 1936 |
embedding_model, current_query, params['top_k'],
|
| 1937 |
expected_result, model_feedback
|
| 1938 |
)
|
| 1939 |
+
|
| 1940 |
# Update model rankings
|
| 1941 |
model_id = f"{model_type}:{model_name}"
|
| 1942 |
ranking_score = calculate_model_ranking_score(stats)
|
| 1943 |
model_manager.update_model_ranking(model_id, ranking_score, model_feedback)
|
| 1944 |
+
|
| 1945 |
# Add model information to stats
|
| 1946 |
stats.update({
|
| 1947 |
"model_type": model_type,
|
|
|
|
| 1949 |
"model": f"{model_type} - {model_name}",
|
| 1950 |
**params
|
| 1951 |
})
|
| 1952 |
+
|
| 1953 |
# Format and store results
|
| 1954 |
all_results.extend(format_results(raw_results, stats))
|
| 1955 |
all_stats.append(stats)
|
| 1956 |
+
|
| 1957 |
except Exception as e:
|
| 1958 |
print(f"Error testing {model_type}:{model_name} with parameters {params}: {str(e)}")
|
| 1959 |
continue
|
| 1960 |
+
|
| 1961 |
return pd.DataFrame(all_results), pd.DataFrame(all_stats)
|
| 1962 |
|
| 1963 |
# Helper function to calculate model ranking score
|
|
|
|
| 1970 |
'contains_expected': 0.3,
|
| 1971 |
'expected_result_rank': -0.2 # Negative weight because lower rank is better
|
| 1972 |
}
|
| 1973 |
+
|
| 1974 |
score = 0.0
|
| 1975 |
for metric, weight in weights.items():
|
| 1976 |
if metric in stats and not isinstance(stats[metric], str):
|
|
|
|
| 1981 |
else:
|
| 1982 |
value = float(stats[metric])
|
| 1983 |
score += weight * value
|
| 1984 |
+
|
| 1985 |
return score
|
| 1986 |
|
| 1987 |
if __name__ == "__main__":
|
| 1988 |
launch_interface()
|
|
|