|
|
""" |
|
|
Dataset Tab Component |
|
|
|
|
|
Displays task and dataset information. |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import html |
|
|
|
|
|
|
|
|
class DatasetTab:
    """
    Dataset information tab component.

    Shows details about the evaluation tasks and datasets: the MTEB Turkish
    task table, the Turkish Legal task table, a task-distribution summary,
    a metrics glossary, and an about/contact section.
    """

    @staticmethod
    def _linkify(task_to_dataset: dict) -> list:
        """Build clickable HTML anchors for task names.

        Args:
            task_to_dataset: Ordered mapping of task name -> Hugging Face
                dataset path (``org/name``). Insertion order determines the
                order of the returned list.

        Returns:
            One ``<a>`` tag per entry linking the task name to its dataset
            page. Both the dataset path and the task name are HTML-escaped
            before interpolation to keep the rendered table markup safe.
        """
        links = []
        for task_name, dataset_path in task_to_dataset.items():
            hf_link = f"https://huggingface.co/datasets/{html.escape(dataset_path)}"
            links.append(
                f'<a href="{hf_link}" target="_blank" '
                f'style="color: #2563eb; text-decoration: underline;">'
                f'{html.escape(task_name)}</a>'
            )
        return links

    def build(self) -> None:
        """Build the dataset tab UI.

        Renders the MTEB Turkish task table, then delegates to the private
        section builders in display order (legal tasks, distribution,
        metrics explanation, about).
        """
        gr.Markdown("### MTEB Turkish + Turkish Legal Dataset Overview")

        # Task -> Hugging Face dataset path. Insertion order is the row
        # order; every parallel column list below must stay aligned with it.
        task_to_dataset = {
            'WebFAQRetrieval': 'PaDaS-Lab/webfaq-retrieval',
            'XQuADRetrieval': 'google/xquad',
            'TurHistQuadRetrieval': 'asparius/TurHistQuAD',
            'MKQARetrieval': 'apple/mkqa',
            'MassiveIntentClassification': 'mteb/amazon_massive_intent',
            'MassiveScenarioClassification': 'mteb/amazon_massive_scenario',
            'MultilingualSentimentClassification': 'mteb/multilingual-sentiment-classification',
            'SIB200Classification': 'mteb/sib200',
            'TurkishMovieSentimentClassification': 'asparius/Turkish-Movie-Review',
            'TurkishProductSentimentClassification': 'asparius/Turkish-Product-Review',
            'SIB200ClusteringS2S': 'mteb/sib200',
            'XNLI': 'mteb/xnli',
            'XNLIV2': 'mteb/xnli2.0-multi-pair',
            'STS22.v2': 'mteb/sts22-crosslingual-sts'
        }

        # Rows come straight from the mapping above: iterating the dict
        # removes the previously duplicated hard-coded task-name list, which
        # could silently fall out of sync with the mapping.
        dataset_data = pd.DataFrame({
            'Task Name': self._linkify(task_to_dataset),
            'Task Type': [
                'Retrieval', 'Retrieval', 'Retrieval', 'Retrieval',
                'Classification', 'Classification',
                'Classification', 'Classification',
                'Classification', 'Classification',
                'Clustering', 'PairClassification', 'PairClassification', 'STS'
            ],
            'Description': [
                'Turkish FAQ retrieval task',
                'Turkish question answering retrieval',
                'Historical Turkish document retrieval',
                'Multilingual knowledge QA retrieval',
                'Intent classification for Turkish',
                'Scenario classification for Turkish',
                'Multilingual sentiment classification',
                'SIB200 language identification',
                'Turkish movie review sentiment',
                'Turkish product review sentiment',
                'SIB200 clustering task',
                'Turkish natural language inference',
                'Enhanced Turkish NLI task',
                'Turkish semantic textual similarity'
            ],
            'Domain': [
                'FAQ/QA', 'QA', 'Historical', 'Knowledge QA',
                'Intent', 'Scenario',
                'Sentiment', 'Language ID',
                'Movies', 'Products',
                'Language ID', 'NLI', 'NLI', 'STS'
            ],
            'Samples': [
                '~145K', '~1.19K', '~1.33K', '~10K',
                '~5K', '~5K',
                '211', '~899',
                '~2.64K', '800',
                '99', '~7.5K', '~5.01K', '~208'
            ]
        })

        gr.Dataframe(
            value=dataset_data,
            label="MTEB Turkish Task Details",
            interactive=False,
            wrap=True,
            # First column is raw HTML (the anchor tags built above).
            datatype=["html", "str", "str", "str", "str"]
        )

        self._build_legal_tasks_section()
        self._build_task_distribution_section()
        self._build_metrics_explanation_section()
        # Rendered last. Previously triggered from inside
        # _build_metrics_explanation_section; calling it here keeps the
        # full section order visible in one place without changing the UI.
        self._build_about_section()

    def _build_legal_tasks_section(self):
        """Build the Turkish Legal Tasks section."""
        gr.Markdown("---")
        gr.Markdown("### Turkish Legal Tasks")

        # Same mapping convention as build(): insertion order == row order.
        legal_task_to_dataset = {
            'TurkishLegalQA': 'newmindai/contract-retrieval',
            'TurkishTaxRulings': 'newmindai/regulation-retrieval',
            'TurkishCourtOfCassation': 'newmindai/caselaw-retrieval'
        }

        legal_task_data = pd.DataFrame({
            'Task Name': self._linkify(legal_task_to_dataset),
            'Task Type': ['Contracts', 'Regulation', 'Case Law'],
            'Description': [
                'Turkish legal question answering retrieval',
                'Turkish legal tax rulings retrieval',
                'Turkish Court of Cassation caselaw retrieval'
            ],
            'Domain': ['Contracts', 'Regulation', 'Caselaw'],
            'Samples': ['272', '~120K', '~1.39K']
        })

        gr.Dataframe(
            value=legal_task_data,
            label="Turkish Legal Task Details",
            interactive=False,
            wrap=True,
            # First column is raw HTML (the anchor tags built above).
            datatype=["html", "str", "str", "str", "str"]
        )

    def _build_task_distribution_section(self):
        """Build the task distribution section (markdown summary + stats table)."""
        gr.Markdown("""
        ### Task Distribution:

        **Turkish Tasks (14):**
        - **Classification**: 6 tasks (sentiment, intent, scenario, language identification)
        - **Retrieval**: 4 tasks (FAQ, QA, historical documents, knowledge QA)
        - **Pair Classification**: 2 tasks (natural language inference)
        - **Clustering**: 1 task (language clustering)
        - **STS**: 1 task (semantic textual similarity)

        **Turkish Legal Tasks (3):**
        - **Contracts**: 1 task (Turkish legal QA retrieval)
        - **Regulation**: 1 task (Turkish tax rulings retrieval)
        - **Caselaw**: 1 task (Turkish Court of Cassation case law retrieval)

        **Total: 17 tasks across 8 categories**
        """)

        stats_data = pd.DataFrame({
            'Metric': [
                'Total Tasks',
                'Turkish Tasks',
                'Legal Tasks',
                'Task Categories',
                'Languages',
                'Avg. Tokens per Sample'
            ],
            'Value': [
                '17 tasks',
                '14 tasks',
                '3 tasks',
                '8 categories',
                'Turkish',
                '~150 tokens'
            ],
            'Notes': [
                'Comprehensive evaluation: Turkish NLP + Legal',
                'Classification, Retrieval, STS, NLI, Clustering',
                'Contracts, Regulation, Caselaw',
                'Turkish: 5 types, Legal: 3 types',
                'Turkish-focused',
                'Varies by task type and domain'
            ]
        })

        gr.Dataframe(
            value=stats_data,
            label="Dataset Statistics Summary",
            interactive=False
        )

    def _build_metrics_explanation_section(self):
        """Build the metrics explanation section."""
        gr.Markdown("""
        ---
        ### Metrics Explanation:

        **Task Categories:**
        - **MTEB Score**: Average performance by task categories (refers to Mean (TaskType))
        - **Mean (Task)**: Average performance across all individual tasks
        - **Classification**: Performance on Turkish classification tasks
        - **Clustering**: Performance on Turkish clustering tasks
        - **Pair Classification**: Performance on pair classification tasks (like NLI)
        - **Retrieval**: Performance on Turkish information retrieval tasks
        - **STS**: Performance on Semantic Textual Similarity tasks

        **Turkish Legal Categories:**
        - **Contracts**: Performance on Turkish legal contract analysis tasks
        - **Regulation**: Performance on Turkish legal regulation analysis tasks
        - **Caselaw**: Performance on Turkish Court of Cassation case law retrieval tasks

        ### Tokenizer Quality Metrics:
        - **Unique Token Count**: Number of unique tokens generated by the tokenizer on Turkish MMLU dataset
        - **Turkish Token Count**: How many unique tokens are valid Turkish words/morphemes
        - **Turkish Token %**: Percentage of unique tokens that are linguistically valid Turkish
        - **Pure Token Count**: How many unique tokens are morphologically pure (root words)
        - **Pure Token %**: Percentage of unique tokens that are root words without suffixes

        ### Model Information:
        - **Parameters**: Number of model parameters
        - **Embed Dim**: Embedding dimension size
        - **Max Seq Length**: Maximum sequence length the model can process
        - **Vocab Size**: Size of the model's vocabulary
        - **Model Architecture**: The underlying model architecture
        - **Tokenizer Type**: The tokenizer implementation used
        """)

    def _build_about_section(self):
        """Build the about, contact, and links section."""
        gr.Markdown("""
        ---
        ### About Mizan:
        This leaderboard presents results from the **Mizan** benchmark, which evaluates embedding models
        on Turkish language tasks across multiple domains including:
        - Text classification and sentiment analysis
        - Information retrieval and search
        - Semantic textual similarity
        - Text clustering and pair classification
        - **Turkish Legal**: Contract analysis, regulation, and case law retrieval

        ### Submit Your Model:
        Use the **Submit** tab to submit your Turkish embedding model for evaluation.
        Your request will be reviewed by administrators and you'll receive email notifications about the progress.

        ### Contact:
        For any questions or feedback, please contact info@newmind.ai

        ### Links:
        - **GitHub**: [embeddings-benchmark/mteb v1.38.51](https://github.com/embeddings-benchmark/mteb/tree/1.38.51) - Mizan is currently based on MTEB v1.38.51 (MTEB v2.0.0 support coming soon)
        - **Github**: [malibayram/tokenizer_benchmark](https://github.com/malibayram/tokenizer_benchmark) - Tokenizer evaluation is done with code from this repository, developed by Mehmet Ali Bayram, which utilizes ITU NLP tools for Turkish linguistic analysis.
        """)
|
|
|