Spaces:
Runtime error
Runtime error
Anonymous
committed on
Commit
·
707f578
1
Parent(s):
7c008e6
changes
Browse files
- app.py +1 -1
- requirements.txt +3 -1
- tasks/summarization.py +20 -20
app.py
CHANGED
|
@@ -41,7 +41,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 41 |
with gr.Row():
|
| 42 |
task = gr.Dropdown(label="Task", choices=list(tasks_datasets.keys()), value=QA)
|
| 43 |
language = gr.Dropdown(label="Source Language", choices=languages, value="English")
|
| 44 |
-
model_type = gr.Dropdown(label="Model Type", choices=["Multilingual", "English"], value='
|
| 45 |
config_recommendation = gr.Button("Recommend Configuration")
|
| 46 |
with gr.Row():
|
| 47 |
config_prompt = gr.Textbox(label="Recommended Configuration", interactive=False,
|
|
|
|
| 41 |
with gr.Row():
|
| 42 |
task = gr.Dropdown(label="Task", choices=list(tasks_datasets.keys()), value=QA)
|
| 43 |
language = gr.Dropdown(label="Source Language", choices=languages, value="English")
|
| 44 |
+
model_type = gr.Dropdown(label="Model Type", choices=["Multilingual", "English-Centric"], value='Multilingual')
|
| 45 |
config_recommendation = gr.Button("Recommend Configuration")
|
| 46 |
with gr.Row():
|
| 47 |
config_prompt = gr.Textbox(label="Recommended Configuration", interactive=False,
|
requirements.txt
CHANGED
|
@@ -3,4 +3,6 @@ numpy
|
|
| 3 |
datasets
|
| 4 |
easygoogletranslate
|
| 5 |
evaluate
|
| 6 |
-
langchain
|
|
|
|
|
|
|
|
|
| 3 |
datasets
|
| 4 |
easygoogletranslate
|
| 5 |
evaluate
|
| 6 |
+
langchain
|
| 7 |
+
tqdm
|
| 8 |
+
iso639
|
tasks/summarization.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
from typing import List, Dict, Optional, Union
|
| 2 |
-
|
| 3 |
import numpy as np
|
| 4 |
from datasets import Dataset, load_dataset
|
| 5 |
from easygoogletranslate import EasyGoogleTranslate
|
| 6 |
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
|
| 7 |
-
|
| 8 |
LANGUAGE_TO_SUFFIX = {
|
| 9 |
"chinese_simplified": "zh-CN",
|
| 10 |
"french": "fr",
|
|
@@ -24,12 +23,13 @@ LANGUAGE_TO_SUFFIX = {
|
|
| 24 |
"persian": "fa",
|
| 25 |
"azerbaijani": "az",
|
| 26 |
"korean": "ko",
|
|
|
|
| 27 |
}
|
| 28 |
|
|
|
|
| 29 |
def choose_few_shot_examples(
|
| 30 |
train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
|
| 31 |
) -> List[Dict[str, Union[str, int]]]:
|
| 32 |
-
|
| 33 |
selected_examples = []
|
| 34 |
|
| 35 |
example_idxs = []
|
|
@@ -56,15 +56,15 @@ def choose_few_shot_examples(
|
|
| 56 |
def _translate_instruction(basic_instruction: str, target_language: str) -> str:
|
| 57 |
translator = EasyGoogleTranslate(
|
| 58 |
source_language="en",
|
| 59 |
-
target_language=
|
| 60 |
timeout=50,
|
| 61 |
)
|
| 62 |
return translator.translate(basic_instruction)
|
| 63 |
|
| 64 |
|
| 65 |
def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
|
| 66 |
-
translator = EasyGoogleTranslate(source_language=
|
| 67 |
-
target_language=
|
| 68 |
timeout=30)
|
| 69 |
try:
|
| 70 |
return {'text': translator.translate(example['text']), 'summary': ''}
|
|
@@ -85,20 +85,20 @@ def create_instruction(lang: str, expected_output: str):
|
|
| 85 |
)
|
| 86 |
|
| 87 |
|
| 88 |
-
def load_xlsum_data(lang, split, limit
|
| 89 |
"""Loads the xlsum dataset"""
|
| 90 |
dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
|
| 91 |
return dataset.select(range(limit))
|
| 92 |
|
| 93 |
|
| 94 |
def construct_prompt(
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
):
|
| 103 |
if not instruction:
|
| 104 |
print(lang)
|
|
@@ -110,14 +110,14 @@ def construct_prompt(
|
|
| 110 |
|
| 111 |
zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
|
| 112 |
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
print(test_data)
|
| 116 |
-
print(num_examples)
|
| 117 |
-
print(lang)
|
| 118 |
ic_examples = []
|
| 119 |
if not zero_shot:
|
| 120 |
-
|
| 121 |
ic_examples = choose_few_shot_examples(
|
| 122 |
train_dataset=test_data,
|
| 123 |
few_shot_size=num_examples,
|
|
@@ -139,7 +139,7 @@ def construct_prompt(
|
|
| 139 |
)
|
| 140 |
|
| 141 |
print("lang", lang)
|
| 142 |
-
print(config["input"]
|
| 143 |
if config["input"] != lang:
|
| 144 |
test_example = _translate_example(
|
| 145 |
example=test_example, src_language=lang, target_language=config["input"]
|
|
|
|
| 1 |
from typing import List, Dict, Optional, Union
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
from datasets import Dataset, load_dataset
|
| 4 |
from easygoogletranslate import EasyGoogleTranslate
|
| 5 |
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
|
| 6 |
+
from iso639 import to_iso639_1
|
| 7 |
LANGUAGE_TO_SUFFIX = {
|
| 8 |
"chinese_simplified": "zh-CN",
|
| 9 |
"french": "fr",
|
|
|
|
| 23 |
"persian": "fa",
|
| 24 |
"azerbaijani": "az",
|
| 25 |
"korean": "ko",
|
| 26 |
+
"hebrew": "he",
|
| 27 |
}
|
| 28 |
|
| 29 |
+
|
| 30 |
def choose_few_shot_examples(
|
| 31 |
train_dataset: Dataset, few_shot_size: int, context: List[str], selection_criteria: str, lang: str,
|
| 32 |
) -> List[Dict[str, Union[str, int]]]:
|
|
|
|
| 33 |
selected_examples = []
|
| 34 |
|
| 35 |
example_idxs = []
|
|
|
|
| 56 |
def _translate_instruction(basic_instruction: str, target_language: str) -> str:
|
| 57 |
translator = EasyGoogleTranslate(
|
| 58 |
source_language="en",
|
| 59 |
+
target_language=to_iso639_1(target_language),
|
| 60 |
timeout=50,
|
| 61 |
)
|
| 62 |
return translator.translate(basic_instruction)
|
| 63 |
|
| 64 |
|
| 65 |
def _translate_example(example: Dict[str, str], src_language: str, target_language: str):
|
| 66 |
+
translator = EasyGoogleTranslate(source_language=to_iso639_1(str(src_language).capitalize()),
|
| 67 |
+
target_language=to_iso639_1(str(target_language).capitalize()),
|
| 68 |
timeout=30)
|
| 69 |
try:
|
| 70 |
return {'text': translator.translate(example['text']), 'summary': ''}
|
|
|
|
| 85 |
)
|
| 86 |
|
| 87 |
|
| 88 |
+
def load_xlsum_data(lang, split, limit=5):
|
| 89 |
"""Loads the xlsum dataset"""
|
| 90 |
dataset = load_dataset("csebuetnlp/xlsum", lang)[split]
|
| 91 |
return dataset.select(range(limit))
|
| 92 |
|
| 93 |
|
| 94 |
def construct_prompt(
|
| 95 |
+
instruction: str,
|
| 96 |
+
test_example: dict,
|
| 97 |
+
zero_shot: bool,
|
| 98 |
+
dataset: str,
|
| 99 |
+
num_examples: int,
|
| 100 |
+
lang: str,
|
| 101 |
+
config: Dict[str, str],
|
| 102 |
):
|
| 103 |
if not instruction:
|
| 104 |
print(lang)
|
|
|
|
| 110 |
|
| 111 |
zero_shot_template = f"""{instruction}""" + "\n Input: {text} " ""
|
| 112 |
|
| 113 |
+
if not zero_shot:
|
| 114 |
+
try:
|
| 115 |
+
test_data = load_xlsum_data(lang=lang, split="test", limit=100)
|
| 116 |
+
except Exception as e:
|
| 117 |
+
raise KeyError(f"{lang} is not supported in XlSum dataset, choose supported language in few-shot")
|
| 118 |
|
|
|
|
|
|
|
|
|
|
| 119 |
ic_examples = []
|
| 120 |
if not zero_shot:
|
|
|
|
| 121 |
ic_examples = choose_few_shot_examples(
|
| 122 |
train_dataset=test_data,
|
| 123 |
few_shot_size=num_examples,
|
|
|
|
| 139 |
)
|
| 140 |
|
| 141 |
print("lang", lang)
|
| 142 |
+
print(config["input"], lang)
|
| 143 |
if config["input"] != lang:
|
| 144 |
test_example = _translate_example(
|
| 145 |
example=test_example, src_language=lang, target_language=config["input"]
|