| | import json |
| | from typing import Any, Dict, List |
| |
|
| | from distilabel.steps.tasks.typing import ChatType |
| | from distilabel.steps.tasks.text_generation import TextGeneration |
| | from distilabel.steps import StepInput, StepOutput, Step |
| |
|
| | from dotenv import load_dotenv |
| |
|
| | from defaults import ( |
| | DEFAULT_DOMAIN, |
| | DEFAULT_PERSPECTIVES, |
| | DEFAULT_TOPICS, |
| | DEFAULT_EXAMPLES, |
| | DEFAULT_SYSTEM_PROMPT, |
| | N_PERSPECTIVES, |
| | N_TOPICS, |
| | N_EXAMPLES, |
| | ) |
| |
|
# Read a local ``.env`` file (if one exists) into the process environment
# (python-dotenv).
load_dotenv()
| |
|
| | |
# Prompt text describing the query-generating assistant. DEFAULT_DOMAIN is
# interpolated once, when this module is imported.
# NOTE(review): the wording contains typos ("than generates", "Your should");
# it is left untouched here because it is runtime text — confirm before fixing.
APPLICATION_DESCRIPTION = f"""You are an AI assistant than generates queries around the domain of {DEFAULT_DOMAIN}.
Your should not expect basic but profound questions from your users.
The queries should reflect a diversity of vision and economic positions and political positions.
The queries may know about different methods of {DEFAULT_DOMAIN}.
The queries can be positioned politically, economically, socially, or practically.
Also take into account the impact of diverse causes on diverse domains."""
| |
|
| |
|
# Keep only the leading N_* entries of each default collection
# (assumes the N_* values are non-negative integers — TODO confirm in defaults).
TOPICS = DEFAULT_TOPICS[:N_TOPICS]
PERSPECTIVES = DEFAULT_PERSPECTIVES[:N_PERSPECTIVES]
EXAMPLES = DEFAULT_EXAMPLES[:N_EXAMPLES]
| |
|
| |
|
def create_examples_template(examples: List[Dict[str, str]]) -> str:
    """Build a ``str.format`` prompt template that embeds few-shot examples.

    The result starts with an ``{instruction}`` placeholder, followed by every
    example question and then every example answer.

    Args:
        examples: Mappings that each provide a ``"question"`` and an
            ``"answer"`` entry.

    Returns:
        A single template string (not a list) whose only replacement field is
        ``{instruction}``.

    Raises:
        KeyError: If an example lacks the ``"question"`` or ``"answer"`` key.
    """

    def _literal(text: str) -> str:
        # Double the braces so example text containing "{" or "}" survives a
        # later ``template.format(...)`` call as literal text instead of being
        # parsed as a replacement field.
        return str(text).replace("{", "{{").replace("}", "}}")

    question_parts = [" Examples of high quality questions:"]
    answer_parts = [" Examples of high quality answers:"]
    for example in examples:
        question_parts.append(f"\n- Question: {_literal(example['question'])}\n")
        answer_parts.append(f"\n- Answer: {_literal(example['answer'])}\n")

    # Joining once avoids repeated string concatenation inside the loop.
    return (
        "{instruction}\nThis is the the instruction.\n Examples: "
        + "".join(question_parts)
        + "".join(answer_parts)
    )
| |
|
| |
|
def create_topics(topics: List[str], positions: List[str]) -> List[str]:
    """Pair every topic with every position as a short phrase.

    Args:
        topics: Subject names.
        positions: Viewpoint names combined with each topic.

    Returns:
        One ``"<topic> from a <position> perspective"`` string per
        (topic, position) pair, grouped by topic in input order.
    """
    combined: List[str] = []
    for subject in topics:
        combined.extend(
            f"{subject} from a {viewpoint} perspective" for viewpoint in positions
        )
    return combined
| |
|
| |
|
class DomainExpert(TextGeneration):
    """Text-generation task pairing a fixed system prompt with a templated user message."""

    # System message placed first in every chat built by ``format_input``.
    _system_prompt: str = DEFAULT_SYSTEM_PROMPT
    # ``str.format`` template for the user message; filled from the input row.
    _template: str = "{instruction}\nThis is the the instruction.\n Examples: "

    def format_input(self, input: Dict[str, Any]) -> "ChatType":
        """Turn one input row into a two-message chat.

        Args:
            input: Row whose entries are passed as keyword arguments to
                ``_template.format``; it must supply every field the template
                names.

        Returns:
            A list with the system message followed by the user message.
        """
        system_turn = {"role": "system", "content": self._system_prompt}
        user_turn = {"role": "user", "content": self._template.format(**input)}
        return [system_turn, user_turn]
| |
|
| |
|
class CleanNumberedList(Step):
    """Step that strips a leading ``"<digits>. "`` marker from each question."""

    def process(self, inputs: StepInput) -> StepOutput:
        """Remove list numbering from every row's ``"question"`` in place.

        Args:
            inputs: Batch of rows; each row holds a string under ``"question"``.

        Yields:
            The same ``inputs`` batch object, once, after all rows are cleaned.
        """
        import re

        # Digits, a literal dot and one whitespace character at the very start.
        numbering = re.compile(r"^\d+\.\s")

        for row in inputs:
            row["question"] = numbering.sub("", row["question"])
        yield inputs
| |
|