| | import json |
| | import string |
| |
|
| | import wikipedia |
| | from langchain import PromptTemplate |
| | from langchain.vectorstores import Chroma |
| | from langchain.text_splitter import CharacterTextSplitter |
| |
|
| | from src.tools.llms import openai_llm |
| | from src.tools.wiki import Wiki |
| |
|
| |
|
| |
|
| |
|
async def get_wikilist(task: dict) -> list:
    """
    Get the titles of wiki pages interesting for solving the given task.

    The LLM is asked for a JSON list of wikipedia page titles relevant to the
    paragraph described by ``task``; each proposed title is then expanded
    through ``wikipedia.search`` and the merged result is de-duplicated.

    :param task: mapping that must provide the keys ``'description'``,
        ``'doc_description'``, ``'above'``, ``'before'`` and ``'after'``
        (a missing key raises ``KeyError`` while the prompt is built).
    :return: list of page titles without duplicates, in first-seen order.

    NOTE(review): the function is declared ``async`` but awaits nothing;
    ``llm(...)`` and ``wikipedia.search(...)`` are plain calls here.
    """
    llm = openai_llm

    template = (f"\n"
                f" Your task consists in finding the list of wikipedia page titles which provide useful content "
                f" for a paragraph whose description is delimited by triple backticks: ```{task['description']}```\n"
                f" \n"
                f" The paragraph belongs at the top level of the hierarchy to a document"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" \n"
                f" Format your response as a JSON list of strings separated by commas.\n"
                f" \n"
                f"\n"
                f" ")

    # The prompt is already fully rendered by the f-string above, so it is sent
    # as-is. No PromptTemplate is built: it was never used, and validating
    # already-substituted text could fail on braces inside the task fields.
    llm_list = llm(template)
    wikilist = extract_list(llm_list)

    # Second argument passed to wikipedia.search for every proposed title
    # (the `results` count in the wikipedia package).
    expand_factor = 2

    expanded_wikilist = []
    for wikipage in wikilist:
        expanded_wikilist += wikipedia.search(wikipage, expand_factor)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # returned list is deterministic (a set would shuffle it arbitrarily).
    return list(dict.fromkeys(expanded_wikilist))
| |
|
| |
|
def extract_list(llm_list: str):
    """
    Extract the list of page titles from the raw LLM answer.

    The answer is expected to contain a JSON list of strings. The text between
    the first ``[`` and the last ``]`` is parsed with ``json.loads`` so that
    every title is kept verbatim (including short ones or ones containing
    digits/punctuation). When no valid JSON list can be found, the function
    falls back to splitting on double quotes and keeping the fragments that
    look like titles (longer than 2 characters, more than 3/4 ASCII letters).

    The raw answer and the extracted list are printed, as a debugging aid.

    :param llm_list: raw text returned by the LLM.
    :return: list of title strings; empty when nothing usable is found or
        when ``llm_list`` is not a string.
    """
    print(llm_list)

    def filter_(el: str):
        # Heuristic used only by the fallback path: separators such as ", "
        # are short or mostly punctuation, real titles are mostly letters.
        if len(el) <= 2:
            return False
        usable_length = sum(1 for c in el if c in string.ascii_letters)
        return len(el) * 3 / 4 < usable_length

    if not isinstance(llm_list, str):
        print('issues with the wikilist')
        return []

    # Preferred path: real JSON parsing of the bracketed part of the answer,
    # which tolerates surrounding prose, whitespace or code fences.
    start = llm_list.find('[')
    end = llm_list.rfind(']')
    if 0 <= start < end:
        try:
            parsed = json.loads(llm_list[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError
            parsed = None
        if isinstance(parsed, list):
            wikilist = [el.strip() for el in parsed
                        if isinstance(el, str) and el.strip()]
            print(wikilist)
            return wikilist

    # Fallback for malformed answers: quote splitting + title heuristic.
    wikilist = [el for el in llm_list[1:-1].split('"') if filter_(el)]
    print(wikilist)
    return wikilist
| |
|
| |
|
def get_public_paragraph(task: {}) -> str:
    """Return the paragraph for ``task`` as written directly by the LLM.

    The task is printed, a prompt is assembled from the task's
    ``'description'``, ``'doc_description'``, ``'above'``, ``'before'`` and
    ``'after'`` entries, and the value returned by ``openai_llm`` for that
    prompt is handed back unchanged.
    """
    print(task)

    # Prompt pieces, in order; they are concatenated without any separator.
    prompt_pieces = [
        "\n",
        " Your task consists in generating a paragraph\\n",
        f" whose description is delimited by triple backticks: ```{task['description']}```\n",
        "\n",
        " The paragraph belongs at the top level of the hierarchy to a document \\n",
        f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n",
        " Make sure that the paragraph relates the top level of the document\n",
        " \n",
        " The paragraph belongs to a higher paragraph in the hierarchy \\n",
        f" whose description is delimited by triple backticks: ``` {task['above']}```\n",
        " Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n",
        " \n",
        " The paragraphs comes after previous paragraphs \\n",
        f" whose description is delimited by triple backticks: ``` {task['before']}```\n",
        " Make sure that the paragraph relates with previous paragraph without any repetition\n",
        " \n",
        " The paragraphs comes before next paragraphs \\n",
        f" whose description is delimited by triple backticks: ``` {task['after']}```\n",
        " Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n",
        " \n",
        " \n",
        "\n",
        " ",
    ]
    full_prompt = "".join(prompt_pieces)

    return openai_llm(full_prompt)
| |
|
| |
|
def create_index(wikilist: [str]):
    """
    Create the Chroma index of the given wiki pages.

    Every title is fetched once through ``Wiki().fetch``. Results that come
    back as a plain ``str`` are skipped (presumably an error / not-found
    message -- TODO confirm against ``Wiki.fetch``). Each remaining page is
    wrapped in a ``WikiPage``, cut into paragraphs, split into chunks with a
    ``CharacterTextSplitter`` and all chunks of all pages are indexed.

    :param wikilist: titles of the wikipedia pages to index.
    :return: the object returned by ``Chroma.from_texts`` for the chunks.
    :raises ValueError: when none of the titles yields a usable page.
    """
    # NOTE(review): WikiPage is not among the imports visible at the top of
    # this file -- confirm it is in scope.
    fetch = Wiki().fetch
    chunk = 800

    texts = []
    for title in wikilist:
        page = fetch(title)  # one fetch per title
        if isinstance(page, str):
            continue
        texts.append(WikiPage(title=title, fulltext=page.page_content))

    if not texts:
        raise ValueError("create_index: no wiki page could be fetched for the given titles")

    doc_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=chunk,
        chunk_overlap=100,
        length_function=len,
    )

    # Index every fetched page, not only the first one.
    split_texts = []
    for text in texts:
        for p in text.get_paragraphs(chunk=chunk):
            split_texts += doc_splitter.split_text(p)

    # Sanity checks on what is about to be embedded.
    for split_text in split_texts:
        assert isinstance(split_text, str)
        assert 0 < len(split_text) < 2 * 500

    wiki_index = Chroma.from_texts(split_texts)

    return wiki_index
| |
|
| |
|
def get_wiki_paragraph(wiki_index, task: {}) -> str:
    """Generate the paragraph described by ``task`` from wiki excerpts.

    A first draft is produced by ``get_public_paragraph`` and used as the
    query for ``semantic_search`` on ``wiki_index``. The ``page_content`` of
    the returned items is then injected into the prompt sent to the LLM.

    :param wiki_index: index handed to ``semantic_search`` as-is
        (presumably the object built by ``create_index`` -- TODO confirm).
    :param task: mapping providing ``'description'``, ``'doc_description'``,
        ``'above'``, ``'before'`` and ``'after'``.
    :return: whatever ``openai_llm`` returns for the assembled prompt.
    """
    # NOTE(review): semantic_search is not defined or imported in the visible
    # part of this file -- confirm it is in scope.
    task_description = get_public_paragraph(task)
    wiki_paragraphs = semantic_search(wiki_index, task_description)

    # Each excerpt is followed by a real blank line ("\n\n"); the separator
    # used to be the literal text "/n/n".
    text_content = "".join(p.page_content + "\n\n" for p in wiki_paragraphs)

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
| |
|
| |
|
def get_private_paragraph(texts, task: {}) -> str:
    """Generate the paragraph described by ``task`` from the given texts.

    The strings in ``texts`` are concatenated and injected into the prompt as
    the source documents the LLM must base the paragraph on.

    :param texts: iterable of strings used as source material.
    :param task: mapping providing ``'description'``, ``'doc_description'``,
        ``'above'``, ``'before'`` and ``'after'``.
    :return: whatever ``openai_llm`` returns for the assembled prompt.
    """
    # Each text is followed by a real blank line ("\n\n"); the separator used
    # to be the literal text "/n/n".
    text_content = "".join(t + "\n\n" for t in texts)

    template = (f"\n"
                f" Your task consists in generating a paragraph\\n"
                f" whose description is delimited by triple backticks: ```{task['description']}```\n"
                f"\n"
                f" The text generation is based in the documents provided in these sections \n"
                f" delimited by by triple backticks: ``` {text_content}``` \n"
                f" The paragraph belongs at the top level of the hierarchy to a document \\n"
                f" whose description is delimited by triple backticks: ``` {task['doc_description']}```\n"
                f" Make sure that the paragraph relates the top level of the document\n"
                f" \n"
                f" The paragraph belongs to a higher paragraph in the hierarchy \\n"
                f" whose description is delimited by triple backticks: ``` {task['above']}```\n"
                f" Make sure that the paragraph relates with the paragraph in the hierarchy of the document\n"
                f" \n"
                f" The paragraphs comes after previous paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['before']}```\n"
                f" Make sure that the paragraph relates with previous paragraph without any repetition\n"
                f" \n"
                f" The paragraphs comes before next paragraphs \\n"
                f" whose description is delimited by triple backticks: ``` {task['after']}```\n"
                f" Make sure that the paragraph prepares the transition to the next paragraph without any repetition\n"
                f" \n"
                f" \n"
                f"\n"
                f" ")

    llm = openai_llm
    p = llm(template)

    return p
| |
|