Agent_UB / generate_dataset_tests.py
t-pris's picture
Upload folder using huggingface_hub
7b295db verified
import json
import ollama
import os
from tqdm import tqdm
import copy
import pandas as pd
def ask_a_question(path_document, df_QA, nb_question=3):
    """Generate question/answer pairs for one JSON document with a local LLM.

    The document is loaded from ``path_document``, embedded in a prompt sent
    to the ``llama3`` model through ``ollama.chat``, and the reply is parsed
    for ``Q: ... A: ...`` pairs.

    Args:
        path_document: Path to a JSON file. The parsed content is indexed
            with ``"url"``, so it is expected to be a JSON object holding
            that key.
        df_QA: DataFrame accumulating the results. Each new row carries
            three values (question, answer, url), so it is expected to have
            three columns.
        nb_question: Number of question/answer pairs requested in the prompt.

    Returns:
        A DataFrame in which the newly parsed rows are placed before the
        rows of ``df_QA`` (latest pair first). ``df_QA`` itself is returned
        when no pair could be parsed from the reply.

    Raises:
        KeyError: If the document has no ``"url"`` entry.
    """
    # The documents hold French text: decode explicitly as UTF-8 instead of
    # relying on the platform default encoding.
    with open(path_document, encoding="utf-8") as json_file:
        document = json.load(json_file)
    prompt = f"""Task: Generate {nb_question} question and answer pairs in french based on the content of the following document.
Constraint: Each answer must be a direct excerpt (verbatim or nearly verbatim) from the document. Do not summarize or paraphrase.
Format:
Q: [Your question]
A: [Exact excerpt from the document]
Document:
{document}
"""
    url = document["url"]
    reply = ollama.chat(
        model="llama3",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )
    reply_text = reply["message"]["content"]

    rows = []
    for chunk in reply_text.split("Q: "):
        if "A: " not in chunk:
            continue
        # Split on the FIRST answer marker only: an answer that itself
        # contains "A: " must not produce extra columns.
        question, separator, answer = chunk.partition("\nA: ")
        if not separator:
            # Question and answer were emitted on the same line.
            question, _, answer = chunk.partition("A: ")
        question = question.strip()
        answer = answer.strip()
        if question and answer:
            rows.append([question, answer, url])

    if not rows:
        return df_QA
    # Rows are reversed so that, as before, the most recently parsed pair
    # ends up first, ahead of the rows already accumulated.
    new_rows = pd.DataFrame(rows[::-1], columns=df_QA.columns)
    return pd.concat([new_rows, df_QA], ignore_index=True)
def remove_menu_files(paths):
    """Return a copy of ``paths`` without the entries nested under another one.

    Each path is compared without its last five characters (the length of
    ``".json"``). A path is dropped when that stem is contained in the stem
    of a different entry -- presumably a menu page that has sub-pages.
    The input list is left untouched and the original order is kept.
    """
    stems = [path[:-5] for path in paths]

    def has_longer_sibling(stem):
        # True when another entry's stem differs from this one yet contains it.
        return any(stem != other and stem in other for other in stems)

    return [
        path
        for path, stem in zip(paths, stems)
        if not has_longer_sibling(stem)
    ]
def get_list_dir(DATA_DIR):
    """Return the names of the entries of ``DATA_DIR`` that are directories.

    Only names are returned (not joined paths), in ``os.listdir`` order.
    """

    def is_directory(entry):
        return os.path.isdir(os.path.join(DATA_DIR, entry))

    return list(filter(is_directory, os.listdir(DATA_DIR)))
def get_data_paths(DATA_DIR):
    """List the ``.json`` files found directly inside each sub-directory.

    Sub-directories of ``DATA_DIR`` are obtained from ``get_list_dir``; the
    search does not descend further than that one level. Every returned
    path is ``DATA_DIR`` joined with the sub-directory and the file name.
    """
    json_paths = []
    for sub_dir in get_list_dir(DATA_DIR):
        folder = os.path.join(DATA_DIR, sub_dir)
        json_paths.extend(
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.endswith(".json")
        )
    return json_paths
def ask_question_on_multiple_doc(data_path, filename_save, nb_question=3):
    """Generate question/answer pairs for every eligible document and save them.

    Documents are the ``.json`` files returned by ``get_data_paths``. A file
    is skipped when its path contains ``"actualite"`` or ``"!trash"``, when
    its file name is ``".json"`` or ``"Accueil.json"``, or when
    ``remove_menu_files`` filters it out.

    Args:
        data_path: Root directory passed to ``get_data_paths``.
        filename_save: Destination CSV path (written without the index).
        nb_question: Number of pairs requested per document.

    Returns:
        None. The result is written to ``filename_save``.
    """
    df_QA = pd.DataFrame(columns=['Question', 'Réponse', 'Lien du site'])
    excluded_names = {".json", "Accueil.json"}
    path_files = [
        f for f in get_data_paths(data_path)
        if f.endswith(".json")
        and "actualite" not in f
        and "!trash" not in f
        # ``f`` is a joined path, so it can never equal a bare file name:
        # the exclusion has to look at the base name.
        and os.path.basename(f) not in excluded_names
    ]
    path_files = remove_menu_files(path_files)
    for pf in tqdm(path_files):
        df_QA = ask_a_question(pf, df_QA, nb_question)
        # Save after each document so that a failure in a later (slow) LLM
        # call does not lose the pairs generated so far.
        df_QA.to_csv(filename_save, index=False)
    if not path_files:
        # Still produce a header-only file when no document qualifies.
        df_QA.to_csv(filename_save, index=False)
if __name__ == "__main__":
    # Folder whose sub-directories contain the JSON documents.
    path_document = "data_websites/"
    # CSV file receiving the generated question/answer pairs.
    QA_filename = "QA_generated.csv"
    # Number of pairs requested for each document.
    NB_QA = 1
    ask_question_on_multiple_doc(
        data_path=path_document,
        filename_save=QA_filename,
        nb_question=NB_QA,
    )