|
|
import json |
|
|
import ollama |
|
|
import os |
|
|
from tqdm import tqdm |
|
|
import copy |
|
|
import pandas as pd |
|
|
|
|
|
def ask_a_question(path_document, df_QA, nb_question=3):
    """Generate French Q/A pairs for one JSON document and prepend them to df_QA.

    Sends the document content to a local llama3 model through ollama and
    parses the "Q: ... / A: ..." formatted reply.

    Args:
        path_document: Path to a JSON file; the parsed object must contain a
            "url" key (used as the source link for each generated pair).
        df_QA: DataFrame with three columns (question, answer, source URL).
        nb_question: Number of Q/A pairs requested from the model.

    Returns:
        A new DataFrame with the parsed pairs prepended to df_QA.
    """
    # Documents are French web pages: force UTF-8 instead of the locale default.
    with open(path_document, encoding="utf-8") as json_file:
        document = json.load(json_file)

    prompt = f"""Task: Generate {nb_question} question and answer pairs in french based on the content of the following document.
Constraint: Each answer must be a direct excerpt (verbatim or nearly verbatim) from the document. Do not summarize or paraphrase.
Format:
Q: [Your question]
A: [Exact excerpt from the document]

Document:
{document}
"""

    URL = document["url"]

    response = ollama.chat(
        model="llama3",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )

    response = response["message"]["content"]

    # Each chunk after "Q: " should hold one question followed by its answer.
    res = response.split("Q: ")

    for r in res:
        if "A: " in r:
            qa = r.split("\nA: ")
            if len(qa) == 1:
                # The model sometimes omits the newline before "A: ".
                qa = qa[0].split("A: ")
            if len(qa) != 2:
                # Malformed chunk (e.g. several "A:" markers): skipping it is
                # safer than letting the DataFrame construction fail on a
                # column-count mismatch with df_QA's three columns.
                continue
            # Strip split() leftovers (trailing newlines/spaces) before storing.
            row = [qa[0].strip(), qa[1].strip(), URL]

            df_QA = pd.concat(
                [pd.DataFrame([row], columns=df_QA.columns), df_QA],
                ignore_index=True,
            )

    return df_QA
|
|
|
|
|
def remove_menu_files(paths):
    """Drop paths whose name (without ".json") occurs inside another path's name.

    Such shorter paths correspond to menu/landing pages whose content is
    duplicated by more specific sub-pages; only the sub-pages are kept.
    """
    kept = []
    for candidate in paths:
        stem = candidate[:-5]  # strip the ".json" suffix
        shadowed = any(
            stem != other[:-5] and stem in other[:-5]
            for other in paths
        )
        if not shadowed:
            kept.append(candidate)
    return kept
|
|
|
|
|
def get_list_dir(DATA_DIR):
    """Return the names of the sub-directories located directly under DATA_DIR."""
    return [
        entry
        for entry in os.listdir(DATA_DIR)
        if os.path.isdir(os.path.join(DATA_DIR, entry))
    ]
|
|
|
|
|
def get_data_paths(DATA_DIR):
    """Return the paths of every .json file found in each sub-directory of DATA_DIR."""
    collected = []
    for directory in get_list_dir(DATA_DIR):
        base = os.path.join(DATA_DIR, directory)
        json_names = (
            name for name in os.listdir(base) if name.endswith(".json")
        )
        collected.extend(os.path.join(base, name) for name in json_names)
    return collected
|
|
|
|
|
def ask_question_on_multiple_doc(data_path, filename_save, nb_question=3):
    """Generate Q/A pairs for every relevant document under data_path, save as CSV.

    Args:
        data_path: Root directory holding one sub-directory of .json files per site.
        filename_save: Destination CSV path.
        nb_question: Number of Q/A pairs requested per document.
    """
    df_QA = pd.DataFrame(columns=['Question', 'Réponse', 'Lien du site'])

    files = get_data_paths(data_path)

    # Keep only content pages: drop news pages ("actualite"), trash folders and
    # index/empty pages.  NOTE: the previous version compared the *full joined
    # path* against ".json" / "Accueil.json", which could never match since
    # get_data_paths returns directory-prefixed paths -- the basename is the
    # value those filters were meant to test.
    path_files = [f for f in files
                  if "actualite" not in f
                  and "!trash" not in f
                  and os.path.basename(f) != ".json"
                  and os.path.basename(f) != "Accueil.json"
                  and f.endswith(".json")]

    # Menu/landing pages duplicate their sub-pages' content; remove them.
    path_files = remove_menu_files(path_files)

    for pf in tqdm(path_files):
        df_QA = ask_a_question(pf, df_QA, nb_question)

    df_QA.to_csv(filename_save, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":

    # Root directory: one sub-directory of scraped .json pages per website.
    path_document = "data_websites/"

    # Destination CSV for the generated question/answer pairs.
    QA_filename = "QA_generated.csv"

    # Number of Q/A pairs requested from the model for each document.
    NB_QA = 1

    ask_question_on_multiple_doc(path_document, QA_filename, NB_QA)