# File size: 3,302 Bytes
# 7b295db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import ollama
import os
from tqdm import tqdm
import copy
import pandas as pd

def _parse_qa_pairs(response_text):
    """Extract ``(question, answer)`` tuples from a ``Q: ... A: ...`` reply.

    The text is cut on every ``"Q: "`` marker; a chunk is kept only when it
    also holds an ``"A: "`` marker. Surrounding whitespace is stripped and
    chunks whose question or answer ends up empty are discarded.
    """
    pairs = []
    for chunk in response_text.split("Q: "):
        if "A: " not in chunk:
            continue
        # Prefer an answer marker at the start of a line; fall back to an
        # inline one when the model put question and answer on one line.
        marker = "\nA: " if "\nA: " in chunk else "A: "
        # partition() cuts on the first marker only, so an answer that itself
        # contains "A: " can no longer yield more than two fields.
        question, _, answer = chunk.partition(marker)
        question, answer = question.strip(), answer.strip()
        if question and answer:
            pairs.append((question, answer))
    return pairs


def ask_a_question(path_document, df_QA, nb_question=3, model="llama3"):
    """Ask the LLM for question/answer pairs about one JSON document.

    Args:
        path_document: Path of the JSON file to load. The decoded object must
            expose a ``"url"`` key.
        df_QA: DataFrame accumulating results. Each new row is built with
            ``df_QA.columns`` and holds question, answer and URL, in that order.
        nb_question: Number of pairs requested in the prompt.
        model: Name of the Ollama model to query.

    Returns:
        A DataFrame with the newly parsed rows placed before the rows of
        ``df_QA``; ``df_QA`` itself when no pair could be parsed.

    Raises:
        KeyError: If the document has no ``"url"`` entry.
    """
    # Explicit UTF-8 so accented text does not depend on the platform locale.
    with open(path_document, encoding="utf-8") as json_file:
        document = json.load(json_file)
    prompt = f"""Task: Generate {nb_question} question and answer pairs in french based on the content of the following document.
Constraint: Each answer must be a direct excerpt (verbatim or nearly verbatim) from the document. Do not summarize or paraphrase.
Format:
Q: [Your question]
A: [Exact excerpt from the document]

Document:
{document}
    """

    # Read before the model call so a missing key fails before any inference.
    url = document["url"]

    response = ollama.chat(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )
    content = response["message"]["content"]

    rows = [[question, answer, url] for question, answer in _parse_qa_pairs(content)]
    if not rows:
        return df_QA

    # Rows are reversed so the result keeps the historical ordering, where
    # each pair was prepended one at a time (last parsed pair ends up first).
    new_rows = pd.DataFrame(rows[::-1], columns=df_QA.columns)
    return pd.concat([new_rows, df_QA], ignore_index=True)

def remove_menu_files(paths):
    """Drop every path whose stem is strictly contained in another path's stem.

    The stem is the path without its last five characters (the ``.json``
    suffix). A path is removed when its stem is a substring of a different
    stem from the same list. The input list is left untouched; a shallow copy
    with the matching entries removed is returned.
    """
    stems = [path[:-5] for path in paths]
    remaining = copy.copy(paths)
    for path, stem in zip(paths, stems):
        has_longer_sibling = any(
            stem != other and stem in other for other in stems
        )
        if has_longer_sibling:
            remaining.remove(path)
    return remaining

def get_list_dir(DATA_DIR):
    """Return the names of the entries of ``DATA_DIR`` that are directories."""

    def is_directory(name):
        return os.path.isdir(os.path.join(DATA_DIR, name))

    return [name for name in os.listdir(DATA_DIR) if is_directory(name)]

def get_data_paths(DATA_DIR):
    """Collect the ``.json`` files found directly inside each sub-directory.

    Only the first level of sub-directories of ``DATA_DIR`` (as reported by
    ``get_list_dir``) is inspected. Each returned path is ``DATA_DIR`` joined
    with the sub-directory name and the file name.
    """
    json_paths = []
    for subdir in get_list_dir(DATA_DIR):
        folder = os.path.join(DATA_DIR, subdir)
        json_paths.extend(
            os.path.join(folder, entry)
            for entry in os.listdir(folder)
            if entry.endswith(".json")
        )
    return json_paths

def _is_candidate_document(path):
    """Tell whether *path* is a JSON document that should receive questions."""
    # The exclusion of ".json" and "Accueil.json" must look at the file name:
    # paths produced by get_data_paths always carry directory components, so
    # comparing the whole path against a bare name never matched anything.
    file_name = os.path.basename(path)
    return (
        "actualite" not in path
        and "!trash" not in path
        and file_name not in (".json", "Accueil.json")
        and path.endswith(".json")
    )


def ask_question_on_multiple_doc(data_path, filename_save, nb_question=3):
    """Generate Q/A pairs for every eligible document and save them as CSV.

    Args:
        data_path: Root directory handed to ``get_data_paths``.
        filename_save: Destination CSV path; written without the index column.
        nb_question: Number of pairs requested per document.

    Documents are skipped when their path contains ``"actualite"`` or
    ``"!trash"``, when their file name is ``".json"`` or ``"Accueil.json"``,
    or when ``remove_menu_files`` filters them out.
    """
    df_QA = pd.DataFrame(columns=['Question', 'Réponse', 'Lien du site'])

    candidates = [f for f in get_data_paths(data_path) if _is_candidate_document(f)]
    path_files = remove_menu_files(candidates)

    for pf in tqdm(path_files):
        df_QA = ask_a_question(pf, df_QA, nb_question)

    df_QA.to_csv(filename_save, index=False)

if __name__ == "__main__":
    # Script entry point: request one Q/A pair per document found under
    # "data_websites/" and store the result in "QA_generated.csv".
    ask_question_on_multiple_doc(
        data_path="data_websites/",
        filename_save="QA_generated.csv",
        nb_question=1,
    )