Spaces:

t-pris
/

Agent_UB

Sleeping

App Files Files Community

Agent_UB / generate_dataset_tests.py

t-pris

Upload folder using huggingface_hub

7b295db verified 3 months ago

raw

history blame contribute delete

3.3 kB

	import json
	import ollama
	import os
	from tqdm import tqdm
	import copy
	import pandas as pd

	def _parse_qa_pairs(response_text):
	    """Extract ``[question, answer]`` pairs from a ``Q: ... A: ...`` reply.

	    The text is cut at every ``Q: `` marker. Inside each piece the answer
	    begins at the first ``A: `` marker that starts a line, or, when there is
	    none, at the first inline ``A: `` marker. Only that first marker is
	    used as a separator, so an answer that itself contains ``A: `` stays in
	    one piece. Surrounding whitespace is stripped, and pieces without a
	    non-empty question and a non-empty answer are ignored.
	    """
	    pairs = []
	    for chunk in response_text.split("Q: "):
	        if "A: " not in chunk:
	            # Introductory text, or a question the model left unanswered.
	            continue
	        separator = "\nA: " if "\nA: " in chunk else "A: "
	        # partition() splits once; an unbounded split() could return more
	        # than two fields and break the fixed three-column row layout.
	        question, _, answer = chunk.partition(separator)
	        question, answer = question.strip(), answer.strip()
	        if question and answer:
	            pairs.append([question, answer])
	    return pairs


	def ask_a_question(path_document, df_QA, nb_question=3):
	    """Generate question/answer pairs for one JSON document with llama3.

	    The document is loaded from ``path_document``, embedded in a prompt that
	    asks the model (through ``ollama.chat``) for ``nb_question`` French
	    question/answer pairs, and the reply is parsed into rows.

	    Args:
	        path_document: Path of a JSON file; the decoded object must have a
	            ``"url"`` key.
	        df_QA: DataFrame with three columns (question, answer, source URL)
	            to which the new rows are added.
	        nb_question: Number of pairs requested in the prompt. The model may
	            return fewer or more; every well-formed pair is kept.

	    Returns:
	        A new DataFrame with the parsed rows placed before the rows of
	        ``df_QA`` (last parsed pair first), or ``df_QA`` itself when the
	        reply contains no usable pair.

	    Raises:
	        KeyError: If the document has no ``"url"`` entry.
	    """
	    # JSON is UTF-8 by specification; do not depend on the platform's
	    # default text encoding when reading it.
	    with open(path_document, encoding="utf-8") as json_file:
	        document = json.load(json_file)

	    prompt = f"""Task: Generate {nb_question} question and answer pairs in french based on the content of the following document.
	Constraint: Each answer must be a direct excerpt (verbatim or nearly verbatim) from the document. Do not summarize or paraphrase.
	Format:
	Q: [Your question]
	A: [Exact excerpt from the document]

	Document:
	{document}
	"""

	    url = document["url"]

	    reply = ollama.chat(
	        model="llama3",
	        messages=[{"role": "user", "content": prompt}],
	    )

	    rows = [pair + [url] for pair in _parse_qa_pairs(reply["message"]["content"])]
	    if not rows:
	        return df_QA

	    # New rows go on top with the last parsed pair first, which is the row
	    # order obtained by prepending the pairs one at a time.
	    new_rows = pd.DataFrame(rows[::-1], columns=df_QA.columns)
	    return pd.concat([new_rows, df_QA], ignore_index=True)

	def remove_menu_files(paths):
	    """Return a copy of ``paths`` without the entries nested inside another one.

	    Each path is compared without its last five characters (the ``.json``
	    suffix is assumed). A path is dropped when its shortened form differs
	    from, and occurs as a substring of, the shortened form of any other
	    path in ``paths``. The input list is left unmodified and the order of
	    the remaining paths is preserved.
	    """
	    kept = copy.copy(paths)
	    stems = [path[:-5] for path in paths]
	    for path, stem in zip(paths, stems):
	        contained_elsewhere = any(
	            stem != other and stem in other for other in stems
	        )
	        if contained_elsewhere:
	            kept.remove(path)
	    return kept

	def get_list_dir(DATA_DIR):
	    """Return the names of the entries of ``DATA_DIR`` that are directories.

	    Only bare names are returned (not joined with ``DATA_DIR``), in the
	    order produced by ``os.listdir``.
	    """
	    subdirectories = []
	    for entry in os.listdir(DATA_DIR):
	        if os.path.isdir(os.path.join(DATA_DIR, entry)):
	            subdirectories.append(entry)
	    return subdirectories

	def get_data_paths(DATA_DIR):
	    """Collect the ``.json`` files found directly inside each subdirectory of ``DATA_DIR``.

	    Only one level is inspected: files placed directly in ``DATA_DIR`` and
	    files in deeper subdirectories are not listed. Each returned path is
	    ``DATA_DIR`` joined with the subdirectory name and the file name.
	    """
	    json_paths = []
	    for subdir in get_list_dir(DATA_DIR):
	        folder = os.path.join(DATA_DIR, subdir)
	        json_paths.extend(
	            os.path.join(folder, name)
	            for name in os.listdir(folder)
	            if name.endswith(".json")
	        )
	    return json_paths

	def ask_question_on_multiple_doc(data_path, filename_save, nb_question=3):
	    """Generate question/answer pairs for the documents under ``data_path`` and save them as CSV.

	    The JSON files returned by ``get_data_paths(data_path)`` are filtered,
	    passed through ``remove_menu_files``, and each remaining file is sent to
	    ``ask_a_question``. The accumulated table is written once, at the end,
	    to ``filename_save`` without the index column.

	    A file is skipped when its path contains ``"actualite"`` or ``"!trash"``,
	    or when its file name is exactly ``".json"`` or ``"Accueil.json"``.

	    Args:
	        data_path: Root directory whose subdirectories hold the JSON files.
	        filename_save: Destination path of the CSV file.
	        nb_question: Number of pairs requested from the model per document.
	    """
	    df_QA = pd.DataFrame(columns=['Question', 'Réponse', 'Lien du site'])

	    # These names must be tested against the bare file name: every path
	    # from get_data_paths includes its directory, so comparing the whole
	    # path with "Accueil.json" or ".json" could never match.
	    excluded_names = {".json", "Accueil.json"}
	    path_files = [
	        f for f in get_data_paths(data_path)
	        if "actualite" not in f
	        and "!trash" not in f
	        and os.path.basename(f) not in excluded_names
	        and f.endswith(".json")
	    ]
	    path_files = remove_menu_files(path_files)

	    for pf in tqdm(path_files):
	        df_QA = ask_a_question(pf, df_QA, nb_question)

	    df_QA.to_csv(filename_save, index=False)

	if __name__ == "__main__":
	    # Script entry point: request one question/answer pair per document
	    # found under data_websites/ and write the result to QA_generated.csv.
	    path_document, QA_filename, NB_QA = "data_websites/", "QA_generated.csv", 1
	    ask_question_on_multiple_doc(
	        data_path=path_document,
	        filename_save=QA_filename,
	        nb_question=NB_QA,
	    )