Spaces:

gaia-mistral
/

volAI_Avril

Sleeping

volAI_Avril / scripts /RAG_Mistral.py

Florian.Moret

rangement du repo

f02db94 11 months ago

6.13 kB

	import os
	import numpy as np
	import fitz # PyMuPDF pour extraction PDF
	import faiss
	import pickle
	import matplotlib.pyplot as plt
	from concurrent.futures import ThreadPoolExecutor
	from mistralai import Mistral
	from sklearn.manifold import TSNE
	from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
	from dotenv import load_dotenv

	# Load environment variables (python-dotenv) before reading the API key
	load_dotenv()
	MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY_static")

	# 📌 Model selection and Mistral client
	model_embedding = "mistral-embed"
	model_chat = "ministral-8b-latest"
	temperature = 0.1  # Lowered temperature so answers favour the retrieved (RAG) context
	client = Mistral(api_key=MISTRAL_API_KEY)

	# 📌 Chunking parameters
	chunk_size = 256  # Characters per chunk; kept small for tighter control of the context
	# Intended overlap between consecutive chunks.
	# NOTE(review): confirm the chunking code actually applies this value.
	chunk_overlap = 15

	# 📌 On-disk locations of the cached artefacts
	embeddings_path = "embeddings.npy"
	metadata_path = "metadata.pkl"
	chunks_path = "chunked_docs.pkl"
	index_path = "faiss_index.bin"

	# 📌 Load the cached artefacts when all of them exist, otherwise rebuild them.
	# The four files are only meaningful together, so a single missing file
	# triggers a full rebuild (chunking, embedding, indexing) from the PDFs.
	if all(os.path.exists(path) for path in (index_path, chunks_path, metadata_path, embeddings_path)):
	    print("🔄 Chargement des données existantes...")
	    index = faiss.read_index(index_path)
	    # NOTE(security): pickle.load can execute arbitrary code. These files are
	    # caches written by this script; never point the paths at untrusted files.
	    with open(chunks_path, "rb") as f:
	        chunked_docs = pickle.load(f)
	    with open(metadata_path, "rb") as f:
	        metadata_list = pickle.load(f)
	    embeddings = np.load(embeddings_path)
	    print("✅ Index, chunks, embeddings et métadonnées chargés avec succès !")
	else:
	    print("⚡ Création et stockage d'un nouvel index FAISS...")

	    # 📌 Document extraction and chunking
	    def extract_and_chunk_pdfs(pdf_folder):
	        """Read every document under *pdf_folder* and split its text into chunks.

	        Documents are loaded with ``SimpleDirectoryReader`` (sub-folders
	        included). Each text is cut into windows of ``chunk_size`` characters
	        and consecutive windows share ``chunk_overlap`` characters.

	        Args:
	            pdf_folder: Directory containing the documents to index.

	        Returns:
	            ``(chunked_docs, metadata_list)``: two parallel lists. Each entry of
	            ``chunked_docs`` is ``{"text": ..., "metadata": ...}`` and the entry
	            at the same position in ``metadata_list`` is that metadata dict
	            (keys ``"source"`` and ``"title"``).
	        """
	        documents = SimpleDirectoryReader(pdf_folder, recursive=True).load_data()
	        # Stride between chunk starts. Advancing by chunk_size alone would
	        # ignore the configured overlap; max(1, ...) keeps the stride positive
	        # even if the overlap is set >= the chunk size.
	        step = max(1, chunk_size - chunk_overlap)
	        chunked_docs, metadata_list = [], []
	        for doc in documents:
	            doc_text = doc.text
	            file_name = doc.metadata.get("file_name", "Inconnu")
	            # Fall back to the file name (without extension) when no title is set
	            title = doc.metadata.get("title") or os.path.splitext(file_name)[0]
	            doc_metadata = {"source": file_name, "title": title}
	            for start in range(0, len(doc_text), step):
	                chunk = doc_text[start:start + chunk_size]
	                chunked_docs.append({"text": chunk, "metadata": doc_metadata})
	                metadata_list.append(doc_metadata)
	                # This window already reaches the end of the text; a further
	                # one would only repeat the overlapping tail.
	                if start + chunk_size >= len(doc_text):
	                    break
	        return chunked_docs, metadata_list

	    pdf_folder = 'C:/Users/MIPO10053340/OneDrive - Groupe Avril/Bureau/Salon_Agriculture_2024/Micka_API_Call/Docs_pdf/'
	    chunked_docs, metadata_list = extract_and_chunk_pdfs(pdf_folder)

	    # 📌 Parallel embedding generation
	    def get_embeddings_in_batches(text_chunks, batch_size=5):
	        """Embed the ``"text"`` of every chunk with the Mistral embedding model.

	        Chunks are grouped into batches of ``batch_size`` and each batch is one
	        API call, dispatched on a pool of 5 worker threads. Results are
	        collected in submission order.

	        Args:
	            text_chunks: Sequence of dicts holding the chunk text under ``"text"``.
	            batch_size: Number of chunks sent per embedding request.

	        Returns:
	            A float32 NumPy array built from the returned embeddings.
	            Assumes the API returns one embedding per input, in input
	            order — TODO confirm.
	        """
	        embeddings = []

	        def process_batch(batch):
	            response = client.embeddings.create(model=model_embedding, inputs=[chunk["text"] for chunk in batch])
	            return [data.embedding for data in response.data]

	        with ThreadPoolExecutor(max_workers=5) as executor:
	            future_batches = [
	                executor.submit(process_batch, text_chunks[i:i + batch_size])
	                for i in range(0, len(text_chunks), batch_size)
	            ]
	            # Iterate the futures in submission order (not completion order)
	            # so the embeddings stay in the same order as the chunks.
	            for future in future_batches:
	                embeddings.extend(future.result())

	        return np.array(embeddings).astype('float32')

	    embeddings = get_embeddings_in_batches(chunked_docs)

	    # 📌 Build the FAISS index (exact L2 search) and store it
	    dimension = embeddings.shape[1]
	    index = faiss.IndexFlatL2(dimension)
	    index.add(embeddings)
	    faiss.write_index(index, index_path)

	    # 📌 Persist chunks, metadata and raw embeddings next to the index
	    with open(chunks_path, "wb") as f:
	        pickle.dump(chunked_docs, f)
	    with open(metadata_path, "wb") as f:
	        pickle.dump(metadata_list, f)
	    np.save(embeddings_path, embeddings)
	    print("✅ Index, chunks, embeddings et métadonnées sauvegardés !")

	# 📌 Retrieve the most relevant chunks
	def retrieve_relevant_chunks(question, k=5):
	    """Return the indexed chunks closest to *question*.

	    The question is embedded with ``model_embedding`` and the FAISS index is
	    searched for its ``k`` nearest neighbours.

	    Args:
	        question: Text of the user query.
	        k: Maximum number of chunks to return.

	    Returns:
	        A list of entries from ``chunked_docs`` in the order returned by the
	        index search. The list is empty (never a tuple) when the search
	        yields no valid neighbour, so callers can always iterate the result.
	    """
	    question_embedding_response = client.embeddings.create(
	        model=model_embedding,
	        inputs=[question],
	    )
	    question_embedding = (
	        np.array(question_embedding_response.data[0].embedding)
	        .astype('float32')
	        .reshape(1, -1)
	    )
	    _, indices = index.search(question_embedding, k)
	    # FAISS always returns k labels and pads missing neighbours with -1.
	    # Unfiltered, -1 would silently select the last chunk via negative indexing.
	    valid_indices = [int(i) for i in indices[0] if i >= 0]
	    if not valid_indices:
	        print("⚠️ Avertissement : Aucun chunk pertinent trouvé, réponse possible moins précise.")
	        return []
	    return [chunked_docs[i] for i in valid_indices]

	# 📌 Answer generation with MistralAI
	def generate_response(context, question, sources):
	    """Ask the chat model to answer *question* from the retrieved chunks.

	    Args:
	        context: Accepted for interface compatibility; not read here.
	        question: User question, sent as the user message.
	        sources: Retrieved chunks; each provides ``["text"]`` and
	            ``["metadata"]["source"]``.

	    Returns:
	        ``(answer, chunk_texts)``: the model reply followed by a space and
	        all reference markers (``[1][2]...``), and the numbered chunk
	        listing that was placed in the system message.
	    """
	    chunk_references = [f"[{rank}]" for rank in range(1, len(sources) + 1)]

	    # One numbered entry per source, paired with its reference marker
	    numbered_entries = []
	    for ref, src in zip(chunk_references, sources):
	        numbered_entries.append(f"{ref} (Source: {src['metadata']['source']}) :\n{src['text']}")
	    chunk_texts = "\n\n".join(numbered_entries)

	    system_prompt = f"Voici les informations extraites des documents :\n{chunk_texts}\n\nUtilise ces informations pour répondre."
	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": question},
	    ]
	    response = client.chat.complete(model=model_chat, messages=messages, temperature=temperature)

	    reply = response.choices[0].message.content
	    all_markers = "".join(chunk_references)
	    return reply + " " + all_markers, chunk_texts

	# 📌 Run one user query through retrieval and generation
	user_question = "Quels sont les besoins en protéines des poulets de chair en phase de croissance ?"
	relevant_chunks = retrieve_relevant_chunks(user_question)
	context = "\n".join(chunk["text"] for chunk in relevant_chunks)
	answer, citations = generate_response(context, user_question, relevant_chunks)

	# 📊 Show the answer followed by the chunks it was built from
	print("\n🔹 Réponse Mistral :", answer, sep="\n")
	print("\n📌 Chunks utilisés :", citations, sep="\n")

	# 💾 Persist question, answer and cited chunks to a UTF-8 text file
	with open("mistral_response_types.txt", "w", encoding="utf-8") as f:
	    f.writelines([
	        f"Question : {user_question}\n",
	        f"Réponse :\n{answer}\n",
	        f"{citations}\n",
	    ])

	print("\n✅ Réponse enregistrée avec les chunks exacts et références dans 'mistral_response_types.txt'")