Spaces:

HOLOKIATEAM
/

RAG_APP

Sleeping

App Files Community

RAG_APP / src / tests / test_utils_mcp.py

sxid003

Upload 83 files

3107242 verified 6 months ago

raw

history blame contribute delete

7 kB

	import unittest
	import numpy as np
	from src.mcp import utils
	from unittest.mock import patch, mock_open
	import tempfile
	import os
	import json
	import pandas as pd

	class TestUtils(unittest.TestCase):
	    """
	    Unit tests for the functions in src/mcp/utils.py.

	    File reads and writes are either mocked or redirected to a temporary
	    directory, so the tests are not meant to touch real files under output/.
	    The embedding model is always replaced by a small in-test dummy whose
	    ``encode`` returns fixed vectors.
	    """

	    def test_detect_language(self):
	        """Test language detection for French, Arabic, and empty string."""
	        self.assertEqual(utils.detect_language("Ceci est un texte en français."), 'fr')
	        self.assertEqual(utils.detect_language("هذا نص باللغة العربية."), 'ar')
	        # An empty input may be reported either as 'unknown' or as ''.
	        self.assertIn(utils.detect_language(""), ['unknown', ''])

	    def test_filter_by_language(self):
	        """Test filtering metadata by language code (expects matching indices)."""
	        metadatas = [
	            {"Langue": "fr"},
	            {"Langue": "ar"},
	            {"Langue": "fr"},
	        ]
	        indices = utils.filter_by_language(metadatas, 'fr')
	        self.assertEqual(indices, [0, 2])
	        indices = utils.filter_by_language(metadatas, 'ar')
	        self.assertEqual(indices, [1])

	    def test_select_documents(self):
	        """
	        Test semantic document selection with a dummy model and embeddings.

	        The dummy model encodes the query so that it lines up with the first
	        document's embedding when the text mentions "Doc1" or "CatA", and
	        with the second document's embedding otherwise.
	        """
	        metadatas = [
	            {"Nom du document": "Doc1", "Catégorie": "CatA", "Langue": "fr"},
	            {"Nom du document": "Doc2", "Catégorie": "CatB", "Langue": "fr"},
	        ]
	        # One unit vector per document, in the same order as ``metadatas``.
	        embeddings = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)

	        class DummyModel:
	            """Stand-in encoder returning one of two fixed unit vectors."""

	            def encode(self, text):
	                if "Doc1" in text or "CatA" in text:
	                    return np.array([1, 0, 0], dtype=np.float32)
	                else:
	                    return np.array([0, 1, 0], dtype=np.float32)

	        with patch('src.mcp.utils.get_model', return_value=DummyModel()):
	            results = utils.select_documents("Doc1 CatA", embeddings, metadatas, lang=None, top_k=1)
	            self.assertEqual(results[0]["Nom du document"], "Doc1")

	    @patch('src.mcp.utils.requests.post')
	    def test_detect_intention(self, mock_post):
	        """
	        Test intention detection with mocked Llama3 API responses.

	        Ensures output is cleaned and mapped to expected values; no HTTP
	        request is sent because ``requests.post`` is patched.
	        """
	        mock_post.return_value.json.return_value = {"response": "parlement"}
	        self.assertEqual(utils.detect_intention("Parlement marocain"), "parlement")
	        mock_post.return_value.json.return_value = {"response": "lois/règlements"}
	        self.assertEqual(utils.detect_intention("Code pénal"), "lois/règlements")
	        # A response wrapped in single quotes must come back without them.
	        mock_post.return_value.json.return_value = {"response": "'parlement'"}
	        self.assertEqual(utils.detect_intention("Débat parlementaire"), "parlement")

	    def test_select_parlement_transcript(self):
	        """
	        Test semantic search for parliamentary transcripts with dummy data and model.

	        All file reads are mocked: ``numpy.load``, ``builtins.open`` and
	        ``json.load`` are patched to hand back the in-test fixtures.
	        """
	        metadatas = [
	            {"id": 1, "titre": "Budget 2024", "date": "2024-01-01", "langue": "fr", "lien": "url1"},
	            {"id": 2, "titre": "Santé publique", "date": "2024-01-02", "langue": "fr", "lien": "url2"},
	        ]
	        # One unit vector per transcript, in the same order as ``metadatas``.
	        embeddings = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)

	        class DummyModel:
	            """Stand-in encoder: "Budget" maps to the first transcript's vector."""

	            def encode(self, text):
	                if "Budget" in text:
	                    return np.array([1, 0, 0], dtype=np.float32)
	                else:
	                    return np.array([0, 1, 0], dtype=np.float32)

	        # A single ``with`` enters the patches in the listed order and
	        # replaces the former four-level nesting.
	        with patch('src.mcp.utils.get_model', return_value=DummyModel()), \
	                patch('numpy.load', return_value=embeddings), \
	                patch('builtins.open', mock_open(read_data=json.dumps(metadatas))), \
	                patch('json.load', return_value=metadatas):
	            results = utils.select_parlement_transcript("Budget", top_k=1)
	            self.assertEqual(results[0]["titre"], "Budget 2024")

	    # Stacked @patch decorators are applied bottom-up, so the mock for the
	    # lowest decorator (pandas.read_csv) is the first extra parameter.
	    @patch('numpy.save')
	    @patch('json.dump')
	    @patch('os.makedirs')
	    @patch('builtins.open', new_callable=mock_open)
	    @patch('pandas.read_csv')
	    def test_preprocess_and_save_documents(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
	        """
	        Test document preprocessing and embedding saving with all file operations mocked.

	        Output paths point into a temporary directory and the save functions
	        are patched; the test only checks that ``numpy.save`` and
	        ``json.dump`` were called.
	        """
	        # Single-row frame handed back by the patched ``pandas.read_csv``.
	        df = pd.DataFrame({
	            'Nom du document': ['Doc1'],
	            'Catégorie': ['CatA'],
	            'Lien': ['url1'],
	            'Langue': ['fr'],
	            'Id': [1]
	        })
	        mock_read_csv.return_value = df

	        class DummyModel:
	            """Stand-in encoder returning the same vector for any text."""

	            def encode(self, text):
	                return np.array([1, 2, 3], dtype=np.float32)

	        with patch('src.mcp.utils.get_model', return_value=DummyModel()), \
	                tempfile.TemporaryDirectory() as tmpdir:
	            embeddings_path = os.path.join(tmpdir, 'embeddings.npy')
	            metadata_path = os.path.join(tmpdir, 'metadatas.json')
	            utils.preprocess_and_save_documents(
	                csv_path='dummy.csv',
	                embeddings_path=embeddings_path,
	                metadata_path=metadata_path
	            )
	            mock_np_save.assert_called()
	            mock_json_dump.assert_called()

	    # Same decorator stack (and therefore same parameter order) as the
	    # documents variant above the method signature: read_csv mock first.
	    @patch('numpy.save')
	    @patch('json.dump')
	    @patch('os.makedirs')
	    @patch('builtins.open', new_callable=mock_open)
	    @patch('pandas.read_csv')
	    def test_preprocess_and_save_parlement(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
	        """
	        Test parliament transcript preprocessing and embedding saving with all file operations mocked.

	        Output paths point into a temporary directory and the save functions
	        are patched; the test only checks that ``numpy.save`` and
	        ``json.dump`` were called.
	        """
	        # Single-row frame handed back by the patched ``pandas.read_csv``.
	        df = pd.DataFrame({
	            'id': [1],
	            'titre': ['Titre1'],
	            'date': ['2024-01-01'],
	            'langue': ['fr'],
	            'lien': ['url1']
	        })
	        mock_read_csv.return_value = df

	        class DummyModel:
	            """Stand-in encoder returning the same vector for any text."""

	            def encode(self, text):
	                return np.array([1, 2, 3], dtype=np.float32)

	        with patch('src.mcp.utils.get_model', return_value=DummyModel()), \
	                tempfile.TemporaryDirectory() as tmpdir:
	            embeddings_path = os.path.join(tmpdir, 'parlement_embeddings.npy')
	            metadata_path = os.path.join(tmpdir, 'parlement_metadatas.json')
	            utils.preprocess_and_save_parlement(
	                csv_path='dummy.csv',
	                embeddings_path=embeddings_path,
	                metadata_path=metadata_path
	            )
	            mock_np_save.assert_called()
	            mock_json_dump.assert_called()

	if __name__ == "__main__":
	    # Run the test suite when this module is executed directly as a script.
	    unittest.main()