import json
import os
import tempfile
import unittest
from unittest.mock import mock_open, patch

import numpy as np
import pandas as pd

from src.mcp import utils


class _KeywordEncoder:
    """Stand-in embedding model that maps text onto one of two unit vectors.

    ``encode`` returns ``[1, 0, 0]`` when any configured keyword occurs in
    the text and ``[0, 1, 0]`` otherwise.
    """

    def __init__(self, *keywords):
        self._keywords = keywords

    def encode(self, text):
        matched = any(keyword in text for keyword in self._keywords)
        components = [1, 0, 0] if matched else [0, 1, 0]
        return np.array(components, dtype=np.float32)


class _ConstantEncoder:
    """Stand-in embedding model whose ``encode`` ignores its input."""

    def encode(self, text):
        return np.array([1, 2, 3], dtype=np.float32)


class TestUtils(unittest.TestCase):
    """
    Unit tests for the helpers in src/mcp/utils.py.

    File reads and writes are either replaced by mocks or pointed at
    temporary locations, so nothing under output/ is touched.
    """

    def test_detect_language(self):
        """Language detection on a French text, an Arabic text and ""."""
        french = utils.detect_language("Ceci est un texte en français.")
        self.assertEqual(french, 'fr')

        arabic = utils.detect_language("هذا نص باللغة العربية.")
        self.assertEqual(arabic, 'ar')

        # For empty input either of the two values below is accepted.
        self.assertIn(utils.detect_language(""), ['unknown', ''])

    def test_filter_by_language(self):
        """Filtering by language code returns the matching indices."""
        records = [{"Langue": code} for code in ("fr", "ar", "fr")]

        self.assertEqual(utils.filter_by_language(records, 'fr'), [0, 2])
        self.assertEqual(utils.filter_by_language(records, 'ar'), [1])

    def test_select_documents(self):
        """
        Semantic document selection with a stand-in model.

        The stand-in encodes the query onto the first document's vector,
        so the single top result is expected to be "Doc1".
        """
        records = [
            {"Nom du document": "Doc1", "Catégorie": "CatA", "Langue": "fr"},
            {"Nom du document": "Doc2", "Catégorie": "CatB", "Langue": "fr"},
        ]
        # One unit vector per document: [[1, 0, 0], [0, 1, 0]].
        vectors = np.eye(2, 3, dtype=np.float32)

        model_patch = patch(
            'src.mcp.utils.get_model',
            return_value=_KeywordEncoder("Doc1", "CatA"),
        )
        with model_patch:
            hits = utils.select_documents(
                "Doc1 CatA", vectors, records, lang=None, top_k=1
            )
        self.assertEqual(hits[0]["Nom du document"], "Doc1")

    @patch('src.mcp.utils.requests.post')
    def test_detect_intention(self, mock_post):
        """
        Intention detection against a mocked HTTP response.

        Each case sets the JSON payload returned by the mocked
        ``requests.post`` and checks the value ``detect_intention``
        returns; the last case expects surrounding quotes to be removed.
        """
        cases = (
            ("parlement", "Parlement marocain", "parlement"),
            ("lois/règlements", "Code pénal", "lois/règlements"),
            ("'parlement'", "Débat parlementaire", "parlement"),
        )
        for raw_response, query, expected in cases:
            mock_post.return_value.json.return_value = {
                "response": raw_response
            }
            self.assertEqual(utils.detect_intention(query), expected)

    def test_select_parlement_transcript(self):
        """
        Semantic search over parliamentary transcripts with mocked reads.

        ``numpy.load``, ``builtins.open`` and ``json.load`` are patched
        to serve the in-memory fixtures built here.
        """
        records = [
            {"id": 1, "titre": "Budget 2024", "date": "2024-01-01",
             "langue": "fr", "lien": "url1"},
            {"id": 2, "titre": "Santé publique", "date": "2024-01-02",
             "langue": "fr", "lien": "url2"},
        ]
        vectors = np.eye(2, 3, dtype=np.float32)

        model_patch = patch(
            'src.mcp.utils.get_model', return_value=_KeywordEncoder("Budget")
        )
        load_patch = patch('numpy.load', return_value=vectors)
        open_patch = patch(
            'builtins.open', mock_open(read_data=json.dumps(records))
        )
        json_patch = patch('json.load', return_value=records)

        # Entered left to right, matching the original nesting order.
        with model_patch, load_patch, open_patch, json_patch:
            hits = utils.select_parlement_transcript("Budget", top_k=1)
        self.assertEqual(hits[0]["titre"], "Budget 2024")

    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_documents(self, mock_read_csv, mock_openfile,
                                           mock_makedirs, mock_json_dump,
                                           mock_np_save):
        """
        Document preprocessing with every file operation mocked.

        Checks that ``numpy.save`` and ``json.dump`` are both invoked
        when the CSV reader yields a single-row frame.
        """
        mock_read_csv.return_value = pd.DataFrame({
            'Nom du document': ['Doc1'],
            'Catégorie': ['CatA'],
            'Lien': ['url1'],
            'Langue': ['fr'],
            'Id': [1],
        })

        model_patch = patch(
            'src.mcp.utils.get_model', return_value=_ConstantEncoder()
        )
        with model_patch, tempfile.TemporaryDirectory() as workdir:
            utils.preprocess_and_save_documents(
                csv_path='dummy.csv',
                embeddings_path=os.path.join(workdir, 'embeddings.npy'),
                metadata_path=os.path.join(workdir, 'metadatas.json'),
            )
            mock_np_save.assert_called()
            mock_json_dump.assert_called()

    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_parlement(self, mock_read_csv, mock_openfile,
                                           mock_makedirs, mock_json_dump,
                                           mock_np_save):
        """
        Parliament transcript preprocessing with file operations mocked.

        Checks that ``numpy.save`` and ``json.dump`` are both invoked
        when the CSV reader yields a single-row frame.
        """
        mock_read_csv.return_value = pd.DataFrame({
            'id': [1],
            'titre': ['Titre1'],
            'date': ['2024-01-01'],
            'langue': ['fr'],
            'lien': ['url1'],
        })

        model_patch = patch(
            'src.mcp.utils.get_model', return_value=_ConstantEncoder()
        )
        with model_patch, tempfile.TemporaryDirectory() as workdir:
            utils.preprocess_and_save_parlement(
                csv_path='dummy.csv',
                embeddings_path=os.path.join(
                    workdir, 'parlement_embeddings.npy'
                ),
                metadata_path=os.path.join(
                    workdir, 'parlement_metadatas.json'
                ),
            )
            mock_np_save.assert_called()
            mock_json_dump.assert_called()


if __name__ == "__main__":
    unittest.main()