# Unit tests for src/mcp/utils.py.
import json
import os
import tempfile
import unittest
from unittest.mock import mock_open, patch

import numpy as np
import pandas as pd

from src.mcp import utils
class TestUtils(unittest.TestCase):
    """
    Unit tests for src/mcp/utils.py functions.

    All file-writing and file-reading operations are mocked or redirected to
    temporary locations. No real files in output/ are touched.
    """

    def test_detect_language(self):
        """Test language detection for French, Arabic, and empty string."""
        self.assertEqual(utils.detect_language("Ceci est un texte en français."), 'fr')
        self.assertEqual(utils.detect_language("هذا نص باللغة العربية."), 'ar')
        # Either marker is accepted for input that carries no language signal.
        self.assertIn(utils.detect_language(""), ['unknown', ''])

    def test_filter_by_language(self):
        """Test filtering metadata by language code."""
        metadatas = [
            {"Langue": "fr"},
            {"Langue": "ar"},
            {"Langue": "fr"},
        ]
        indices = utils.filter_by_language(metadatas, 'fr')
        self.assertEqual(indices, [0, 2])
        indices = utils.filter_by_language(metadatas, 'ar')
        self.assertEqual(indices, [1])

    def test_select_documents(self):
        """
        Test semantic document selection with a dummy model and embeddings.

        The dummy model encodes text to match the first or second document.
        """
        metadatas = [
            {"Nom du document": "Doc1", "Catégorie": "CatA", "Langue": "fr"},
            {"Nom du document": "Doc2", "Catégorie": "CatB", "Langue": "fr"},
        ]
        embeddings = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)

        class DummyModel:
            """Stand-in encoder: queries naming Doc1/CatA align with row 0."""

            def encode(self, text):
                if "Doc1" in text or "CatA" in text:
                    return np.array([1, 0, 0], dtype=np.float32)
                else:
                    return np.array([0, 1, 0], dtype=np.float32)

        with patch('src.mcp.utils.get_model', return_value=DummyModel()):
            results = utils.select_documents(
                "Doc1 CatA", embeddings, metadatas, lang=None, top_k=1
            )
        self.assertEqual(results[0]["Nom du document"], "Doc1")

    # NOTE(review): the target assumes utils.detect_intention reaches the API
    # through the `requests.post` module attribute -- confirm against
    # src/mcp/utils.py and adjust the target if it imports `post` directly.
    @patch('requests.post')
    def test_detect_intention(self, mock_post):
        """
        Test intention detection with mocked Llama3 API responses.

        Ensures output is cleaned and mapped to expected values.
        """
        mock_post.return_value.json.return_value = {"response": "parlement"}
        self.assertEqual(utils.detect_intention("Parlement marocain"), "parlement")
        mock_post.return_value.json.return_value = {"response": "lois/règlements"}
        self.assertEqual(utils.detect_intention("Code pénal"), "lois/règlements")
        # A quoted answer must come back with the surrounding quotes removed.
        mock_post.return_value.json.return_value = {"response": "'parlement'"}
        self.assertEqual(utils.detect_intention("Débat parlementaire"), "parlement")

    def test_select_parlement_transcript(self):
        """
        Test semantic search for parliamentary transcripts with dummy data and model.

        All file reads are mocked.
        """
        metadatas = [
            {"id": 1, "titre": "Budget 2024", "date": "2024-01-01", "langue": "fr", "lien": "url1"},
            {"id": 2, "titre": "Santé publique", "date": "2024-01-02", "langue": "fr", "lien": "url2"},
        ]
        embeddings = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)

        class DummyModel:
            """Stand-in encoder: queries mentioning "Budget" align with row 0."""

            def encode(self, text):
                if "Budget" in text:
                    return np.array([1, 0, 0], dtype=np.float32)
                else:
                    return np.array([0, 1, 0], dtype=np.float32)

        # The patches are entered left to right, matching the original nesting.
        with patch('src.mcp.utils.get_model', return_value=DummyModel()), \
                patch('numpy.load', return_value=embeddings), \
                patch('builtins.open', mock_open(read_data=json.dumps(metadatas))), \
                patch('json.load', return_value=metadatas):
            results = utils.select_parlement_transcript("Budget", top_k=1)
        self.assertEqual(results[0]["titre"], "Budget 2024")

    # Stacked patches are applied bottom-up: the lowest decorator supplies the
    # first mock argument after `self`.
    # NOTE(review): targets are inferred from the parameter names and from the
    # module-level patching style used elsewhere in this class -- confirm.
    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_documents(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
        """
        Test document preprocessing and embedding saving with all file operations mocked.

        Ensures no files are written to output/ and all steps are called.
        """
        df = pd.DataFrame({
            'Nom du document': ['Doc1'],
            'Catégorie': ['CatA'],
            'Lien': ['url1'],
            'Langue': ['fr'],
            'Id': [1],
        })
        mock_read_csv.return_value = df

        class DummyModel:
            """Stand-in encoder returning one fixed vector for any text."""

            def encode(self, text):
                return np.array([1, 2, 3], dtype=np.float32)

        with patch('src.mcp.utils.get_model', return_value=DummyModel()), \
                tempfile.TemporaryDirectory() as tmpdir:
            embeddings_path = os.path.join(tmpdir, 'embeddings.npy')
            metadata_path = os.path.join(tmpdir, 'metadatas.json')
            utils.preprocess_and_save_documents(
                csv_path='dummy.csv',
                embeddings_path=embeddings_path,
                metadata_path=metadata_path,
            )
        mock_np_save.assert_called()
        mock_json_dump.assert_called()

    # Same bottom-up decorator-to-argument mapping as the documents test.
    # NOTE(review): patch targets are inferred from the parameter names -- confirm.
    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_parlement(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
        """
        Test parliament transcript preprocessing and embedding saving with all file operations mocked.

        Ensures no files are written to output/ and all steps are called.
        """
        df = pd.DataFrame({
            'id': [1],
            'titre': ['Titre1'],
            'date': ['2024-01-01'],
            'langue': ['fr'],
            'lien': ['url1'],
        })
        mock_read_csv.return_value = df

        class DummyModel:
            """Stand-in encoder returning one fixed vector for any text."""

            def encode(self, text):
                return np.array([1, 2, 3], dtype=np.float32)

        with patch('src.mcp.utils.get_model', return_value=DummyModel()), \
                tempfile.TemporaryDirectory() as tmpdir:
            embeddings_path = os.path.join(tmpdir, 'parlement_embeddings.npy')
            metadata_path = os.path.join(tmpdir, 'parlement_metadatas.json')
            utils.preprocess_and_save_parlement(
                csv_path='dummy.csv',
                embeddings_path=embeddings_path,
                metadata_path=metadata_path,
            )
        mock_np_save.assert_called()
        mock_json_dump.assert_called()
# Run the suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()