# Spaces: HOLOKIATEAM / RAG_APP (Sleeping) - File size: 6,996 Bytes
import unittest
import numpy as np
from src.mcp import utils
from unittest.mock import patch, mock_open
import tempfile
import os
import json
import pandas as pd

class TestUtils(unittest.TestCase):
    """
    Exercise the helper functions exposed by src/mcp/utils.py.
    The embedding model and the HTTP call are replaced by stand-ins, and every
    file read/write is either patched out or pointed at a temporary directory,
    so nothing under output/ is touched.
    """

    class _KeywordModel:
        """Stand-in encoder: first unit vector on a keyword hit, second one otherwise."""

        def __init__(self, *keywords):
            self._keywords = keywords

        def encode(self, text):
            matched = any(word in text for word in self._keywords)
            return np.array([1, 0, 0] if matched else [0, 1, 0], dtype=np.float32)

    class _ConstantModel:
        """Stand-in encoder that returns the same fixed vector for any input."""

        def encode(self, text):
            return np.array([1, 2, 3], dtype=np.float32)

    def _run_preprocess(self, preprocess, embeddings_name, metadata_name):
        """
        Call a preprocess_and_save_* function with the constant stub model,
        handing it output paths located inside a throwaway directory.
        """
        with patch('src.mcp.utils.get_model', return_value=self._ConstantModel()):
            with tempfile.TemporaryDirectory() as workdir:
                preprocess(
                    csv_path='dummy.csv',
                    embeddings_path=os.path.join(workdir, embeddings_name),
                    metadata_path=os.path.join(workdir, metadata_name),
                )

    def test_detect_language(self):
        """Check the detected code for French text, Arabic text, and an empty string."""
        samples = (
            ("Ceci est un texte en français.", 'fr'),
            ("هذا نص باللغة العربية.", 'ar'),
        )
        for sentence, expected_code in samples:
            self.assertEqual(utils.detect_language(sentence), expected_code)
        # For empty input either placeholder value is accepted.
        self.assertIn(utils.detect_language(""), ['unknown', ''])

    def test_filter_by_language(self):
        """Check that filtering by language code returns the matching positions."""
        metadatas = [{"Langue": code} for code in ("fr", "ar", "fr")]
        for code, expected_indices in (('fr', [0, 2]), ('ar', [1])):
            self.assertEqual(utils.filter_by_language(metadatas, code), expected_indices)

    def test_select_documents(self):
        """
        Check semantic document selection using a stub model and tiny embeddings.
        The stub encodes the query so that it lines up with the first document.
        """
        metadatas = [
            {"Nom du document": name, "Catégorie": category, "Langue": "fr"}
            for name, category in (("Doc1", "CatA"), ("Doc2", "CatB"))
        ]
        embeddings = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)
        stub = self._KeywordModel("Doc1", "CatA")
        with patch('src.mcp.utils.get_model', return_value=stub):
            results = utils.select_documents("Doc1 CatA", embeddings, metadatas, lang=None, top_k=1)
            self.assertEqual(results[0]["Nom du document"], "Doc1")

    @patch('src.mcp.utils.requests.post')
    def test_detect_intention(self, mock_post):
        """
        Check intention detection against mocked API replies.
        Each reply must come back cleaned and mapped to the expected label.
        """
        scenarios = (
            # (raw API reply, user query, expected label)
            ("parlement", "Parlement marocain", "parlement"),
            ("lois/règlements", "Code pénal", "lois/règlements"),
            ("'parlement'", "Débat parlementaire", "parlement"),
        )
        reply_json = mock_post.return_value.json
        for raw_reply, query, expected_label in scenarios:
            reply_json.return_value = {"response": raw_reply}
            self.assertEqual(utils.detect_intention(query), expected_label)

    def test_select_parlement_transcript(self):
        """
        Check semantic search over parliamentary transcripts with stub data and model.
        Every file read is patched out.
        """
        metadatas = [
            {
                "id": 1,
                "titre": "Budget 2024",
                "date": "2024-01-01",
                "langue": "fr",
                "lien": "url1",
            },
            {
                "id": 2,
                "titre": "Santé publique",
                "date": "2024-01-02",
                "langue": "fr",
                "lien": "url2",
            },
        ]
        embeddings = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)
        # One multi-manager `with`: the patches are entered in the listed order.
        with patch('src.mcp.utils.get_model', return_value=self._KeywordModel("Budget")), \
                patch('numpy.load', return_value=embeddings), \
                patch('builtins.open', mock_open(read_data=json.dumps(metadatas))), \
                patch('json.load', return_value=metadatas):
            results = utils.select_parlement_transcript("Budget", top_k=1)
            self.assertEqual(results[0]["titre"], "Budget 2024")

    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_documents(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
        """
        Check document preprocessing and embedding saving with file operations patched.
        Verifies that both the embeddings save and the metadata dump were invoked.
        """
        mock_read_csv.return_value = pd.DataFrame({
            'Nom du document': ['Doc1'],
            'Catégorie': ['CatA'],
            'Lien': ['url1'],
            'Langue': ['fr'],
            'Id': [1]
        })
        self._run_preprocess(
            utils.preprocess_and_save_documents,
            'embeddings.npy',
            'metadatas.json',
        )
        mock_np_save.assert_called()
        mock_json_dump.assert_called()

    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_parlement(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
        """
        Check parliament transcript preprocessing and embedding saving with file operations patched.
        Verifies that both the embeddings save and the metadata dump were invoked.
        """
        mock_read_csv.return_value = pd.DataFrame({
            'id': [1],
            'titre': ['Titre1'],
            'date': ['2024-01-01'],
            'langue': ['fr'],
            'lien': ['url1']
        })
        self._run_preprocess(
            utils.preprocess_and_save_parlement,
            'parlement_embeddings.npy',
            'parlement_metadatas.json',
        )
        mock_np_save.assert_called()
        mock_json_dump.assert_called()

# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()