# RAG_APP/src/tests/test_utils_mcp.py
# Hosting-page header captured with the file: "sxid003's picture", "Upload 83 files", "3107242 verified".
import unittest
import numpy as np
from src.mcp import utils
from unittest.mock import patch, mock_open
import tempfile
import os
import json
import pandas as pd
class TestUtils(unittest.TestCase):
    """
    Test suite for the helper functions exposed by src/mcp/utils.py.

    The embedding model is always replaced by a small stub returned from a
    patched ``get_model``. Tests that exercise loading or saving also patch
    the file/CSV/NumPy/JSON entry points they rely on, so those code paths
    run against mocks rather than real files.
    """

    def test_detect_language(self):
        """French and Arabic samples map to their codes; an empty string is tolerated."""
        french_code = utils.detect_language("Ceci est un texte en français.")
        self.assertEqual(french_code, 'fr')

        arabic_code = utils.detect_language("هذا نص باللغة العربية.")
        self.assertEqual(arabic_code, 'ar')

        # For empty input either sentinel value is accepted.
        empty_code = utils.detect_language("")
        self.assertIn(empty_code, ['unknown', ''])

    def test_filter_by_language(self):
        """filter_by_language returns the positions of entries whose "Langue" matches."""
        metadatas = [
            {"Langue": "fr"},
            {"Langue": "ar"},
            {"Langue": "fr"},
        ]
        expectations = (
            ('fr', [0, 2]),
            ('ar', [1]),
        )
        for code, expected_indices in expectations:
            self.assertEqual(utils.filter_by_language(metadatas, code), expected_indices)

    def test_select_documents(self):
        """
        select_documents returns the entry whose embedding matches the query.

        The stub encoder maps any text mentioning "Doc1" or "CatA" onto the
        first embedding row and everything else onto the second row.
        """
        records = [
            {"Nom du document": "Doc1", "Catégorie": "CatA", "Langue": "fr"},
            {"Nom du document": "Doc2", "Catégorie": "CatB", "Langue": "fr"},
        ]
        vectors = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)

        class StubModel:
            """Stand-in for the embedding model; only ``encode`` is provided."""

            def encode(self, text):
                matches_first = "Doc1" in text or "CatA" in text
                direction = [1, 0, 0] if matches_first else [0, 1, 0]
                return np.array(direction, dtype=np.float32)

        with patch('src.mcp.utils.get_model', return_value=StubModel()):
            results = utils.select_documents("Doc1 CatA", vectors, records, lang=None, top_k=1)

        self.assertEqual(results[0]["Nom du document"], "Doc1")

    @patch('src.mcp.utils.requests.post')
    def test_detect_intention(self, mock_post):
        """
        detect_intention returns the label carried in the mocked HTTP reply.

        ``requests.post`` as seen from ``utils`` is patched, so no request is
        sent. The last case checks that a label wrapped in single quotes is
        returned without them.
        """
        # (mocked "response" payload, query passed to detect_intention, expected label)
        scenarios = (
            ("parlement", "Parlement marocain", "parlement"),
            ("lois/règlements", "Code pénal", "lois/règlements"),
            ("'parlement'", "Débat parlementaire", "parlement"),
        )
        for reply, query, expected_label in scenarios:
            mock_post.return_value.json.return_value = {"response": reply}
            self.assertEqual(utils.detect_intention(query), expected_label)

    def test_select_parlement_transcript(self):
        """
        select_parlement_transcript picks the transcript matching the query.

        ``numpy.load``, ``builtins.open`` and ``json.load`` are all patched so
        the embeddings and metadata come from the in-memory fixtures below.
        """
        records = [
            {"id": 1, "titre": "Budget 2024", "date": "2024-01-01", "langue": "fr", "lien": "url1"},
            {"id": 2, "titre": "Santé publique", "date": "2024-01-02", "langue": "fr", "lien": "url2"},
        ]
        vectors = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)

        class StubModel:
            """Stand-in encoder: text containing "Budget" maps to the first row."""

            def encode(self, text):
                direction = [1, 0, 0] if "Budget" in text else [0, 1, 0]
                return np.array(direction, dtype=np.float32)

        # Patches are entered left to right, matching the original nesting order.
        with patch('src.mcp.utils.get_model', return_value=StubModel()), \
                patch('numpy.load', return_value=vectors), \
                patch('builtins.open', mock_open(read_data=json.dumps(records))), \
                patch('json.load', return_value=records):
            results = utils.select_parlement_transcript("Budget", top_k=1)

        self.assertEqual(results[0]["titre"], "Budget 2024")

    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_documents(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
        """
        preprocess_and_save_documents saves embeddings and metadata via mocks.

        The CSV read, directory creation, file opening, JSON dump and NumPy
        save are patched; the test checks that the NumPy save and the JSON
        dump were each invoked. Output paths point into a temporary directory.
        """
        # Decorators apply bottom-up, so the mock arguments arrive in reverse
        # order of the @patch lines above.
        mock_read_csv.return_value = pd.DataFrame({
            'Nom du document': ['Doc1'],
            'Catégorie': ['CatA'],
            'Lien': ['url1'],
            'Langue': ['fr'],
            'Id': [1]
        })

        class StubModel:
            """Stand-in encoder returning the same vector for any text."""

            def encode(self, text):
                return np.array([1, 2, 3], dtype=np.float32)

        with patch('src.mcp.utils.get_model', return_value=StubModel()), \
                tempfile.TemporaryDirectory() as workdir:
            utils.preprocess_and_save_documents(
                csv_path='dummy.csv',
                embeddings_path=os.path.join(workdir, 'embeddings.npy'),
                metadata_path=os.path.join(workdir, 'metadatas.json')
            )

        mock_np_save.assert_called()
        mock_json_dump.assert_called()

    @patch('numpy.save')
    @patch('json.dump')
    @patch('os.makedirs')
    @patch('builtins.open', new_callable=mock_open)
    @patch('pandas.read_csv')
    def test_preprocess_and_save_parlement(self, mock_read_csv, mock_openfile, mock_makedirs, mock_json_dump, mock_np_save):
        """
        preprocess_and_save_parlement saves embeddings and metadata via mocks.

        Mirrors the documents variant with a transcript-shaped CSV fixture:
        the same entry points are patched, and the test checks that the NumPy
        save and the JSON dump were each invoked.
        """
        # Decorators apply bottom-up, so the mock arguments arrive in reverse
        # order of the @patch lines above.
        mock_read_csv.return_value = pd.DataFrame({
            'id': [1],
            'titre': ['Titre1'],
            'date': ['2024-01-01'],
            'langue': ['fr'],
            'lien': ['url1']
        })

        class StubModel:
            """Stand-in encoder returning the same vector for any text."""

            def encode(self, text):
                return np.array([1, 2, 3], dtype=np.float32)

        with patch('src.mcp.utils.get_model', return_value=StubModel()), \
                tempfile.TemporaryDirectory() as workdir:
            utils.preprocess_and_save_parlement(
                csv_path='dummy.csv',
                embeddings_path=os.path.join(workdir, 'parlement_embeddings.npy'),
                metadata_path=os.path.join(workdir, 'parlement_metadatas.json')
            )

        mock_np_save.assert_called()
        mock_json_dump.assert_called()
# Run the suite through unittest's command-line runner when this file is
# executed directly; importing the module does not trigger a run.
if __name__ == "__main__":
    unittest.main()