| | |
| | |
| |
|
| | import os |
| | import shutil |
| | import tempfile |
| | import unittest |
| | from typing import Optional |
| |
|
| |
|
| | class TestFileChunker(unittest.TestCase): |
| | _tmpdir: Optional[str] = None |
| | _tmpfile: Optional[str] = None |
| | _line_content = "Hello, World\n" |
| | _num_bytes = None |
| | _num_lines = 200 |
| | _num_splits = 20 |
| |
|
| | @classmethod |
| | def setUpClass(cls) -> None: |
| | cls._num_bytes = len(cls._line_content.encode("utf-8")) |
| | cls._tmpdir = tempfile.mkdtemp() |
| | with open(os.path.join(cls._tmpdir, "test.txt"), "w") as f: |
| | cls._tmpfile = f.name |
| | for _i in range(cls._num_lines): |
| | f.write(cls._line_content) |
| | f.flush() |
| |
|
| | @classmethod |
| | def tearDownClass(cls) -> None: |
| | |
| | if cls._tmpdir is not None: |
| | shutil.rmtree(cls._tmpdir) |
| |
|
| | def test_find_offsets(self): |
| | from fairseq.file_chunker_utils import find_offsets |
| |
|
| | offsets = find_offsets(self._tmpfile, self._num_splits) |
| | self.assertEqual(len(offsets), self._num_splits + 1) |
| | (zero, *real_offsets, last) = offsets |
| | self.assertEqual(zero, 0) |
| | for i, o in enumerate(real_offsets): |
| | self.assertEqual( |
| | o, |
| | self._num_bytes |
| | + ((i + 1) * self._num_bytes * self._num_lines / self._num_splits), |
| | ) |
| | self.assertEqual(last, self._num_bytes * self._num_lines) |
| |
|
| | def test_readchunks(self): |
| | from fairseq.file_chunker_utils import Chunker, find_offsets |
| |
|
| | offsets = find_offsets(self._tmpfile, self._num_splits) |
| | for start, end in zip(offsets, offsets[1:]): |
| | with Chunker(self._tmpfile, start, end) as lines: |
| | all_lines = list(lines) |
| | num_lines = self._num_lines / self._num_splits |
| | self.assertAlmostEqual( |
| | len(all_lines), num_lines, delta=1 |
| | ) |
| | self.assertListEqual( |
| | all_lines, [self._line_content for _ in range(len(all_lines))] |
| | ) |
| |
|